138 files changed, 2090 insertions, 1351 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 1500743763b..8854170c642 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -226,6 +226,9 @@ add_definitions(
   -DCCL_NAMESPACE_END=}
 )
 
+if(WITH_CYCLES_DEBUG)
+  add_definitions(-DWITH_CYCLES_DEBUG)
+endif()
 if(WITH_CYCLES_STANDALONE_GUI)
   add_definitions(-DWITH_CYCLES_STANDALONE_GUI)
 endif()
@@ -334,7 +337,7 @@ else()
 endif()
 
 # Warnings
-if(CMAKE_COMPILER_IS_GNUCXX)
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_C_COMPILER_ID MATCHES "Clang")
   ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion")
   ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_double_promotion "-Werror=double-promotion")
   ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_no_error_unused_macros "-Wno-error=unused-macros")
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index 149967ad331..f0540486656 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -138,11 +138,6 @@ endif()
 
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
-# avoid link failure with clang 3.4 debug
-if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS '3.4')
-  string(APPEND CMAKE_CXX_FLAGS_DEBUG " -gline-tables-only")
-endif()
-
 add_dependencies(bf_intern_cycles bf_rna)
 
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${ADDON_FILES}" ${CYCLES_INSTALL_PATH})
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 19580914f19..a478dc22cd0 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -218,6 +218,12 @@ enum_denoising_prefilter = (
     ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy using extra processing time", 3),
 )
 
+enum_direct_light_sampling_type = (
+    ('MULTIPLE_IMPORTANCE_SAMPLING', "Multiple Importance Sampling", "Multiple importance sampling is used to combine direct light contributions from next-event estimation and forward path tracing", 0),
+    ('FORWARD_PATH_TRACING', "Forward Path Tracing", "Direct light contributions are only sampled using forward path tracing", 1),
+    ('NEXT_EVENT_ESTIMATION', "Next-Event Estimation", "Direct light contributions are only sampled using next-event estimation", 2),
+)
+
 def update_render_passes(self, context):
     scene = context.scene
     view_layer = context.view_layer
@@ -325,6 +331,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=1024,
     )
 
+    sample_offset: IntProperty(
+        name="Sample Offset",
+        description="Number of samples to skip when starting render",
+        min=0, max=(1 << 24),
+        default=0,
+    )
+
     time_limit: FloatProperty(
         name="Time Limit",
         description="Limit the render time (excluding synchronization time)."
@@ -415,6 +428,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         default=0,
     )
 
+    direct_light_sampling_type: EnumProperty(
+        name="Direct Light Sampling Type",
+        description="The type of strategy used for sampling direct light contributions",
+        items=enum_direct_light_sampling_type,
+        default='MULTIPLE_IMPORTANCE_SAMPLING',
+    )
+
     min_light_bounces: IntProperty(
         name="Min Light Bounces",
         description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 397823103c8..57741447608 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -290,6 +290,9 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
         col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
         col.prop(cscene, "sampling_pattern", text="Pattern")
 
+        col = layout.column(align=True)
+        col.prop(cscene, "sample_offset")
+
         layout.separator()
 
         heading = layout.column(align=True, heading="Scrambling Distance")
diff --git a/intern/cycles/blender/curves.cpp b/intern/cycles/blender/curves.cpp
index ffe0c553738..c96d01a8ffb 100644
--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@@ -199,7 +199,7 @@ static bool ObtainCacheParticleUV(Hair *hair,
           b_mesh->uv_layers.begin(l);
 
           float2 uv = zero_float2();
-          if (b_mesh->uv_layers.length())
+          if (!b_mesh->uv_layers.empty())
             b_psys.uv_on_emitter(psmd, *b_pa, pa_no, uv_num, &uv.x);
           CData->curve_uv.push_back_slow(uv);
 
@@ -261,7 +261,7 @@ static bool ObtainCacheParticleVcol(Hair *hair,
           b_mesh->vertex_colors.begin(l);
 
           float4 vcol = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-          if (b_mesh->vertex_colors.length())
+          if (!b_mesh->vertex_colors.empty())
             b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x);
           CData->curve_vcol.push_back_slow(vcol);
 
diff --git a/intern/cycles/blender/display_driver.cpp b/intern/cycles/blender/display_driver.cpp
index d5f6d85251e..abf421983b3 100644
--- a/intern/cycles/blender/display_driver.cpp
+++ b/intern/cycles/blender/display_driver.cpp
@@ -334,7 +334,7 @@ bool BlenderDisplayDriver::update_begin(const Params &params,
 
   /* Update PBO dimensions if needed.
    *
-   * NOTE: Allocate the PBO for the the size which will fit the final render resolution (as in,
+   * NOTE: Allocate the PBO for the size which will fit the final render resolution (as in,
    * at a resolution divider 1. This was we don't need to recreate graphics interoperability
    * objects which are costly and which are tied to the specific underlying buffer size.
    * The downside of this approach is that when graphics interoperability is not used we are
diff --git a/intern/cycles/blender/mesh.cpp b/intern/cycles/blender/mesh.cpp
index b69bf88c213..bb17cfdcb45 100644
--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -555,7 +555,7 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh,
 /* Create uv map attributes. */
 static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh)
 {
-  if (b_mesh.uv_layers.length() != 0) {
+  if (!b_mesh.uv_layers.empty()) {
     for (BL::MeshUVLoopLayer &l : b_mesh.uv_layers) {
       const bool active_render = l.active_render();
       AttributeStandard uv_std = (active_render) ? ATTR_STD_UV : ATTR_STD_NONE;
@@ -619,7 +619,7 @@ static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh)
 
 static void attr_create_subd_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivide_uvs)
 {
-  if (b_mesh.uv_layers.length() != 0) {
+  if (!b_mesh.uv_layers.empty()) {
     BL::Mesh::uv_layers_iterator l;
     int i = 0;
 
@@ -951,7 +951,7 @@ static void create_mesh(Scene *scene,
   N = attr_N->data_float3();
 
   /* create generated coordinates from undeformed coordinates */
-  const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.length() == 0) &&
+  const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.empty()) &&
                                     (mesh->need_attribute(scene, ATTR_STD_UV_TANGENT));
   if (mesh->need_attribute(scene, ATTR_STD_GENERATED) || need_default_tangent) {
     Attribute *attr = attributes.add(ATTR_STD_GENERATED);
diff --git a/intern/cycles/blender/sync.cpp b/intern/cycles/blender/sync.cpp
index 43625a2e479..949b7cb1b3b 100644
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -392,6 +392,12 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
     integrator->set_ao_bounces(0);
   }
 
+#ifdef WITH_CYCLES_DEBUG
+  DirectLightSamplingType direct_light_sampling_type = (DirectLightSamplingType)get_enum(
+      cscene, "direct_light_sampling_type", DIRECT_LIGHT_SAMPLING_NUM, DIRECT_LIGHT_SAMPLING_MIS);
+  integrator->set_direct_light_sampling_type(direct_light_sampling_type);
+#endif
+
   const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background);
   integrator->set_use_denoise(denoise_params.use);
 
@@ -835,18 +841,25 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
   /* samples */
   int samples = get_int(cscene, "samples");
   int preview_samples = get_int(cscene, "preview_samples");
+  int sample_offset = get_int(cscene, "sample_offset");
 
   if (background) {
     params.samples = samples;
+    params.sample_offset = sample_offset;
   }
   else {
     params.samples = preview_samples;
-    if (params.samples == 0)
+    if (params.samples == 0) {
       params.samples = INT_MAX;
+    }
+    params.sample_offset = 0;
   }
 
+  /* Clamp sample offset. */
+  params.sample_offset = clamp(params.sample_offset, 0, Integrator::MAX_SAMPLES);
+
   /* Clamp samples. */
-  params.samples = min(params.samples, Integrator::MAX_SAMPLES);
+  params.samples = clamp(params.samples, 0, Integrator::MAX_SAMPLES - params.sample_offset);
 
   /* Viewport Performance */
   params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
@@ -865,7 +878,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
 
   /* Time limit. */
   if (background) {
-    params.time_limit = get_float(cscene, "time_limit");
+    params.time_limit = (double)get_float(cscene, "time_limit");
   }
   else {
     /* For the viewport it kind of makes more sense to think in terms of the noise floor, which is
diff --git a/intern/cycles/blender/util.h b/intern/cycles/blender/util.h
index 33fd2c416c8..be36bcdaaa8 100644
--- a/intern/cycles/blender/util.h
+++ b/intern/cycles/blender/util.h
@@ -303,7 +303,7 @@ static inline string image_user_file_path(BL::ImageUser &iuser,
   string filepath_str = string(filepath);
   if (load_tiled && ima.source() == BL::Image::source_TILED) {
     string udim;
-    if (ima.tiles.length() > 0) {
+    if (!ima.tiles.empty()) {
       udim = to_string(ima.tiles[0].number());
     }
     string_replace(filepath_str, udim, "<UDIM>");
@@ -647,7 +647,7 @@ static inline Mesh::SubdivisionType object_subdivision_type(BL::Object &b_ob,
 {
   PointerRNA cobj = RNA_pointer_get(&b_ob.ptr, "cycles");
 
-  if (cobj.data && b_ob.modifiers.length() > 0 && experimental) {
+  if (cobj.data && !b_ob.modifiers.empty() && experimental) {
     BL::Modifier mod = b_ob.modifiers[b_ob.modifiers.length() - 1];
     bool enabled = preview ? mod.show_viewport() : mod.show_render();
 
diff --git a/intern/cycles/bvh/embree.cpp b/intern/cycles/bvh/embree.cpp
index 944a84ce0da..b54b38f2798 100644
--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -303,7 +303,7 @@ static void rtc_error_func(void *, enum RTCError, const char *str)
   VLOG(1) << str;
 }
 
-static double progress_start_time = 0.0f;
+static double progress_start_time = 0.0;
 
 static bool rtc_progress_func(void *user_ptr, const double n)
 {
diff --git a/intern/cycles/bvh/node.cpp b/intern/cycles/bvh/node.cpp
index d3a665adfe7..60b0843bde2 100644
--- a/intern/cycles/bvh/node.cpp
+++ b/intern/cycles/bvh/node.cpp
@@ -153,7 +153,7 @@ void BVHNode::update_time()
 namespace {
 
 struct DumpTraversalContext {
-  /* Descriptor of wile where writing is happening. */
+  /* Descriptor of file where writing is happening. */
   FILE *stream;
   /* Unique identifier of the node current. */
   int id;
diff --git a/intern/cycles/bvh/node.h b/intern/cycles/bvh/node.h
index d5de9e062fc..1fd9efdb75e 100644
--- a/intern/cycles/bvh/node.h
+++ b/intern/cycles/bvh/node.h
@@ -178,7 +178,7 @@ class InnerNode : public BVHNode {
     reset_unused_children();
   }
 
-  /* NOTE: This function is only used during binary BVH builder, and it
+  /* NOTE: This function is only used during binary BVH builder, and it's
    * supposed to be configured to have 2 children which will be filled-in in a
    * bit. But this is important to have children reset to NULL. */
   explicit InnerNode(const BoundBox &bounds) : BVHNode(bounds), num_children_(0)
diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake
index a470fb9c574..957b702fd3a 100644
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -88,7 +88,7 @@ endmacro()
 
 function(cycles_link_directories)
   if(APPLE)
-    # APPLE plaform uses full paths for linking libraries, and avoids link_directories.
+    # APPLE platform uses full paths for linking libraries, and avoids link_directories.
     return()
   endif()
 
diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp
index f11b49ef65f..5aabed8702a 100644
--- a/intern/cycles/device/cpu/device.cpp
+++ b/intern/cycles/device/cpu/device.cpp
@@ -38,7 +38,6 @@ void device_cpu_info(vector<DeviceInfo> &devices)
   info.id = "CPU";
   info.num = 0;
   info.has_osl = true;
-  info.has_half_images = true;
   info.has_nanovdb = true;
   info.has_profiling = true;
   if (openimagedenoise_supported()) {
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index 68dec7f0af2..2ad76de70ca 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -93,11 +93,6 @@ CPUDevice::~CPUDevice()
   texture_info.free();
 }
 
-bool CPUDevice::show_samples() const
-{
-  return (info.cpu_threads == 1);
-}
-
 BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
 {
   BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index 90d217bb624..6f9452a6378 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -60,8 +60,6 @@ class CPUDevice : public Device {
   CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
   ~CPUDevice();
 
-  virtual bool show_samples() const override;
-
   virtual BVHLayoutMask get_bvh_layout_mask() const override;
 
   /* Returns true if the texture info was copied to the device (meaning, some more
diff --git a/intern/cycles/device/cuda/device.cpp b/intern/cycles/device/cuda/device.cpp
index af2bdc6e29c..0d9e6c72466 100644
--- a/intern/cycles/device/cuda/device.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -144,7 +144,6 @@ void device_cuda_info(vector<DeviceInfo> &devices)
     info.description = string(name);
     info.num = num;
 
-    info.has_half_images = (major >= 3);
     info.has_nanovdb = true;
     info.denoisers = 0;
 
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 20945796a2d..e05fef3897c 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -46,12 +46,6 @@ bool CUDADevice::have_precompiled_kernels()
   return path_exists(cubins_path);
 }
 
-bool CUDADevice::show_samples() const
-{
-  /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
-  return true;
-}
-
 BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
 {
   return BVH_LAYOUT_BVH2;
@@ -242,6 +236,10 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
   cflags += " -DWITH_NANOVDB";
 #  endif
 
+#  ifdef WITH_CYCLES_DEBUG
+  cflags += " -DWITH_CYCLES_DEBUG";
+#  endif
+
   return cflags;
 }
 
@@ -932,7 +930,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
 {
   CUDAContextScope scope(this);
 
-  /* General variables for both architectures */
   string bind_name = mem.name;
   size_t dsize = datatype_size(mem.data_type);
   size_t size = mem.memory_size();
@@ -1095,7 +1092,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
 
   if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
       mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-    /* Kepler+, bindless textures. */
     CUDA_RESOURCE_DESC resDesc;
     memset(&resDesc, 0, sizeof(resDesc));
 
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
index 72d4108d1bf..4c357d0b5ab 100644
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -76,8 +76,6 @@ class CUDADevice : public Device {
 
   static bool have_precompiled_kernels();
 
-  virtual bool show_samples() const override;
-
   virtual BVHLayoutMask get_bvh_layout_mask() const override;
 
   void set_error(const string &error) override;
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 63d0a49d3eb..bfbcdb20d5e 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -286,7 +286,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
   info.description = "Multi Device";
   info.num = 0;
 
-  info.has_half_images = true;
   info.has_nanovdb = true;
   info.has_osl = true;
   info.has_profiling = true;
@@ -333,7 +332,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
     }
 
     /* Accumulate device info. */
-    info.has_half_images &= device.has_half_images;
     info.has_nanovdb &= device.has_nanovdb;
     info.has_osl &= device.has_osl;
     info.has_profiling &= device.has_profiling;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 65188459c2c..346632de314 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -73,7 +73,6 @@ class DeviceInfo {
   int num;
   bool display_device;        /* GPU is used as a display device. */
   bool has_nanovdb;           /* Support NanoVDB volumes. */
-  bool has_half_images;       /* Support half-float textures. */
   bool has_osl;               /* Support Open Shading Language. */
   bool has_profiling;         /* Supports runtime collection of profiling info. */
   bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
@@ -90,7 +89,6 @@ class DeviceInfo {
     num = 0;
     cpu_threads = 0;
     display_device = false;
-    has_half_images = false;
     has_nanovdb = false;
     has_osl = false;
     has_profiling = false;
@@ -151,10 +149,6 @@ class Device {
     fprintf(stderr, "%s\n", error.c_str());
     fflush(stderr);
   }
-  virtual bool show_samples() const
-  {
-    return false;
-  }
   virtual BVHLayoutMask get_bvh_layout_mask() const = 0;
 
   /* statistics */
diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp
index ecc109b2bb9..a9c7b1ba841 100644
--- a/intern/cycles/device/hip/device.cpp
+++ b/intern/cycles/device/hip/device.cpp
@@ -148,7 +148,6 @@ void device_hip_info(vector<DeviceInfo> &devices)
     info.description = string(name);
     info.num = num;
 
-    info.has_half_images = true;
     info.has_nanovdb = true;
     info.denoisers = 0;
 
diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp
index 42dd5382ae5..53c4f3f0b3f 100644
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -47,12 +47,6 @@ bool HIPDevice::have_precompiled_kernels()
   return path_exists(fatbins_path);
 }
 
-bool HIPDevice::show_samples() const
-{
-  /* The HIPDevice only processes one tile at a time, so showing samples is fine. */
-  return true;
-}
-
 BVHLayoutMask HIPDevice::get_bvh_layout_mask() const
 {
   return BVH_LAYOUT_BVH2;
@@ -243,7 +237,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
   hipGetDeviceProperties(&props, hipDevId);
 
   /* gcnArchName can contain tokens after the arch name with features, ie.
-    "gfx1010:sramecc-:xnack-" so we tokenize it to get the first part. */
+   * `gfx1010:sramecc-:xnack-` so we tokenize it to get the first part. */
   char *arch = strtok(props.gcnArchName, ":");
   if (arch == NULL) {
     arch = props.gcnArchName;
@@ -374,10 +368,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
 
 bool HIPDevice::load_kernels(const uint kernel_features)
 {
-  /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile.
+  /* TODO(sergey): Support kernels re-load for HIP devices adaptive compile.
    *
-   * Currently re-loading kernel will invalidate memory pointers,
-   * causing problems in cuCtxSynchronize.
+   * Currently re-loading kernels will invalidate memory pointers.
    */
   if (hipModule) {
     if (use_adaptive_compilation()) {
@@ -899,7 +892,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
 {
   HIPContextScope scope(this);
 
-  /* General variables for both architectures */
   string bind_name = mem.name;
   size_t dsize = datatype_size(mem.data_type);
   size_t size = mem.memory_size();
diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h
index eb832ad828c..08a7be57e9c 100644
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -75,8 +75,6 @@ class HIPDevice : public Device {
 
   static bool have_precompiled_kernels();
 
-  virtual bool show_samples() const override;
-
   virtual BVHLayoutMask get_bvh_layout_mask() const override;
 
   void set_error(const string &error) override;
@@ -93,9 +91,7 @@ class HIPDevice : public Device {
 
   virtual string compile_kernel_get_common_cflags(const uint kernel_features);
 
-  string compile_kernel(const uint kernel_features,
-                        const char *name,
-                        const char *base = "hip");
+  string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip");
 
   virtual bool load_kernels(const uint kernel_features) override;
   void reserve_local_memory(const uint kernel_features);
diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h
index 8314405e670..71c6893edbd 100644
--- a/intern/cycles/device/hip/graphics_interop.h
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -48,7 +48,7 @@ class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
   HIPDeviceQueue *queue_ = nullptr;
   HIPDevice *device_ = nullptr;
 
-  /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
   uint opengl_pbo_id_ = 0;
   /* Buffer area in pixels of the corresponding PBO. */
   int64_t buffer_area_ = 0;
diff --git a/intern/cycles/device/memory.cpp b/intern/cycles/device/memory.cpp
index 86bf2542c92..ba2d993fb9e 100644
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
 
 device_memory::device_memory(Device *device, const char *name, MemoryType type)
     : data_type(device_type_traits<uchar>::data_type),
-      data_elements(device_type_traits<uchar>::num_elements_cpu),
+      data_elements(device_type_traits<uchar>::num_elements),
       data_size(0),
       device_size(0),
       data_width(0),
diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h
index e04142117aa..cb22c191656 100644
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -81,155 +81,140 @@ static constexpr size_t datatype_size(DataType datatype)
 
 template<typename T> struct device_type_traits {
   static const DataType data_type = TYPE_UNKNOWN;
-  static const size_t num_elements_cpu = sizeof(T);
-  static const size_t num_elements_gpu = sizeof(T);
+  static const size_t num_elements = sizeof(T);
 };
 
 template<> struct device_type_traits<uchar> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uchar) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar2> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uchar2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar3> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uchar3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uchar4> {
   static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uchar4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint2> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uint2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint3> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uint3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint4> {
   static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uint4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(int) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int2> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(int2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int3> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<int4> {
   static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(float) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float2> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(float2) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float3> {
+  /* float3 has a different size depending on the device, so it can't be used for
+   * interchanging memory between CPU and GPU.
+   *
+   * Leave body empty to trigger a compile error if used. */
+};
+
+template<> struct device_type_traits<packed_float3> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(packed_float3) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<float4> {
   static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(float4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<half> {
   static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(half) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<ushort4> {
   static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(ushort4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint16_t> {
   static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint16_t) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<half4> {
   static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(half4) == num_elements * datatype_size(data_type));
 };
 
 template<> struct device_type_traits<uint64_t> {
   static const DataType data_type = TYPE_UINT64;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint64_t) == num_elements * datatype_size(data_type));
 };
 
 /* Device Memory
@@ -325,9 +310,7 @@ template<typename T> class device_only_memory : public device_memory {
       : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
   {
     data_type = device_type_traits<T>::data_type;
-    data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
-                                          device_type_traits<T>::num_elements_gpu,
-                        1);
+    data_elements = max(device_type_traits<T>::num_elements, 1);
   }
 
   device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -383,15 +366,11 @@ template<typename T> class device_only_memory : public device_memory {
 
 template<typename T> class device_vector : public device_memory {
  public:
-  /* Can only use this for types that have the same size on CPU and GPU. */
-  static_assert(device_type_traits<T>::num_elements_cpu ==
-                device_type_traits<T>::num_elements_gpu);
-
   device_vector(Device *device, const char *name, MemoryType type)
       : device_memory(device, name, type)
   {
     data_type = device_type_traits<T>::data_type;
-    data_elements = device_type_traits<T>::num_elements_cpu;
+    data_elements = device_type_traits<T>::num_elements;
     modified = true;
     need_realloc_ = true;
 
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 56efec3e131..e319246d4f4 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -109,14 +109,6 @@ class MultiDevice : public Device {
     return error_msg;
   }
 
-  virtual bool show_samples() const override
-  {
-    if (devices.size() > 1) {
-      return false;
-    }
-    return devices.front().device->show_samples();
-  }
-
   virtual BVHLayoutMask get_bvh_layout_mask() const override
   {
     BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 6e897e3831f..b82b1281eb8 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -208,11 +208,15 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   }
   else {
     module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
-    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
   }
 
   module_options.boundValues = nullptr;
   module_options.numBoundValues = 0;
+#  if OPTIX_ABI_VERSION >= 55
+  module_options.payloadTypes = nullptr;
+  module_options.numPayloadTypes = 0;
+#  endif
 
   OptixPipelineCompileOptions pipeline_options = {};
   /* Default to no motion blur and two-level graph, since it is the fastest option. */
@@ -227,7 +231,11 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
   if (kernel_features & KERNEL_FEATURE_HAIR) {
     if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+#  if OPTIX_ABI_VERSION >= 55
+      pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM;
+#  else
       pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+#  endif
     }
     else
       pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
@@ -324,7 +332,13 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
       /* Built-in thick curve intersection. */
       OptixBuiltinISOptions builtin_options = {};
+#  if OPTIX_ABI_VERSION >= 55
+      builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
+      builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+      builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable endcaps. */
+#  else
       builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+#  endif
       builtin_options.usesMotionBlur = false;
 
       optix_assert(optixBuiltinISModuleGet(
@@ -411,7 +425,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
   }
   else {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
   }
 
   if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
@@ -1178,6 +1192,15 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
               int ka = max(k0 - 1, curve.first_key);
               int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
 
+              index_data[i] = i * 4;
+              float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+
+#  if OPTIX_ABI_VERSION >= 55
+              v[0] = make_float4(keys[ka].x, keys[ka].y, keys[ka].z, curve_radius[ka]);
+              v[1] = make_float4(keys[k0].x, keys[k0].y, keys[k0].z, curve_radius[k0]);
+              v[2] = make_float4(keys[k1].x, keys[k1].y, keys[k1].z, curve_radius[k1]);
+              v[3] = make_float4(keys[kb].x, keys[kb].y, keys[kb].z, curve_radius[kb]);
+#  else
               const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
               const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
               const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
@@ -1190,8 +1213,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
               static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
               static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
 
-              index_data[i] = i * 4;
-              float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
               v[0] = make_float4(
                   dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
               v[1] = make_float4(
@@ -1200,6 +1221,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
                   dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
               v[3] = make_float4(
                   dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+#  endif
             }
             else {
               BoundBox bounds = BoundBox::empty;
@@ -1241,7 +1263,11 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       OptixBuildInput build_input = {};
       if (hair->curve_shape == CURVE_THICK) {
         build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+#  if OPTIX_ABI_VERSION >= 55
+        build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
+#  else
         build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+#  endif
         build_input.curveArray.numPrimitives = num_segments;
         build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
         build_input.curveArray.numVertices = num_vertices;
@@ -1422,9 +1448,12 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
           instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
         }
       }
-      else {
-        /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
-         * since it needs to filter out end-caps there).
+#  if OPTIX_ABI_VERSION < 55
+      /* Cannot disable any-hit program for thick curves, since it needs to filter out endcaps. */
+      else
+#  endif
+      {
+        /* Can disable __anyhit__kernel_optix_visibility_test by default.
          * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
          * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
          */
@@ -1494,9 +1523,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
         cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
         delete[] reinterpret_cast<uint8_t *>(&motion_transform);
 
-        /* Disable instance transform if object uses motion transform already. */
-        instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-
         /* Get traversable handle to motion transform. */
         optixConvertPointerToTraversableHandle(context,
                                                motion_transform_gpu,
@@ -1510,10 +1536,6 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
           /* Set transform matrix. */
           memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
         }
-        else {
-          /* Disable instance transform if geometry already has it applied to vertex data. */
-          instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-        }
       }
     }
 
diff --git a/intern/cycles/doc/license/readme.txt b/intern/cycles/doc/license/readme.txt
index cc5476e3511..f610d800ebc 100644
--- a/intern/cycles/doc/license/readme.txt
+++ b/intern/cycles/doc/license/readme.txt
@@ -3,7 +3,7 @@ This program uses code from various sources, the default license is Apache 2.0
 for all code, with the following exceptions.
 
 Modified BSD License
-* Code adapated from Open Shading Language
+* Code adapted from Open Shading Language
 * Sobol direction vectors
 * Matrix inversion code from OpenEXR
 * MD5 Hash code
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 92bf8e69d19..ec90681b78a 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -380,7 +380,10 @@ void PathTrace::path_trace(RenderWork &render_work)
     PathTraceWork *path_trace_work = path_trace_works_[i].get();
 
     PathTraceWork::RenderStatistics statistics;
-    path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+    path_trace_work->render_samples(statistics,
+                                    render_work.path_trace.start_sample,
+                                    num_samples,
+                                    render_work.path_trace.sample_offset);
 
     const double work_time = time_dt() - work_start_time;
     work_balance_infos_[i].time_spent += work_time;
@@ -850,7 +853,8 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work)
     const uint64_t num_samples_added = uint64_t(tile_size.x) * tile_size.y *
                                        render_work.path_trace.num_samples;
     const int current_sample = render_work.path_trace.start_sample +
-                               render_work.path_trace.num_samples;
+                               render_work.path_trace.num_samples -
+                               render_work.path_trace.sample_offset;
     progress_->add_samples(num_samples_added, current_sample);
   }
 
diff --git a/intern/cycles/integrator/path_trace_display.h b/intern/cycles/integrator/path_trace_display.h
index b69ee85fbbc..47014f43afa 100644
--- a/intern/cycles/integrator/path_trace_display.h
+++ b/intern/cycles/integrator/path_trace_display.h
@@ -76,7 +76,7 @@ class PathTraceDisplay {
 
   /* Copy buffer of rendered pixels of a given size into a given position of the texture.
    *
-   * This function does not acquire a lock. The reason for this is is to allow use of this function
+   * This function does not acquire a lock. The reason for this is to allow use of this function
    * for partial updates from different devices. In this case the caller will acquire the lock
    * once, update all the slices and release
    * the lock once. This will ensure that draw() will never use partially updated texture. */
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index 0dc7cd2f896..2ebfc913580 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -75,7 +75,10 @@ class PathTraceWork {
 
   /* Render given number of samples as a synchronous blocking call.
    * The samples are added to the render buffer associated with this work. */
-  virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num,
+                              int sample_offset) = 0;
 
   /* Copy render result from this work to the corresponding place of the GPU display.
    *
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index 12dcc899dbb..2f6c3cf5aca 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -71,7 +71,8 @@ void PathTraceWorkCPU::init_execution()
 
 void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
                                       int start_sample,
-                                      int samples_num)
+                                      int samples_num,
+                                      int sample_offset)
 {
   const int64_t image_width = effective_buffer_params_.width;
   const int64_t image_height = effective_buffer_params_.height;
@@ -99,6 +100,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
       work_tile.w = 1;
       work_tile.h = 1;
       work_tile.start_sample = start_sample;
+      work_tile.sample_offset = sample_offset;
       work_tile.num_samples = 1;
       work_tile.offset = effective_buffer_params_.offset;
       work_tile.stride = effective_buffer_params_.stride;
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 6e734690811..63ab686588c 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -48,7 +48,8 @@ class PathTraceWorkCPU : public PathTraceWork {
 
   virtual void render_samples(RenderStatistics &statistics,
                               int start_sample,
-                              int samples_num) override;
+                              int samples_num,
+                              int sample_offset) override;
 
   virtual void copy_to_display(PathTraceDisplay *display,
                                PassMode pass_mode,
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index b9784f68f56..956aa6a8c90 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -250,7 +250,8 @@ void PathTraceWorkGPU::init_execution()
 
 void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
                                       int start_sample,
-                                      int samples_num)
+                                      int samples_num,
+                                      int sample_offset)
 {
   /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
    * add more work (because tiles are smaller, so there is higher chance that more paths will
@@ -261,6 +262,7 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
   work_tile_scheduler_.reset(effective_buffer_params_,
                              start_sample,
                              samples_num,
+                             sample_offset,
                              device_scene_->data.integrator.scrambling_distance);
 
   enqueue_reset();
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index c5e291e72db..5aa497c26e7 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -46,7 +46,8 @@ class PathTraceWorkGPU : public PathTraceWork {
 
   virtual void render_samples(RenderStatistics &statistics,
                               int start_sample,
-                              int samples_num) override;
+                              int samples_num,
+                              int sample_offset) override;
 
   virtual void copy_to_display(PathTraceDisplay *display,
                                PassMode pass_mode,
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index 276453f7aec..971173a5e96 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -88,6 +88,16 @@ int RenderScheduler::get_num_samples() const
   return num_samples_;
 }
 
+void RenderScheduler::set_sample_offset(int sample_offset)
+{
+  sample_offset_ = sample_offset;
+}
+
+int RenderScheduler::get_sample_offset() const
+{
+  return sample_offset_;
+}
+
 void RenderScheduler::set_time_limit(double time_limit)
 {
   time_limit_ = time_limit;
@@ -110,13 +120,15 @@ int RenderScheduler::get_num_rendered_samples() const
   return state_.num_rendered_samples;
 }
 
-void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples, int sample_offset)
 {
   buffer_params_ = buffer_params;
 
   update_start_resolution_divider();
 
   set_num_samples(num_samples);
+  set_start_sample(sample_offset);
+  set_sample_offset(sample_offset);
 
   /* In background mode never do lower resolution render preview, as it is not really supported
    * by the software. */
@@ -171,7 +183,7 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
 
 void RenderScheduler::reset_for_next_tile()
 {
-  reset(buffer_params_, num_samples_);
+  reset(buffer_params_, num_samples_, sample_offset_);
 }
 
 bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
@@ -317,6 +329,7 @@ RenderWork RenderScheduler::get_render_work()
 
   render_work.path_trace.start_sample = get_start_sample_to_path_trace();
   render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+  render_work.path_trace.sample_offset = get_sample_offset();
 
   render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
 
@@ -835,7 +848,7 @@ int RenderScheduler::get_num_samples_to_path_trace() const
      * When time limit is not used the number of samples per render iteration is either increasing
      * or stays the same, so there is no need to clamp number of samples calculated for occupancy.
      */
-    if (time_limit_ && state_.start_render_time) {
+    if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
       const double remaining_render_time = max(
           0.0, time_limit_ - (time_dt() - state_.start_render_time));
       const double time_per_sample_average = path_trace_time_.get_average();
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
index d7b7413ae31..28f563c46e3 100644
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -39,6 +39,7 @@ class RenderWork {
   struct {
     int start_sample = 0;
     int num_samples = 0;
+    int sample_offset = 0;
   } path_trace;
 
   struct {
@@ -125,6 +126,9 @@ class RenderScheduler {
   void set_num_samples(int num_samples);
   int get_num_samples() const;
 
+  void set_sample_offset(int sample_offset);
+  int get_sample_offset() const;
+
   /* Time limit for the path tracing tasks, in minutes.
    * Zero disables the limit. */
   void set_time_limit(double time_limit);
@@ -150,7 +154,7 @@ class RenderScheduler {
 
   /* Reset scheduler, indicating that rendering will happen from scratch.
    * Resets current rendered state, as well as scheduling information. */
-  void reset(const BufferParams &buffer_params, int num_samples);
+  void reset(const BufferParams &buffer_params, int num_samples, int sample_offset);
 
   /* Reset scheduler upon switching to a next tile.
    * Will keep the same number of samples and full-frame render parameters, but will reset progress
@@ -419,6 +423,8 @@ class RenderScheduler {
   int start_sample_ = 0;
   int num_samples_ = 0;
 
+  int sample_offset_ = 0;
+
   /* Limit in seconds for how long path tracing is allowed to happen.
    * Zero means no limit is applied. */
   double time_limit_ = 0.0;
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
index 2d1ac07db7f..d60f7149bf4 100644
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -36,6 +36,7 @@ void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
 void WorkTileScheduler::reset(const BufferParams &buffer_params,
                               int sample_start,
                               int samples_num,
+                              int sample_offset,
                               float scrambling_distance)
 {
   /* Image buffer parameters. */
@@ -51,6 +52,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
   /* Samples parameters. */
   sample_start_ = sample_start;
   samples_num_ = samples_num;
+  sample_offset_ = sample_offset;
 
   /* Initialize new scheduling. */
   reset_scheduler_state();
@@ -111,6 +113,7 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_
   work_tile.h = tile_size_.height;
   work_tile.start_sample = sample_start_ + start_sample;
   work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+  work_tile.sample_offset = sample_offset_;
   work_tile.offset = offset_;
   work_tile.stride = stride_;
 
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
index d9fa7e84431..2d6395799f7 100644
--- a/intern/cycles/integrator/work_tile_scheduler.h
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -41,6 +41,7 @@ class WorkTileScheduler {
   void reset(const BufferParams &buffer_params,
              int sample_start,
              int samples_num,
+             int sample_offset,
              float scrambling_distance);
 
   /* Get work for a device.
@@ -79,6 +80,7 @@ class WorkTileScheduler {
    * (splitting into a smaller work tiles). */
   int sample_start_ = 0;
   int samples_num_ = 0;
+  int sample_offset_ = 0;
 
   /* Tile size which be scheduled for rendering. */
   TileSize tile_size_;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 1a254f5eddc..d759399b04d 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP
   device/hip/kernel.cpp
 )
 
+set(SRC_KERNEL_DEVICE_METAL
+  device/metal/kernel.metal
+)
+
 set(SRC_KERNEL_DEVICE_OPTIX
   device/optix/kernel.cu
   device/optix/kernel_shader_raytrace.cu
@@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
   device/optix/globals.h
 )
 
+set(SRC_KERNEL_DEVICE_METAL_HEADERS
+  device/metal/compat.h
+  device/metal/context_begin.h
+  device/metal/context_end.h
+  device/metal/globals.h
+)
+
 set(SRC_KERNEL_CLOSURE_HEADERS
   closure/alloc.h
   closure/bsdf.h
@@ -262,6 +273,7 @@ set(SRC_KERNEL_UTIL_HEADERS
 )
 
 set(SRC_KERNEL_TYPES_HEADERS
+  tables.h
   textures.h
   types.h
 )
@@ -399,12 +411,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
       -I ${CMAKE_CURRENT_SOURCE_DIR}/..
       -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
       --use_fast_math
-      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
-
-    if(${experimental})
-      set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
-      set(name ${name}_experimental)
-    endif()
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}
+      -Wno-deprecated-gpu-targets)
 
     if(WITH_NANOVDB)
       set(cuda_flags ${cuda_flags}
@@ -412,6 +420,10 @@ if(WITH_CYCLES_CUDA_BINARIES)
         -I "${NANOVDB_INCLUDE_DIR}")
     endif()
 
+    if(WITH_CYCLES_DEBUG)
+      set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+    endif()
+
     if(WITH_CYCLES_CUBIN_COMPILER)
       string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)
 
@@ -560,11 +572,6 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
       -ffast-math
       -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})
 
-    if(${experimental})
-      set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
-      set(name ${name}_experimental)
-    endif()
-
     if(WITH_NANOVDB)
       set(hip_flags ${hip_flags}
         -D WITH_NANOVDB
@@ -572,7 +579,7 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
     endif()
 
     if(WITH_CYCLES_DEBUG)
-      set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
+      set(hip_flags ${hip_flags} -D WITH_CYCLES_DEBUG)
     endif()
 
     add_custom_command(
@@ -613,6 +620,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
         -I "${NANOVDB_INCLUDE_DIR}")
     endif()
 
+    if(WITH_CYCLES_DEBUG)
+      set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+    endif()
+
     if(WITH_CYCLES_CUBIN_COMPILER)
       # Needed to find libnvrtc-builtins.so. Can't do it from inside
       # cycles_cubin_cc since the env variable is read before main()
@@ -701,7 +712,7 @@ if(WITH_COMPILER_ASAN)
     string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=all")
     string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
   elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
-    # With OSL, Cycles disables rtti in some modules, wich then breaks at linking
+    # With OSL, Cycles disables rtti in some modules, which then breaks at linking
     # when trying to use vptr sanitizer (included into 'undefined' general option).
     string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=vptr")
     string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
@@ -729,12 +740,14 @@ cycles_add_library(cycles_kernel "${LIB}"
   ${SRC_KERNEL_DEVICE_CUDA}
   ${SRC_KERNEL_DEVICE_HIP}
   ${SRC_KERNEL_DEVICE_OPTIX}
+  ${SRC_KERNEL_DEVICE_METAL}
   ${SRC_KERNEL_HEADERS}
   ${SRC_KERNEL_DEVICE_CPU_HEADERS}
   ${SRC_KERNEL_DEVICE_GPU_HEADERS}
   ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
   ${SRC_KERNEL_DEVICE_HIP_HEADERS}
   ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
+  ${SRC_KERNEL_DEVICE_METAL_HEADERS}
 )
 
 source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -746,6 +759,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_
 source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
 source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
 source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
+source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
 source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
 source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -778,6 +792,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index 8686f887021..26ba136dd79 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
     swapped = false;
     for (int j = 0; j < num_hits - 1; ++j) {
       if (hits[j].t > hits[j + 1].t) {
-        struct Intersection tmp_hit = hits[j];
+        Intersection tmp_hit = hits[j];
         float3 tmp_Ng = Ng[j];
         hits[j] = hits[j + 1];
         Ng[j] = Ng[j + 1];
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
index dd0327b3f94..746e48b9880 100644
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -18,6 +18,7 @@
 
 #pragma once
 
+#include "kernel/tables.h"
 #include "kernel/types.h"
 #include "kernel/util/profiling.h"
 
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 1ee82e6eb7c..658dec102b1 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -52,8 +52,9 @@ typedef unsigned long long uint64_t;
 #endif
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -75,6 +76,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp)))
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -84,7 +86,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
index 46196dcdb51..003881d7912 100644
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -92,12 +92,29 @@
 
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. */
-
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
   extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                                GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                    (block_num_threads * thread_num_registers))
 
+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */
 
 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 95a37c693ae..0900a45c83d 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)
 
 /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
 template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+                                                      float x,
+                                                      float y)
 {
   ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
 
@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
 /* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
 ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
 {
   ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
 
@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
 
 template<typename T>
 ccl_device_noinline T kernel_tex_image_interp_nanovdb(
-    const TextureInfo &info, float x, float y, float z, uint interpolation)
+    ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
 {
   using namespace nanovdb;
 
@@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
   /* float4, byte4, ushort4 and half4 */
   const int texture_type = info.data_type;
@@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                              float3 P,
                                              InterpolationType interp)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
   if (info.use_transform_3d) {
     P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 56fcc38b907..22e2a61a06d 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -21,6 +21,13 @@
 #include "kernel/device/gpu/parallel_sorted_index.h"
 #include "kernel/device/gpu/work_stealing.h"
 
+/* Include constant tables before entering Metal's context class scope (context_begin.h). */
+#include "kernel/tables.h"
+
+#ifdef __KERNEL_METAL__
+#  include "kernel/device/metal/context_begin.h"
+#endif
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -40,6 +47,11 @@
 #include "kernel/bake/bake.h"
 
 #include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+#  include "kernel/device/metal/context_end.h"
+#endif
+
 #include "kernel/film/read.h"
 
 /* --------------------------------------------------------------------
@@ -47,7 +59,7 @@
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_reset(int num_states)
+    ccl_gpu_kernel_signature(integrator_reset, int num_states)
 {
   const int state = ccl_gpu_global_id_x();
 
@@ -58,10 +70,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
-                                           const int num_tiles,
-                                           float *render_buffer,
-                                           const int max_tile_work_size)
+    ccl_gpu_kernel_signature(integrator_init_from_camera,
+                             ccl_global KernelWorkTile *tiles,
+                             const int num_tiles,
+                             ccl_global float *render_buffer,
+                             const int max_tile_work_size)
 {
   const int work_index = ccl_gpu_global_id_x();
 
@@ -72,7 +85,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int tile_index = work_index / max_tile_work_size;
   const int tile_work_index = work_index - tile_index * max_tile_work_size;
 
-  const KernelWorkTile *tile = &tiles[tile_index];
+  ccl_global const KernelWorkTile *tile = &tiles[tile_index];
 
   if (tile_work_index >= tile->work_size) {
     return;
@@ -83,14 +96,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   uint x, y, sample;
   get_work_pixel(tile, tile_work_index, &x, &y, &sample);
 
-  integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+  ccl_gpu_kernel_call(
+      integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
-                                         const int num_tiles,
-                                         float *render_buffer,
-                                         const int max_tile_work_size)
+    ccl_gpu_kernel_signature(integrator_init_from_bake,
+                             ccl_global KernelWorkTile *tiles,
+                             const int num_tiles,
+                             ccl_global float *render_buffer,
+                             const int max_tile_work_size)
 {
   const int work_index = ccl_gpu_global_id_x();
 
@@ -101,7 +116,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int tile_index = work_index / max_tile_work_size;
   const int tile_work_index = work_index - tile_index * max_tile_work_size;
 
-  const KernelWorkTile *tile = &tiles[tile_index];
+  ccl_global const KernelWorkTile *tile = &tiles[tile_index];
 
   if (tile_work_index >= tile->work_size) {
     return;
@@ -112,230 +127,264 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   uint x, y, sample;
   get_work_pixel(tile, tile_work_index, &x, &y, &sample);
 
-  integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+  ccl_gpu_kernel_call(
+      integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_closest(const int *path_index_array,
-                                            ccl_global float *render_buffer,
-                                            const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_closest,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_closest(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_shadow,
+                             ccl_global const int *path_index_array,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_shadow(NULL, state);
+    ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_subsurface,
+                             ccl_global const int *path_index_array,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_subsurface(NULL, state);
+    ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
+                             ccl_global const int *path_index_array,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_volume_stack(NULL, state);
+    ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_background(const int *path_index_array,
-                                           float *render_buffer,
-                                           const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_background,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_background(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_light(const int *path_index_array,
-                                      float *render_buffer,
-                                      const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_light,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_light(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_shadow(const int *path_index_array,
-                                       float *render_buffer,
-                                       const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_shadow,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_shadow(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_surface(const int *path_index_array,
-                                        float *render_buffer,
-                                        const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_surface,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_surface(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
-                                                 float *render_buffer,
-                                                 const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_surface_raytrace(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_volume(const int *path_index_array,
-                                       float *render_buffer,
-                                       const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_volume,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_volume(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
   }
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_queued_paths_array(int num_states,
-                                             int *indices,
-                                             int *num_indices,
-                                             int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_queued_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int kernel_index)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
+                        int kernel_index);
+  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [kernel](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
-                                                    int *indices,
-                                                    int *num_indices,
-                                                    int kernel)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int kernel_index)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
+                        int kernel_index);
+  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [kernel](const int state) {
-        return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_active_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_terminated_paths_array(int num_states,
-                                                 int *indices,
-                                                 int *num_indices,
-                                                 int indices_offset)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_terminated_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int indices_offset)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices + indices_offset, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
-      });
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
-                                                        int *indices,
-                                                        int *num_indices,
-                                                        int indices_offset)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int indices_offset)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices + indices_offset, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
-      });
-}
-
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_sorted_paths_array(int num_states,
-                                             int num_states_limit,
-                                             int *indices,
-                                             int *num_indices,
-                                             int *key_counter,
-                                             int *key_prefix_sum,
-                                             int kernel)
-{
-  gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states,
-      num_states_limit,
-      indices,
-      num_indices,
-      key_counter,
-      key_prefix_sum,
-      [kernel](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
-                   INTEGRATOR_STATE(state, path, shader_sort_key) :
-                   GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
-      });
-}
-
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_paths_array(int num_states,
-                                              int *indices,
-                                              int *num_indices,
-                                              int num_active_paths)
-{
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sorted_paths_array,
+                             int num_states,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             ccl_global int *key_counter,
+                             ccl_global int *key_prefix_sum,
+                             int kernel_index)
+{
+  ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
+                            INTEGRATOR_STATE(state, path, shader_sort_key) :
+                            GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
+                        int kernel_index);
+  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
+
+  const uint state_index = ccl_gpu_global_id_x();
+  gpu_parallel_sorted_index_array(state_index,
+                                  num_states,
+                                  num_states_limit,
+                                  indices,
+                                  num_indices,
+                                  key_counter,
+                                  key_prefix_sum,
+                                  ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int num_active_paths)
+{
+  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+                            (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+                        int num_active_paths);
+  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [num_active_paths](const int state) {
-        return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_states(const int *active_terminated_states,
-                                         const int active_states_offset,
-                                         const int terminated_states_offset,
-                                         const int work_size)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_states,
+                             ccl_global const int *active_terminated_states,
+                             const int active_states_offset,
+                             const int terminated_states_offset,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
@@ -343,28 +392,32 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];
 
-    integrator_state_move(NULL, to_state, from_state);
+    ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
   }
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
-                                                     int *indices,
-                                                     int *num_indices,
-                                                     int num_active_paths)
+ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int num_active_paths)
 {
+  ccl_gpu_kernel_lambda((state >= num_active_paths) &&
+                            (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
+                        int num_active_paths);
+  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [num_active_paths](const int state) {
-        return (state >= num_active_paths) &&
-               (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
-                                                const int active_states_offset,
-                                                const int terminated_states_offset,
-                                                const int work_size)
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_shadow_states,
+                             ccl_global const int *active_terminated_states,
+                             const int active_states_offset,
+                             const int terminated_states_offset,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
@@ -372,15 +425,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];
 
-    integrator_shadow_state_move(NULL, to_state, from_state);
+    ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
   }
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
+ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
+    prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
 {
-  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
-      counter, prefix_sum, num_values);
+  gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
 }
 
 /* --------------------------------------------------------------------
@@ -388,16 +440,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
-                                                   int sx,
-                                                   int sy,
-                                                   int sw,
-                                                   int sh,
-                                                   float threshold,
-                                                   bool reset,
-                                                   int offset,
-                                                   int stride,
-                                                   uint *num_active_pixels)
+    ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
+                             ccl_global float *render_buffer,
+                             int sx,
+                             int sy,
+                             int sw,
+                             int sh,
+                             float threshold,
+                             bool reset,
+                             int offset,
+                             int stride,
+                             ccl_global uint *num_active_pixels)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / sw;
@@ -406,37 +459,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   bool converged = true;
 
   if (x < sw && y < sh) {
-    converged = kernel_adaptive_sampling_convergence_check(
-        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+    converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
   }
 
   /* NOTE: All threads specified in the mask must execute the intrinsic. */
-  const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+  const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
   const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
   if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+    atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_adaptive_sampling_filter_x(
-        float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+    ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
+                             ccl_global float *render_buffer,
+                             int sx,
+                             int sy,
+                             int sw,
+                             int sh,
+                             int offset,
+                             int stride)
 {
   const int y = ccl_gpu_global_id_x();
 
   if (y < sh) {
-    kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+    ccl_gpu_kernel_call(
+        kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_adaptive_sampling_filter_y(
-        float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+    ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
+                             ccl_global float *render_buffer,
+                             int sx,
+                             int sy,
+                             int sw,
+                             int sh,
+                             int offset,
+                             int stride)
 {
   const int x = ccl_gpu_global_id_x();
 
   if (x < sw) {
-    kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+    ccl_gpu_kernel_call(
+        kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
   }
 }
 
@@ -445,12 +512,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+    ccl_gpu_kernel_signature(cryptomatte_postprocess,
+                             ccl_global float *render_buffer,
+                             int num_pixels)
 {
   const int pixel_index = ccl_gpu_global_id_x();
 
   if (pixel_index < num_pixels) {
-    kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+    ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
   }
 }
 
@@ -458,36 +527,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  * Film.
  */
 
-/* Common implementation for float destination. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
-                                                      float *pixels,
-                                                      float *render_buffer,
-                                                      int num_pixels,
-                                                      int width,
-                                                      int offset,
-                                                      int stride,
-                                                      int dst_offset,
-                                                      int dst_stride,
-                                                      const Processor &processor)
-{
-  const int render_pixel_index = ccl_gpu_global_id_x();
-  if (render_pixel_index >= num_pixels) {
-    return;
-  }
-
-  const int x = render_pixel_index % width;
-  const int y = render_pixel_index / width;
-
-  ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
-                                   y * stride * kfilm_convert->pass_stride;
-
-  ccl_global float *pixel = pixels +
-                            (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
-
-  processor(kfilm_convert, buffer, pixel);
-}
-
 ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
                                                           const int rgba_offset,
                                                           const int rgba_stride,
@@ -508,177 +547,95 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
 #endif
 }
 
-/* Common implementation for half4 destination and 4-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
-    const KernelFilmConvert *kfilm_convert,
-    uchar4 *rgba,
-    float *render_buffer,
-    int num_pixels,
-    int width,
-    int offset,
-    int stride,
-    int rgba_offset,
-    int rgba_stride,
-    const Processor &processor)
-{
-  const int render_pixel_index = ccl_gpu_global_id_x();
-  if (render_pixel_index >= num_pixels) {
-    return;
-  }
-
-  const int x = render_pixel_index % width;
-  const int y = render_pixel_index / width;
-
-  ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
-                                   y * stride * kfilm_convert->pass_stride;
-
-  float pixel[4];
-  processor(kfilm_convert, buffer, pixel);
-
-  film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
-
-  const half4 half_pixel = float4_to_half4_display(
-      make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
-  kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel);
-}
-
-/* Common implementation for half4 destination and 3-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
-    const KernelFilmConvert *kfilm_convert,
-    uchar4 *rgba,
-    float *render_buffer,
-    int num_pixels,
-    int width,
-    int offset,
-    int stride,
-    int rgba_offset,
-    int rgba_stride,
-    const Processor &processor)
-{
-  kernel_gpu_film_convert_half_rgba_common_rgba(
-      kfilm_convert,
-      rgba,
-      render_buffer,
-      num_pixels,
-      width,
-      offset,
-      stride,
-      rgba_offset,
-      rgba_stride,
-      [&processor](const KernelFilmConvert *kfilm_convert,
-                   ccl_global const float *buffer,
-                   float *pixel_rgba) {
-        processor(kfilm_convert, buffer, pixel_rgba);
-        pixel_rgba[3] = 1.0f;
-      });
-}
-
-/* Common implementation for half4 destination and single channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
-    const KernelFilmConvert *kfilm_convert,
-    uchar4 *rgba,
-    float *render_buffer,
-    int num_pixels,
-    int width,
-    int offset,
-    int stride,
-    int rgba_offset,
-    int rgba_stride,
-    const Processor &processor)
-{
-  kernel_gpu_film_convert_half_rgba_common_rgba(
-      kfilm_convert,
-      rgba,
-      render_buffer,
-      num_pixels,
-      width,
-      offset,
-      stride,
-      rgba_offset,
-      rgba_stride,
-      [&processor](const KernelFilmConvert *kfilm_convert,
-                   ccl_global const float *buffer,
-                   float *pixel_rgba) {
-        float value;
-        processor(kfilm_convert, buffer, &value);
-
-        pixel_rgba[0] = value;
-        pixel_rgba[1] = value;
-        pixel_rgba[2] = value;
-        pixel_rgba[3] = 1.0f;
-      });
-}
-
-#define KERNEL_FILM_CONVERT_PROC(name) \
-  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
-
-#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
-  KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
-  (const KernelFilmConvert kfilm_convert, \
-   float *pixels, \
-   float *render_buffer, \
-   int num_pixels, \
-   int width, \
-   int offset, \
-   int stride, \
-   int rgba_offset, \
-   int rgba_stride) \
+#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
+  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+      ccl_gpu_kernel_signature(film_convert_##variant, \
+                               const KernelFilmConvert kfilm_convert, \
+                               ccl_global float *pixels, \
+                               ccl_global float *render_buffer, \
+                               int num_pixels, \
+                               int width, \
+                               int offset, \
+                               int stride, \
+                               int rgba_offset, \
+                               int rgba_stride) \
   { \
-    kernel_gpu_film_convert_common(&kfilm_convert, \
-                                   pixels, \
-                                   render_buffer, \
-                                   num_pixels, \
-                                   width, \
-                                   offset, \
-                                   stride, \
-                                   rgba_offset, \
-                                   rgba_stride, \
-                                   film_get_pass_pixel_##variant); \
+    const int render_pixel_index = ccl_gpu_global_id_x(); \
+    if (render_pixel_index >= num_pixels) { \
+      return; \
+    } \
+\
+    const int x = render_pixel_index % width; \
+    const int y = render_pixel_index / width; \
+\
+    ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+                                     y * stride * kfilm_convert.pass_stride; \
+\
+    ccl_global float *pixel = pixels + \
+                              (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
+\
+    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
   } \
-  KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
-  (const KernelFilmConvert kfilm_convert, \
-   uchar4 *rgba, \
-   float *render_buffer, \
-   int num_pixels, \
-   int width, \
-   int offset, \
-   int stride, \
-   int rgba_offset, \
-   int rgba_stride) \
+\
+  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+      ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
+                               const KernelFilmConvert kfilm_convert, \
+                               ccl_global uchar4 *rgba, \
+                               ccl_global float *render_buffer, \
+                               int num_pixels, \
+                               int width, \
+                               int offset, \
+                               int stride, \
+                               int rgba_offset, \
+                               int rgba_stride) \
   { \
-    kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
-                                                        rgba, \
-                                                        render_buffer, \
-                                                        num_pixels, \
-                                                        width, \
-                                                        offset, \
-                                                        stride, \
-                                                        rgba_offset, \
-                                                        rgba_stride, \
-                                                        film_get_pass_pixel_##variant); \
-  }
-
-KERNEL_FILM_CONVERT_DEFINE(depth, value)
-KERNEL_FILM_CONVERT_DEFINE(mist, value)
-KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
-KERNEL_FILM_CONVERT_DEFINE(float, value)
-
-KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
-KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
-
-KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
-KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
-KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
-
-#undef KERNEL_FILM_CONVERT_DEFINE
-#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
-#undef KERNEL_FILM_CONVERT_PROC
+    const int render_pixel_index = ccl_gpu_global_id_x(); \
+    if (render_pixel_index >= num_pixels) { \
+      return; \
+    } \
+\
+    const int x = render_pixel_index % width; \
+    const int y = render_pixel_index / width; \
+\
+    ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+                                     y * stride * kfilm_convert.pass_stride; \
+\
+    float pixel[4]; \
+    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+    if (input_channel_count == 1) { \
+      pixel[1] = pixel[2] = pixel[0]; \
+    } \
+    if (input_channel_count <= 3) { \
+      pixel[3] = 1.0f; \
+    } \
+\
+    film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+    const half4 half_pixel = float4_to_half4_display( \
+        make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
+    kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
+  }
+
+/* 1 channel inputs (half RGBA output replicates the value to RGB, alpha is 1). */
+KERNEL_FILM_CONVERT_VARIANT(depth, 1)
+KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
+KERNEL_FILM_CONVERT_VARIANT(float, 1)
+
+/* 3 channel inputs (half RGBA output sets alpha to 1). */
+KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(float3, 3)
+
+/* 4 channel inputs (half RGBA output keeps the alpha written by the pass accessor). */
+KERNEL_FILM_CONVERT_VARIANT(motion, 4)
+KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
+KERNEL_FILM_CONVERT_VARIANT(combined, 4)
+KERNEL_FILM_CONVERT_VARIANT(float4, 4)
+
+#undef KERNEL_FILM_CONVERT_VARIANT
 
 /* --------------------------------------------------------------------
  * Shader evaluation.
@@ -687,42 +644,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
 /* Displacement */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
-                                    float *output,
-                                    const int offset,
-                                    const int work_size)
+    ccl_gpu_kernel_signature(shader_eval_displace,
+                             ccl_global KernelShaderEvalInput *input,
+                             ccl_global float *output,
+                             const int offset,
+                             const int work_size)
 {
   int i = ccl_gpu_global_id_x();
   if (i < work_size) {
-    kernel_displace_evaluate(NULL, input, output, offset + i);
+    ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
   }
 }
 
 /* Background */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
-                                      float *output,
-                                      const int offset,
-                                      const int work_size)
+    ccl_gpu_kernel_signature(shader_eval_background,
+                             ccl_global KernelShaderEvalInput *input,
+                             ccl_global float *output,
+                             const int offset,
+                             const int work_size)
 {
   int i = ccl_gpu_global_id_x();
   if (i < work_size) {
-    kernel_background_evaluate(NULL, input, output, offset + i);
+    ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
   }
 }
 
 /* Curve Shadow Transparency */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input,
-                                                     float *output,
-                                                     const int offset,
-                                                     const int work_size)
+    ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
+                             ccl_global KernelShaderEvalInput *input,
+                             ccl_global float *output,
+                             const int offset,
+                             const int work_size)
 {
   int i = ccl_gpu_global_id_x();
   if (i < work_size) {
-    kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i);
+    ccl_gpu_kernel_call(
+        kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
   }
 }
 
@@ -731,15 +692,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_color_preprocess(float *render_buffer,
-                                       int full_x,
-                                       int full_y,
-                                       int width,
-                                       int height,
-                                       int offset,
-                                       int stride,
-                                       int pass_stride,
-                                       int pass_denoised)
+    ccl_gpu_kernel_signature(filter_color_preprocess,
+                             ccl_global float *render_buffer,
+                             int full_x,
+                             int full_y,
+                             int width,
+                             int height,
+                             int offset,
+                             int stride,
+                             int pass_stride,
+                             int pass_denoised)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -750,31 +712,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
-  float *buffer = render_buffer + render_pixel_index * pass_stride;
+  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
 
-  float *color_out = buffer + pass_denoised;
+  ccl_global float *color_out = buffer + pass_denoised;
   color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
   color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
   color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
-                                         int guiding_pass_stride,
-                                         int guiding_pass_albedo,
-                                         int guiding_pass_normal,
-                                         const float *render_buffer,
-                                         int render_offset,
-                                         int render_stride,
-                                         int render_pass_stride,
-                                         int render_pass_sample_count,
-                                         int render_pass_denoising_albedo,
-                                         int render_pass_denoising_normal,
-                                         int full_x,
-                                         int full_y,
-                                         int width,
-                                         int height,
-                                         int num_samples)
+    ccl_gpu_kernel_signature(filter_guiding_preprocess,
+                             ccl_global float *guiding_buffer,
+                             int guiding_pass_stride,
+                             int guiding_pass_albedo,
+                             int guiding_pass_normal,
+                             ccl_global const float *render_buffer,
+                             int render_offset,
+                             int render_stride,
+                             int render_pass_stride,
+                             int render_pass_sample_count,
+                             int render_pass_denoising_albedo,
+                             int render_pass_denoising_normal,
+                             int full_x,
+                             int full_y,
+                             int width,
+                             int height,
+                             int num_samples)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -785,10 +748,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t guiding_pixel_index = x + y * width;
-  float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
 
   const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
-  const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+  ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
 
   float pixel_scale;
   if (render_pass_sample_count == PASS_UNUSED) {
@@ -802,8 +765,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (guiding_pass_albedo != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
 
-    const float *aledo_in = buffer + render_pass_denoising_albedo;
-    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+    ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo;
+    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
 
     albedo_out[0] = aledo_in[0] * pixel_scale;
     albedo_out[1] = aledo_in[1] * pixel_scale;
@@ -814,8 +777,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (guiding_pass_normal != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
 
-    const float *normal_in = buffer + render_pass_denoising_normal;
-    float *normal_out = guiding_pixel + guiding_pass_normal;
+    ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
 
     normal_out[0] = normal_in[0] * pixel_scale;
     normal_out[1] = normal_in[1] * pixel_scale;
@@ -824,11 +787,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
-                                              int guiding_pass_stride,
-                                              int guiding_pass_albedo,
-                                              int width,
-                                              int height)
+    ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
+                             ccl_global float *guiding_buffer,
+                             int guiding_pass_stride,
+                             int guiding_pass_albedo,
+                             int width,
+                             int height)
 {
   kernel_assert(guiding_pass_albedo != PASS_UNUSED);
 
@@ -841,9 +805,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t guiding_pixel_index = x + y * width;
-  float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
 
-  float *albedo_out = guiding_pixel + guiding_pass_albedo;
+  ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
 
   albedo_out[0] = 0.5f;
   albedo_out[1] = 0.5f;
@@ -851,20 +815,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_color_postprocess(float *render_buffer,
-                                        int full_x,
-                                        int full_y,
-                                        int width,
-                                        int height,
-                                        int offset,
-                                        int stride,
-                                        int pass_stride,
-                                        int num_samples,
-                                        int pass_noisy,
-                                        int pass_denoised,
-                                        int pass_sample_count,
-                                        int num_components,
-                                        bool use_compositing)
+    ccl_gpu_kernel_signature(filter_color_postprocess,
+                             ccl_global float *render_buffer,
+                             int full_x,
+                             int full_y,
+                             int width,
+                             int height,
+                             int offset,
+                             int stride,
+                             int pass_stride,
+                             int num_samples,
+                             int pass_noisy,
+                             int pass_denoised,
+                             int pass_sample_count,
+                             int num_components,
+                             bool use_compositing)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -875,7 +840,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
-  float *buffer = render_buffer + render_pixel_index * pass_stride;
+  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
 
   float pixel_scale;
   if (pass_sample_count == PASS_UNUSED) {
@@ -885,7 +850,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     pixel_scale = __float_as_uint(buffer[pass_sample_count]);
   }
 
-  float *denoised_pixel = buffer + pass_denoised;
+  ccl_global float *denoised_pixel = buffer + pass_denoised;
 
   denoised_pixel[0] *= pixel_scale;
   denoised_pixel[1] *= pixel_scale;
@@ -898,7 +863,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     /* Currently compositing passes are either 3-component (derived by dividing light passes)
      * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
      * simplifies logic and avoids extra memory allocation. */
-    const float *noisy_pixel = buffer + pass_noisy;
+    ccl_global const float *noisy_pixel = buffer + pass_noisy;
     denoised_pixel[3] = noisy_pixel[3];
   }
   else {
@@ -914,21 +879,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
-                                                               uint *num_possible_splits)
+    ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
+                             int num_states,
+                             ccl_global uint *num_possible_splits)
 {
   const int state = ccl_gpu_global_id_x();
 
   bool can_split = false;
 
   if (state < num_states) {
-    can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+    can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
   }
 
   /* NOTE: All threads specified in the mask must execute the intrinsic. */
-  const uint can_split_mask = ccl_gpu_ballot(can_split);
+  const auto can_split_mask = ccl_gpu_ballot(can_split);
   const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
   if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+    atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask));
   }
 }
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index d7416beb783..a5320edcb3c 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+  ActiveIndexContext(int _thread_index,
+                     int _global_index,
+                     int _threadgroup_size,
+                     int _simdgroup_size,
+                     int _simd_lane_index,
+                     int _simd_group_index,
+                     int _num_simd_groups,
+                     threadgroup int *_simdgroup_offset)
+      : thread_index(_thread_index),
+        global_index(_global_index),
+        blocksize(_threadgroup_size),
+        ccl_gpu_warp_size(_simdgroup_size),
+        thread_warp(_simd_lane_index),
+        warp_index(_simd_group_index),
+        num_warps(_num_simd_groups),
+        warp_offset(_simdgroup_offset)
+  {
+  }
+
+  const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+      num_warps;
+  threadgroup int *warp_offset;
+
+  template<uint blocksizeDummy, typename IsActiveOp>
+  void active_index_array(const uint num_states,
+                          ccl_global int *indices,
+                          ccl_global int *num_indices,
+                          IsActiveOp is_active_op)
+  {
+    const uint state_index = global_index;
+#else
 template<uint blocksize, typename IsActiveOp>
 __device__ void gpu_parallel_active_index_array(const uint num_states,
-                                                int *indices,
-                                                int *num_indices,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
                                                 IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
   const uint warp_index = thread_index / ccl_gpu_warp_size;
   const uint num_warps = blocksize / ccl_gpu_warp_size;
 
-  /* Test if state corresponding to this thread is active. */
   const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
-  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif
 
-  /* For each thread within a warp compute how many other active states precede it. */
-  const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
-  const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+    /* Test if state corresponding to this thread is active. */
+    const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 
-  /* Last thread in warp stores number of active states for each warp. */
-  if (thread_warp == ccl_gpu_warp_size - 1) {
-    warp_offset[warp_index] = thread_offset + is_active;
-  }
+    /* For each thread within a warp compute how many other active states precede it. */
+    const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+                                        ccl_gpu_thread_mask(thread_warp));
 
-  ccl_gpu_syncthreads();
-
-  /* Last thread in block converts per-warp sizes to offsets, increments global size of
-   * index array and gets offset to write to. */
-  if (thread_index == blocksize - 1) {
-    /* TODO: parallelize this. */
-    int offset = 0;
-    for (int i = 0; i < num_warps; i++) {
-      int num_active = warp_offset[i];
-      warp_offset[i] = offset;
-      offset += num_active;
+    /* Last thread in warp stores number of active states for each warp. */
+    if (thread_warp == ccl_gpu_warp_size - 1) {
+      warp_offset[warp_index] = thread_offset + is_active;
     }
 
-    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
-    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
-  }
+    ccl_gpu_syncthreads();
+
+    /* Last thread in block converts per-warp sizes to offsets, increments global size of
+     * index array and gets offset to write to. */
+    if (thread_index == blocksize - 1) {
+      /* TODO: parallelize this. */
+      int offset = 0;
+      for (int i = 0; i < num_warps; i++) {
+        int num_active = warp_offset[i];
+        warp_offset[i] = offset;
+        offset += num_active;
+      }
+
+      const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+      warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+    }
 
-  ccl_gpu_syncthreads();
+    ccl_gpu_syncthreads();
 
-  /* Write to index array. */
-  if (is_active) {
-    const uint block_offset = warp_offset[num_warps];
-    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+    /* Write to index array. */
+    if (is_active) {
+      const uint block_offset = warp_offset[num_warps];
+      indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+    }
   }
-}
+
+#ifdef __KERNEL_METAL__
+}; /* end struct ActiveIndexContext */
+
+/* Inject the required thread parameters into a struct, and redirect to its templated member
+ * function. */
+#  define gpu_parallel_active_index_array \
+    ActiveIndexContext(metal_local_id, \
+                       metal_global_id, \
+                       metal_local_size, \
+                       simdgroup_size, \
+                       simd_lane_index, \
+                       simd_group_index, \
+                       num_simd_groups, \
+                       simdgroup_offset) \
+        .active_index_array
+#endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index 6de3a022569..4bd002c27e4 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
 #endif
 
-template<uint blocksize>
-__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+                                        ccl_global int *counter,
+                                        ccl_global int *prefix_sum,
+                                        const int num_values)
 {
-  if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+  if (global_id != 0) {
     return;
   }
 
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index c06d7be444f..c092e2a21ee 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN
 #endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
 
-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+                                                const uint num_states,
                                                 const int num_states_limit,
-                                                int *indices,
-                                                int *num_indices,
-                                                int *key_counter,
-                                                int *key_prefix_sum,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
+                                                ccl_global int *key_counter,
+                                                ccl_global int *key_prefix_sum,
                                                 GetKeyOp get_key_op)
 {
-  const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
   const int key = (state_index < num_states) ? get_key_op(state_index) :
                                                GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
 
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 282c3eca641..fff7a09e884 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,8 +45,9 @@ typedef unsigned long long uint64_t;
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -74,6 +75,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp)))
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -83,7 +85,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot(predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 typedef hipTextureObject_t ccl_gpu_tex_object;
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
index 2fde0d46015..7ec744d8ad2 100644
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -35,12 +35,29 @@
 
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. */
-
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
   extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                                GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                    (block_num_threads * thread_num_registers))
 
+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object whose call operator evaluates "func" (the lambda body) for an integer
+ * "state"; the additional macro arguments are declared as members holding captured state. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */
 
 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 77cea30914c..61597a4acfc 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -34,6 +34,7 @@ using namespace metal;
 
 #pragma clang diagnostic ignored "-Wunused-variable"
 #pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"
 
 /* Qualifiers */
 
@@ -42,8 +43,9 @@ using namespace metal;
 #define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device __attribute__((noinline))
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global device
-#define ccl_static_constant static constant constexpr
+#define ccl_inline_constant static constant constexpr
 #define ccl_device_constant constant
 #define ccl_constant const device
 #define ccl_gpu_shared threadgroup
@@ -58,6 +60,122 @@ using namespace metal;
 
 #define kernel_assert(cond)
 
+#define ccl_gpu_global_id_x() metal_global_id
+#define ccl_gpu_warp_size simdgroup_size
+#define ccl_gpu_thread_idx_x simd_group_index
+#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << (thread_warp)) - 1)
+
+#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup)
+
+// clang-format off
+
+/* kernel.h adapters */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
+#define ccl_gpu_kernel_threads(block_num_threads)
+
+/* Convert a comma-separated list into a semicolon-separated list
+ * (so that we can generate a struct based on kernel entry-point parameters). */
+#define FN0()
+#define FN1(p1) p1;
+#define FN2(p1, p2) p1; p2;
+#define FN3(p1, p2, p3) p1; p2; p3;
+#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
+#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
+#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
+#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
+#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
+#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
+#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
+#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
+#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
+#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
+#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
+#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
+#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
+#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
+#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
+
+/* Generate a struct containing the entry-point parameters and a "run"
+ * method which can access them implicitly via this-> */
+#define ccl_gpu_kernel_signature(name, ...) \
+struct kernel_gpu_##name \
+{ \
+  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
+  void run(thread MetalKernelContext& context, \
+           threadgroup int *simdgroup_offset, \
+           const uint metal_global_id, \
+           const ushort metal_local_id, \
+           const ushort metal_local_size, \
+           uint simdgroup_size, \
+           uint simd_lane_index, \
+           uint simd_group_index, \
+           uint num_simd_groups) ccl_global const; \
+}; \
+kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
+                                constant MetalAncillaries *_metal_ancillaries, \
+                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+                                const uint metal_global_id [[thread_position_in_grid]], \
+                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
+                                const ushort metal_local_size [[threads_per_threadgroup]], \
+                                uint simdgroup_size [[threads_per_simdgroup]], \
+                                uint simd_lane_index [[thread_index_in_simdgroup]], \
+                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
+                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
+  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
+  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+} \
+void kernel_gpu_##name::run(thread MetalKernelContext& context, \
+                  threadgroup int *simdgroup_offset, \
+                  const uint metal_global_id, \
+                  const ushort metal_local_id, \
+                  const ushort metal_local_size, \
+                  uint simdgroup_size, \
+                  uint simd_lane_index, \
+                  uint simd_group_index, \
+                  uint num_simd_groups) ccl_global const
+
+#define ccl_gpu_kernel_call(x) context.x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to specify captured state. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda \
+  { \
+    KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
+    ccl_private MetalKernelContext &context; \
+    __VA_ARGS__; \
+    int operator()(const int state) const { return (func); } \
+  } ccl_gpu_kernel_lambda_pass(context)
+
+// clang-format on
+
+/* Volumetric lambda functions: use function objects for lambda-like functionality. */
+#define VOLUME_READ_LAMBDA(function_call) \
+  struct FnObjectRead { \
+    KernelGlobals kg; \
+    ccl_private MetalKernelContext *context; \
+    int state; \
+\
+    VolumeStack operator()(const int i) const \
+    { \
+      return context->function_call; \
+    } \
+  } volume_read_lambda_pass{kg, this, state};
+
+#define VOLUME_WRITE_LAMBDA(function_call) \
+  struct FnObjectWrite { \
+    KernelGlobals kg; \
+    ccl_private MetalKernelContext *context; \
+    int state; \
+\
+    void operator()(const int i, VolumeStack entry) const \
+    { \
+      context->function_call; \
+    } \
+  } volume_write_lambda_pass{kg, this, state};
+
 /* make_type definitions with Metal style element initializers */
 #ifdef make_float2
 #  undef make_float2
@@ -112,6 +230,7 @@ using namespace metal;
 #define sinhf(x) sinh(float(x))
 #define coshf(x) cosh(float(x))
 #define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))
 
 /* Use native functions with possibly lower precision for performance,
  * no issues found so far. */
@@ -124,3 +243,43 @@ using namespace metal;
 #define logf(x) trigmode::log(float(x))
 
 #define NULL 0
+
+#define __device__
+
+/* texture bindings and sampler setup */
+
+struct Texture2DParamsMetal {
+  texture2d<float, access::sample> tex;
+};
+struct Texture3DParamsMetal {
+  texture3d<float, access::sample> tex;
+};
+
+struct MetalAncillaries {
+  device Texture2DParamsMetal *textures_2d;
+  device Texture3DParamsMetal *textures_3d;
+};
+
+#include "util/half.h"
+#include "util/types.h"
+
+enum SamplerType {
+  SamplerFilterNearest_AddressRepeat,
+  SamplerFilterNearest_AddressClampEdge,
+  SamplerFilterNearest_AddressClampZero,
+
+  SamplerFilterLinear_AddressRepeat,
+  SamplerFilterLinear_AddressClampEdge,
+  SamplerFilterLinear_AddressClampZero,
+
+  SamplerCount
+};
+
+constant constexpr array<sampler, SamplerCount> metal_samplers = {
+    sampler(address::repeat, filter::nearest),
+    sampler(address::clamp_to_edge, filter::nearest),
+    sampler(address::clamp_to_zero, filter::nearest),
+    sampler(address::repeat, filter::linear),
+    sampler(address::clamp_to_edge, filter::linear),
+    sampler(address::clamp_to_zero, filter::linear),
+};
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
new file mode 100644
index 00000000000..8c9e1c54077
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+
+/* Open the Metal kernel context class
+ * Necessary to access resource bindings */
+class MetalKernelContext {
+  public:
+    constant KernelParamsMetal &launch_params_metal;
+    constant MetalAncillaries *metal_ancillaries;
+
+    MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries)
+    : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries)
+    {}
+
+    /* texture fetch adapter functions */
+    typedef uint64_t ccl_gpu_tex_object;
+
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      kernel_assert(0);
+      return 0;
+    }
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      kernel_assert(0);
+      return 0;
+    }
+
+    // texture2d
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+    }
+
+    // texture3d
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+    }
+#    include "kernel/device/gpu/image.h"
+
+  // clang-format on
diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h
new file mode 100644
index 00000000000..e700f294440
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_end.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+}
+; /* end of MetalKernelContext class definition */
+
+/* Silently redirect into the MetalKernelContext instance */
+/* NOTE: These macros will need maintaining as entry-points change. */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state context.launch_params_metal.__integrator_state
diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h
new file mode 100644
index 00000000000..1aea36589d0
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+#include "kernel/integrator/state.h"
+
+CCL_NAMESPACE_BEGIN
+
+typedef struct KernelParamsMetal {
+
+#define KERNEL_TEX(type, name) ccl_global const type *name;
+#include "kernel/textures.h"
+#undef KERNEL_TEX
+
+  const IntegratorStateGPU __integrator_state;
+  const KernelData data;
+
+} KernelParamsMetal;
+
+typedef struct KernelGlobalsGPU {
+  int unused[1];
+} KernelGlobalsGPU;
+
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data launch_params_metal.data
+#define kernel_integrator_state launch_params_metal.__integrator_state
+
+/* data lookup defines */
+
+#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
+#define kernel_tex_array(tex) launch_params_metal.tex
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
new file mode 100644
index 00000000000..feca20ff475
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Metal kernel entry points */
+
+// clang-format off
+
+#include "kernel/device/metal/compat.h"
+#include "kernel/device/metal/globals.h"
+#include "kernel/device/gpu/kernel.h"
+
+// clang-format on
+\ No newline at end of file
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index 835e4621d47..0619c135c39 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -49,10 +49,11 @@ typedef unsigned long long uint64_t;
   __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -76,6 +77,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp)))
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -85,7 +87,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index b987aa7a817..70b977b3d84 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -21,6 +21,8 @@
 
 #include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */
 
+#include "kernel/tables.h"
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -29,9 +31,11 @@
 #include "kernel/integrator/intersect_shadow.h"
 #include "kernel/integrator/intersect_subsurface.h"
 #include "kernel/integrator/intersect_volume_stack.h"
-
 // clang-format on
 
+#define OPTIX_DEFINE_ABI_VERSION_ONLY
+#include <optix_function_table.h>
+
 template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
 {
   return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
@@ -44,7 +48,7 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
 ccl_device_forceinline int get_object_id()
 {
 #ifdef __OBJECT_MOTION__
-  /* Always get the the instance ID from the TLAS
+  /* Always get the instance ID from the TLAS
    * There might be a motion transform node between TLAS and BLAS which does not have one. */
   return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
 #else
@@ -159,9 +163,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
 
   /* Record geometric normal. */
   const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
-  const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
-  const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0);
+  const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1);
+  const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
   local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
 
   /* Continue tracing (without this the trace call would return after the first hit). */
@@ -198,10 +202,12 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
     type = segment.type;
     prim = segment.prim;
 
+#    if OPTIX_ABI_VERSION < 55
     /* Filter out curve endcaps. */
     if (u == 0.0f || u == 1.0f) {
       return optixIgnoreIntersection();
     }
+#    endif
   }
 #  endif
 
@@ -308,6 +314,7 @@ extern "C" __global__ void __anyhit__kernel_optix_volume_test()
 extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
 {
 #ifdef __HAIR__
+#  if OPTIX_ABI_VERSION < 55
   if (!optixIsTriangleHit()) {
     /* Filter out curve endcaps. */
     const float u = __uint_as_float(optixGetAttribute_0());
@@ -315,6 +322,7 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
       return optixIgnoreIntersection();
     }
   }
+#  endif
 #endif
 
 #ifdef __VISIBILITY_FLAG__
diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/accumulate.h
index d66d7d6fb70..fb52b1cd05f 100644
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -151,7 +151,8 @@ ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
 ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
                                           ConstIntegratorState state,
                                           ccl_global float *ccl_restrict render_buffer,
-                                          int sample)
+                                          int sample,
+                                          int sample_offset)
 {
   if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
     return sample;
@@ -159,7 +160,9 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
 
   ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
 
-  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
+  return atomic_fetch_and_add_uint32(
+             (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+         sample_offset;
 }
 
 ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
@@ -550,7 +553,7 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
                                                const bool is_transparent_background_ray,
                                                ccl_global float *ccl_restrict render_buffer)
 {
-  float3 contribution = INTEGRATOR_STATE(state, path, throughput) * L;
+  float3 contribution = float3(INTEGRATOR_STATE(state, path, throughput)) * L;
   kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
 
   ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
diff --git a/intern/cycles/kernel/geom/attribute.h b/intern/cycles/kernel/geom/attribute.h
index 848e0430caa..a7ac2bd926f 100644
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
  * Lookup of attributes is different between OSL and SVM, as OSL is ustring
  * based while for SVM we use integer ids. */
 
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
+/* Patch index for triangle, -1 if not subdivision triangle */
+
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+}
 
 ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
@@ -106,9 +111,9 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals kg,
 {
   Transform tfm;
 
-  tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0);
-  tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1);
-  tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2);
+  tfm.x = kernel_tex_fetch(__attributes_float4, desc.offset + 0);
+  tfm.y = kernel_tex_fetch(__attributes_float4, desc.offset + 1);
+  tfm.z = kernel_tex_fetch(__attributes_float4, desc.offset + 2);
 
   return tfm;
 }
diff --git a/intern/cycles/kernel/geom/curve.h b/intern/cycles/kernel/geom/curve.h
index 7271193eef8..4b6eecf9640 100644
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@@ -126,8 +126,8 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
     int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
     int k1 = k0 + 1;
 
-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
 
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -149,7 +149,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                 desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
     }
     else {
       return make_float3(0.0f, 0.0f, 0.0f);
@@ -168,8 +168,8 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
     int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
     int k1 = k0 + 1;
 
-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + k0);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + k1);
 
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -191,7 +191,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                 desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
     }
     else {
       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/motion_curve.h b/intern/cycles/kernel/geom/motion_curve.h
index 2dd213d43f6..8358c94360f 100644
--- a/intern/cycles/kernel/geom/motion_curve.h
+++ b/intern/cycles/kernel/geom/motion_curve.h
@@ -48,8 +48,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg,
 
     offset += step * numkeys;
 
-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
   }
 }
 
@@ -106,10 +106,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg,
 
     offset += step * numkeys;
 
-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
-    keys[2] = kernel_tex_fetch(__attributes_float3, offset + k2);
-    keys[3] = kernel_tex_fetch(__attributes_float3, offset + k3);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
+    keys[2] = kernel_tex_fetch(__attributes_float4, offset + k2);
+    keys[3] = kernel_tex_fetch(__attributes_float4, offset + k3);
   }
 }
 
diff --git a/intern/cycles/kernel/geom/motion_triangle.h b/intern/cycles/kernel/geom/motion_triangle.h
index 43f894938e0..62b7b630c89 100644
--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@@ -43,9 +43,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 {
   if (step == numsteps) {
     /* center step: regular vertex location */
-    verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+    verts[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+    verts[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+    verts[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
   }
   else {
     /* center step not store in this array */
@@ -54,9 +54,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 
     offset += step * numverts;
 
-    verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    verts[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    verts[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    verts[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
   }
 }
 
@@ -70,9 +70,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
 {
   if (step == numsteps) {
     /* center step: regular vertex location */
-    normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
   }
   else {
     /* center step is not stored in this array */
@@ -81,9 +81,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
 
     offset += step * numverts;
 
-    normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
   }
 }
 
diff --git a/intern/cycles/kernel/geom/motion_triangle_intersect.h b/intern/cycles/kernel/geom/motion_triangle_intersect.h
index 256e7add21e..72ad237eeeb 100644
--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -163,19 +163,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
   motion_triangle_vertices(kg, fobject, prim, time, verts);
   /* Ray-triangle intersection, unoptimized. */
   float t, u, v;
-  if (ray_triangle_intersect(P,
-                             dir,
-                             tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                             (ssef *)verts,
-#else
-                             verts[0],
-                             verts[1],
-                             verts[2],
-#endif
-                             &u,
-                             &v,
-                             &t)) {
+  if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
     /* Visibility flag test. we do it here under the assumption
      * that most triangles are culled by node flags.
@@ -229,19 +217,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
   motion_triangle_vertices(kg, local_object, prim, time, verts);
   /* Ray-triangle intersection, unoptimized. */
   float t, u, v;
-  if (!ray_triangle_intersect(P,
-                              dir,
-                              tmax,
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                              (ssef *)verts,
-#  else
-                              verts[0],
-                              verts[1],
-                              verts[2],
-#  endif
-                              &u,
-                              &v,
-                              &t)) {
+  if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
     return false;
   }
 
diff --git a/intern/cycles/kernel/geom/patch.h b/intern/cycles/kernel/geom/patch.h
index 7d24937a41e..432618aa243 100644
--- a/intern/cycles/kernel/geom/patch.h
+++ b/intern/cycles/kernel/geom/patch.h
@@ -380,7 +380,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals kg,
     *dv = make_float3(0.0f, 0.0f, 0.0f);
 
   for (int i = 0; i < num_control; i++) {
-    float3 v = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + indices[i]));
+    float3 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
 
     val += v * weights[i];
     if (du)
@@ -417,7 +417,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals kg,
     *dv = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
   for (int i = 0; i < num_control; i++) {
-    float4 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
+    float4 v = kernel_tex_fetch(__attributes_float4, offset + indices[i]);
 
     val += v * weights[i];
     if (du)
diff --git a/intern/cycles/kernel/geom/primitive.h b/intern/cycles/kernel/geom/primitive.h
index 7a8921b6d6e..6d7b550d82f 100644
--- a/intern/cycles/kernel/geom/primitive.h
+++ b/intern/cycles/kernel/geom/primitive.h
@@ -284,18 +284,33 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
     int numverts, numkeys;
     object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
 
-    /* lookup attributes */
-    motion_pre = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
-    desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE) ? numverts : numkeys;
-    motion_post = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
 #ifdef __HAIR__
-    if (is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
-      object_position_transform(kg, sd, &motion_pre);
-      object_position_transform(kg, sd, &motion_post);
+    if (is_curve_primitive) {
+      motion_pre = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+      desc.offset += numkeys;
+      motion_post = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+
+      /* Curve: without vertex motion, pass both positions through object_position_transform(). */
+      if ((sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+        object_position_transform(kg, sd, &motion_pre);
+        object_position_transform(kg, sd, &motion_post);
+      }
     }
+    else
 #endif
+        if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+      /* Triangle */
+      if (subd_triangle_patch(kg, sd) == ~0) {
+        motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+      else {
+        motion_pre = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+    }
   }
 
   /* object motion. note that depending on the mesh having motion vectors, this
diff --git a/intern/cycles/kernel/geom/subd_triangle.h b/intern/cycles/kernel/geom/subd_triangle.h
index 8a9a3f71231..1b693a915bf 100644
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -20,13 +20,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Patch index for triangle, -1 if not subdivision triangle */
-
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
-}
-
 /* UV coords of triangle within patch */
 
 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
@@ -443,8 +436,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
     if (dy)
       *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-    return float4_to_float3(
-        kernel_tex_fetch(__attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch)));
+    return kernel_tex_fetch(__attributes_float3,
+                            desc.offset + subd_triangle_patch_face(kg, patch));
   }
   else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
     float2 uv[3];
@@ -452,10 +445,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
 
     uint4 v = subd_triangle_patch_indices(kg, patch);
 
-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.x));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.y));
-    float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.z));
-    float3 f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.w));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
+    float3 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
+    float3 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -484,10 +477,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
 
     float3 f0, f1, f2, f3;
 
-    f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
-    f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
-    f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
-    f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
+    f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
+    f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
+    f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
+    f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -513,7 +506,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
     if (dy)
       *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-    return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+    return kernel_tex_fetch(__attributes_float3, desc.offset);
   }
   else {
     if (dx)
@@ -590,7 +583,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
     if (dy)
       *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
-    return kernel_tex_fetch(__attributes_float3,
+    return kernel_tex_fetch(__attributes_float4,
                             desc.offset + subd_triangle_patch_face(kg, patch));
   }
   else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
@@ -599,10 +592,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
 
     uint4 v = subd_triangle_patch_indices(kg, patch);
 
-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
-    float4 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
-    float4 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + v.x);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + v.y);
+    float4 f2 = kernel_tex_fetch(__attributes_float4, desc.offset + v.z);
+    float4 f3 = kernel_tex_fetch(__attributes_float4, desc.offset + v.w);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -642,10 +635,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
           color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset)));
     }
     else {
-      f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
-      f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
-      f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
-      f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
+      f0 = kernel_tex_fetch(__attributes_float4, corners[0] + desc.offset);
+      f1 = kernel_tex_fetch(__attributes_float4, corners[1] + desc.offset);
+      f2 = kernel_tex_fetch(__attributes_float4, corners[2] + desc.offset);
+      f3 = kernel_tex_fetch(__attributes_float4, corners[3] + desc.offset);
     }
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
@@ -672,7 +665,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
     if (dy)
       *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 
-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
   }
   else {
     if (dx)
diff --git a/intern/cycles/kernel/geom/triangle.h b/intern/cycles/kernel/geom/triangle.h
index 233e901c7ca..854022b3369 100644
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@@ -29,9 +29,9 @@ ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderDat
 {
   /* load triangle vertices */
   const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-  const float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
 
   /* return normal */
   if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
@@ -54,9 +54,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 {
   /* load triangle vertices */
   const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
   /* compute point */
   float t = 1.0f - u - v;
   *P = (u * v0 + v * v1 + t * v2);
@@ -78,9 +78,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3])
 {
   const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
 }
 
 /* Triangle vertex locations and vertex normals */
@@ -91,12 +91,12 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg,
                                                      float3 N[3])
 {
   const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
-  N[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  N[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  N[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  N[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  N[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  N[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
 }
 
 /* Interpolate smooth vertex normal from vertices */
@@ -106,9 +106,9 @@ triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
 {
   /* load triangle vertices */
   const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
 
   float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1);
 
@@ -120,9 +120,9 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
 {
   /* load triangle vertices */
   const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
 
   /* ensure that the normals are in object space */
   if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
@@ -145,9 +145,9 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
 {
   /* fetch triangle vertex coordinates */
   const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  const float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 p0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 p1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 p2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
 
   /* compute derivatives of P w.r.t. uv */
   *dPdu = (p0 - p2);
@@ -267,15 +267,15 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
 
     if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
       const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
+      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
     }
     else {
       const int tri = desc.offset + sd->prim * 3;
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+      f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
+      f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
+      f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
     }
 
 #ifdef __RAY_DIFFERENTIALS__
@@ -298,7 +298,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
     }
     else {
       return make_float3(0.0f, 0.0f, 0.0f);
@@ -318,16 +318,16 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
 
     if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
       const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+      f0 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.z);
     }
     else {
       const int tri = desc.offset + sd->prim * 3;
       if (desc.element == ATTR_ELEMENT_CORNER) {
-        f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
-        f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
-        f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
+        f0 = kernel_tex_fetch(__attributes_float4, tri + 0);
+        f1 = kernel_tex_fetch(__attributes_float4, tri + 1);
+        f2 = kernel_tex_fetch(__attributes_float4, tri + 2);
       }
       else {
         f0 = color_srgb_to_linear_v4(
@@ -359,7 +359,7 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
     }
     else {
       return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/triangle_intersect.h b/intern/cycles/kernel/geom/triangle_intersect.h
index faff8a85a93..57a6ae7fe72 100644
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -37,27 +37,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
 {
   const int prim = kernel_tex_fetch(__prim_index, prim_addr);
   const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-#else
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
                tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
                tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-#endif
   float t, u, v;
-  if (ray_triangle_intersect(P,
-                             dir,
-                             tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                             ssef_verts,
-#else
-                             float4_to_float3(tri_a),
-                             float4_to_float3(tri_b),
-                             float4_to_float3(tri_c),
-#endif
-                             &u,
-                             &v,
-                             &t)) {
+  if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
     /* Visibility flag test. we do it here under the assumption
      * that most triangles are culled by node flags.
@@ -106,27 +90,11 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
 
   const int prim = kernel_tex_fetch(__prim_index, prim_addr);
   const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-#  else
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-#  endif
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
   float t, u, v;
-  if (!ray_triangle_intersect(P,
-                              dir,
-                              tmax,
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                              ssef_verts,
-#  else
-                              tri_a,
-                              tri_b,
-                              tri_c,
-#  endif
-                              &u,
-                              &v,
-                              &t)) {
+  if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
     return false;
   }
 
@@ -178,11 +146,6 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
   isect->t = t;
 
   /* Record geometric normal. */
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-#  endif
   local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
 
   return false;
@@ -223,9 +186,9 @@ ccl_device_inline float3 triangle_refine(KernelGlobals kg,
   P = P + D * t;
 
   const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
   float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
   float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
   float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
@@ -280,9 +243,9 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals kg,
 
 #  ifdef __INTERSECTION_REFINE__
   const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
   float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
   float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
   float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
diff --git a/intern/cycles/kernel/geom/volume.h b/intern/cycles/kernel/geom/volume.h
index 4e83ad6acb3..387eb2646da 100644
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@@ -75,7 +75,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
                                           const AttributeDescriptor desc)
 {
   if (desc.element & (ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
   }
   else if (desc.element == ATTR_ELEMENT_VOXEL) {
     /* todo: optimize this so we don't have to transform both here and in
diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h
index dbaf02836e4..f4a2fbea405 100644
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -65,7 +65,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   }
 
   /* Always count the sample, even if the camera sample will reject the ray. */
-  const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+  const int sample = kernel_accum_sample(
+      kg, state, render_buffer, scheduled_sample, tile->sample_offset);
 
   /* Setup render buffers. */
   const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
diff --git a/intern/cycles/kernel/integrator/init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h
index f0ba77bd9a6..59dd1a9fa75 100644
--- a/intern/cycles/kernel/integrator/init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -89,7 +89,8 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
    * This logic allows to both count actual number of samples per pixel, and to add samples to this
    * pixel after it was converged and samples were added somewhere else (in which case the
    * `scheduled_sample` will be different from actual number of samples in this pixel). */
-  const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+  const int sample = kernel_accum_sample(
+      kg, state, render_buffer, scheduled_sample, tile->sample_offset);
 
   /* Initialize random number seed for path. */
   const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h
index 31452de1ca4..a8ebbe908ae 100644
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -20,7 +20,6 @@
 #include "kernel/integrator/shader_eval.h"
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
-#include "kernel/sample/mis.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -81,8 +80,7 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
     /* multiple importance sampling, get background light pdf for ray
      * direction, and compute weight with respect to BSDF pdf */
     const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
-    const float mis_weight = power_heuristic(mis_ray_pdf, pdf);
-
+    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
     L *= mis_weight;
   }
 #  endif
@@ -169,7 +167,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
         /* multiple importance sampling, get regular light pdf,
          * and compute weight with respect to BSDF pdf */
         const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-        const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+        const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
         light_eval *= mis_weight;
       }
 
diff --git a/intern/cycles/kernel/integrator/shade_light.h b/intern/cycles/kernel/integrator/shade_light.h
index 5abe9e98abc..97ca430752c 100644
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -84,7 +84,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
     /* multiple importance sampling, get regular light pdf,
      * and compute weight with respect to BSDF pdf */
     const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
     light_eval *= mis_weight;
   }
 
diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h
index 1de890aae29..a68fcaa7a64 100644
--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -95,8 +95,8 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
 
   shader_setup_from_volume(kg, shadow_sd, &ray);
 
-  const float step_size = volume_stack_step_size(
-      kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i));
+  const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);
 
   volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size);
 }
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 2793dd3e218..c9c586f5ae4 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -27,8 +27,6 @@
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
 
-#include "kernel/sample/mis.h"
-
 CCL_NAMESPACE_BEGIN
 
 ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
@@ -95,8 +93,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
     /* Multiple importance sampling, get triangle light pdf,
      * and compute weight with respect to BSDF pdf. */
     float pdf = triangle_light_pdf(kg, sd, t);
-    float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
+    float mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
     L *= mis_weight;
   }
 
@@ -155,7 +152,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);
 
   if (ls.shader & SHADER_USE_MIS) {
-    const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+    const float mis_weight = light_sample_mis_weight_nee(kg, ls.pdf, bsdf_pdf);
     bsdf_eval_mul(&bsdf_eval, mis_weight);
   }
 
@@ -195,12 +192,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 pass_diffuse_weight = (bounce == 0) ?
-                                           bsdf_eval_pass_diffuse_weight(&bsdf_eval) :
-                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
-    const float3 pass_glossy_weight = (bounce == 0) ?
-                                          bsdf_eval_pass_glossy_weight(&bsdf_eval) :
-                                          INTEGRATOR_STATE(state, path, pass_glossy_weight);
+    const packed_float3 pass_diffuse_weight =
+        (bounce == 0) ? packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval)) :
+                        INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    const packed_float3 pass_glossy_weight = (bounce == 0) ?
+                                                 packed_float3(
+                                                     bsdf_eval_pass_glossy_weight(&bsdf_eval)) :
+                                                 INTEGRATOR_STATE(state, path, pass_glossy_weight);
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
   }
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index cc47557d580..712c22357b8 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -27,8 +27,6 @@
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
 
-#include "kernel/sample/mis.h"
-
 CCL_NAMESPACE_BEGIN
 
 #ifdef __VOLUME__
@@ -78,9 +76,8 @@ ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
                                                    ccl_private ShaderData *ccl_restrict sd,
                                                    ccl_private float3 *ccl_restrict extinction)
 {
-  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, [=](const int i) {
-    return integrator_state_read_shadow_volume_stack(state, i);
-  });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
+  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);
 
   if (!(sd->flag & SD_EXTINCTION)) {
     return false;
@@ -98,9 +95,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
                                             ccl_private VolumeShaderCoefficients *coeff)
 {
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-  shader_eval_volume<false>(kg, state, sd, path_flag, [=](const int i) {
-    return integrator_state_read_volume_stack(state, i);
-  });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  shader_eval_volume<false>(kg, state, sd, path_flag, volume_read_lambda_pass);
 
   if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
     return false;
@@ -772,7 +768,7 @@ ccl_device_forceinline void integrate_volume_direct_light(
   const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
 
   if (ls->shader & SHADER_USE_MIS) {
-    float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+    float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf);
     bsdf_eval_mul(&phase_eval, mis_weight);
   }
 
@@ -805,9 +801,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
   const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 pass_diffuse_weight = (bounce == 0) ?
-                                           one_float3() :
-                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    const packed_float3 pass_diffuse_weight = (bounce == 0) ?
+                                                  packed_float3(one_float3()) :
+                                                  INTEGRATOR_STATE(
+                                                      state, path, pass_diffuse_weight);
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
   }
@@ -932,8 +929,8 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
                                                 VOLUME_SAMPLE_DISTANCE;
 
   /* Step through volume. */
-  const float step_size = volume_stack_step_size(
-      kg, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);
 
   /* TODO: expensive to zero closures? */
   VolumeIntegrateResult result = {};
diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h
index 667ab88c8c4..625a429d3db 100644
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -40,15 +40,15 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T
 /* enum PathRayFlag */
 KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput for shadow pass. */
 KERNEL_STRUCT_MEMBER(shadow_path,
-                     float3,
+                     packed_float3,
                      unshadowed_throughput,
                      KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Number of intersections found by ray-tracing. */
 KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_END(shadow_path)
@@ -56,8 +56,8 @@ KERNEL_STRUCT_END(shadow_path)
 /********************************** Shadow Ray *******************************/
 
 KERNEL_STRUCT_BEGIN(shadow_ray)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h
index 3299f973713..bd18a7498a3 100644
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -59,12 +59,12 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 /* Continuation probability for path termination. */
 KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Denoising. */
-KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
 /* Shader sorting. */
 /* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
 KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
@@ -73,8 +73,8 @@ KERNEL_STRUCT_END(path)
 /************************************** Ray ***********************************/
 
 KERNEL_STRUCT_BEGIN(ray)
-KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
@@ -96,10 +96,10 @@ KERNEL_STRUCT_END(isect)
 /*************** Subsurface closure state for subsurface kernel ***************/
 
 KERNEL_STRUCT_BEGIN(subsurface)
-KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, Ng, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_END(subsurface)
 
 /********************************** Volume Stack ******************************/
diff --git a/intern/cycles/kernel/integrator/volume_stack.h b/intern/cycles/kernel/integrator/volume_stack.h
index cf69826ffff..ea3fa901e2d 100644
--- a/intern/cycles/kernel/integrator/volume_stack.h
+++ b/intern/cycles/kernel/integrator/volume_stack.h
@@ -18,6 +18,14 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Default volume stack lambdas; `function_call` may use `i` and, when writing, `entry`. */
+#ifndef VOLUME_READ_LAMBDA /* Both defaults are skipped if this one is already defined. */
+#  define VOLUME_READ_LAMBDA(function_call) \
+    auto volume_read_lambda_pass = [=](const int i) { return function_call; };
+#  define VOLUME_WRITE_LAMBDA(function_call) \
+    auto volume_write_lambda_pass = [=](const int i, VolumeStack entry) { function_call; };
+#endif /* Each macro declares a local `volume_{read,write}_lambda_pass`, `;` included. */
+
 /* Volume Stack
  *
  * This is an array of object/shared ID's that the current segment of the path
@@ -88,26 +96,18 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
                                         IntegratorState state,
                                         ccl_private const ShaderData *sd)
 {
-  volume_stack_enter_exit(
-      kg,
-      sd,
-      [=](const int i) { return integrator_state_read_volume_stack(state, i); },
-      [=](const int i, const VolumeStack entry) {
-        integrator_state_write_volume_stack(state, i, entry);
-      });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  VOLUME_WRITE_LAMBDA(integrator_state_write_volume_stack(state, i, entry))
+  volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass);
 }
 
 ccl_device void shadow_volume_stack_enter_exit(KernelGlobals kg,
                                                IntegratorShadowState state,
                                                ccl_private const ShaderData *sd)
 {
-  volume_stack_enter_exit(
-      kg,
-      sd,
-      [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); },
-      [=](const int i, const VolumeStack entry) {
-        integrator_state_write_shadow_volume_stack(state, i, entry);
-      });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
+  VOLUME_WRITE_LAMBDA(integrator_state_write_shadow_volume_stack(state, i, entry))
+  volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass);
 }
 
 /* Clean stack after the last bounce.
diff --git a/intern/cycles/kernel/light/light.h b/intern/cycles/kernel/light/light.h
index 97dca936552..e0a9f1c57f5 100644
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -676,19 +676,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg,
     ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);
 
     /* calculate intersection with the planar triangle */
-    if (!ray_triangle_intersect(P,
-                                ls->D,
-                                FLT_MAX,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                                (ssef *)V,
-#else
-                                V[0],
-                                V[1],
-                                V[2],
-#endif
-                                &ls->u,
-                                &ls->v,
-                                &ls->t)) {
+    if (!ray_triangle_intersect(P, ls->D, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
       ls->pdf = 0.0f;
       return;
     }
diff --git a/intern/cycles/kernel/light/sample.h b/intern/cycles/kernel/light/sample.h
index 6b643a95250..ff5d43ed8cd 100644
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -22,6 +22,7 @@
 #include "kernel/light/light.h"
 
 #include "kernel/sample/mapping.h"
+#include "kernel/sample/mis.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -268,4 +269,36 @@ ccl_device_inline void light_sample_to_volume_shadow_ray(
   shadow_ray_setup(sd, ls, P, ray);
 }
 
+ccl_device_inline float light_sample_mis_weight_forward(KernelGlobals kg,
+                                                        const float forward_pdf,
+                                                        const float nee_pdf)
+{ /* MIS weight for light reached by forward sampling; callers pass the BSDF/ray pdf first. */
+#ifdef WITH_CYCLES_DEBUG /* Debug only: the integrator setting can force a single strategy. */
+  if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) {
+    return 1.0f; /* Forward sampling only: keep the full forward contribution. */
+  }
+  else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) {
+    return 0.0f; /* NEE only: discard the forward contribution. */
+  }
+  else
+#endif
+    return power_heuristic(forward_pdf, nee_pdf); /* Only path when the debug block is off. */
+}
+
+ccl_device_inline float light_sample_mis_weight_nee(KernelGlobals kg,
+                                                    const float nee_pdf,
+                                                    const float forward_pdf)
+{ /* MIS weight for a direct light (NEE) sample; callers pass the light sample pdf first. */
+#ifdef WITH_CYCLES_DEBUG /* Debug only: the integrator setting can force a single strategy. */
+  if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) {
+    return 0.0f; /* Forward sampling only: discard the NEE contribution. */
+  }
+  else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) {
+    return 1.0f; /* NEE only: keep the full NEE contribution. */
+  }
+  else
+#endif
+    return power_heuristic(nee_pdf, forward_pdf); /* Only path when the debug block is off. */
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index f226c95766f..90e16bd70d4 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -55,7 +55,7 @@ if(APPLE)
   # Disable allocation warning on macOS prior to 10.14: the OSLRenderServices
   # contains member which is 64 bytes aligned (cache inside of OIIO's
   # unordered_map_concurrent). This is not something what the SDK supportsm, but
-  # since we take care of allocations ourselves is is OK to ignore the
+  # since we take care of allocations ourselves it is OK to ignore the
   # diagnostic message.
   string(APPEND CMAKE_CXX_FLAGS " -faligned-allocation")
 endif()
diff --git a/intern/cycles/kernel/sample/lcg.h b/intern/cycles/kernel/sample/lcg.h
index 92cfff639b4..e8c4915813e 100644
--- a/intern/cycles/kernel/sample/lcg.h
+++ b/intern/cycles/kernel/sample/lcg.h
@@ -19,14 +19,16 @@ CCL_NAMESPACE_BEGIN
 
 /* Linear Congruential Generator */
 
-ccl_device uint lcg_step_uint(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device uint lcg_step_uint(T rng)
 {
   /* implicit mod 2^32 */
   *rng = (1103515245 * (*rng) + 12345);
   return *rng;
 }
 
-ccl_device float lcg_step_float(uint *rng)
+/* This is templated to handle multiple address spaces on Metal. */
+template<class T> ccl_device float lcg_step_float(T rng)
 {
   /* implicit mod 2^32 */
   *rng = (1103515245 * (*rng) + 12345);
diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h
index 0c27992c7f6..adc8493badd 100644
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -163,18 +163,7 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
     /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
      * We can use this to get divide sample sequence into two classes for easier variance
      * estimation. */
-#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
-    return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
-#elif defined(__NVCC__)
-    return __popc(sample & 0xaaaaaaaa) & 1;
-#else
-    /* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
-    int i = sample & 0xaaaaaaaa;
-    i = i - ((i >> 1) & 0x55555555);
-    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
-    i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
-    return i & 1;
-#endif
+    return popcount(uint(sample) & 0xaaaaaaaa) & 1;
   }
   else {
     /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
diff --git a/intern/cycles/kernel/svm/math_util.h b/intern/cycles/kernel/svm/math_util.h
index b2e539cdd1f..20817cd0fd3 100644
--- a/intern/cycles/kernel/svm/math_util.h
+++ b/intern/cycles/kernel/svm/math_util.h
@@ -212,33 +212,6 @@ ccl_device float3 svm_math_blackbody_color(float t)
    * which is enough to get the same 8 bit/channel color.
    */
 
-  const float blackbody_table_r[6][3] = {
-      {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f},
-      {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f},
-      {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f},
-      {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f},
-      {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f},
-      {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f},
-  };
-
-  const float blackbody_table_g[6][3] = {
-      {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f},
-      {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f},
-      {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f},
-      {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f},
-      {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f},
-      {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f},
-  };
-
-  const float blackbody_table_b[6][4] = {
-      {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */
-      {0.0f, 0.0f, 0.0f, 0.0f},
-      {0.0f, 0.0f, 0.0f, 0.0f},
-      {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f},
-      {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f},
-      {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f},
-  };
-
   if (t >= 12000.0f) {
     return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
   }
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 62ba5bf04e3..ce32e1a520f 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -220,7 +220,7 @@ CCL_NAMESPACE_BEGIN
 template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
 ccl_device void svm_eval_nodes(KernelGlobals kg,
                                ConstIntegratorGenericState state,
-                               ShaderData *sd,
+                               ccl_private ShaderData *sd,
                                ccl_global float *render_buffer,
                                uint32_t path_flag)
 {
diff --git a/intern/cycles/kernel/svm/wavelength.h b/intern/cycles/kernel/svm/wavelength.h
index 28fd172abc7..6e25224243f 100644
--- a/intern/cycles/kernel/svm/wavelength.h
+++ b/intern/cycles/kernel/svm/wavelength.h
@@ -42,41 +42,6 @@ ccl_device_noinline void svm_node_wavelength(KernelGlobals kg,
                                              uint wavelength,
                                              uint color_out)
 {
-  // CIE colour matching functions xBar, yBar, and zBar for
-  //   wavelengths from 380 through 780 nanometers, every 5
-  //   nanometers.  For a wavelength lambda in this range:
-  //        cie_colour_match[(lambda - 380) / 5][0] = xBar
-  //        cie_colour_match[(lambda - 380) / 5][1] = yBar
-  //        cie_colour_match[(lambda - 380) / 5][2] = zBar
-  const float cie_colour_match[81][3] = {
-      {0.0014f, 0.0000f, 0.0065f}, {0.0022f, 0.0001f, 0.0105f}, {0.0042f, 0.0001f, 0.0201f},
-      {0.0076f, 0.0002f, 0.0362f}, {0.0143f, 0.0004f, 0.0679f}, {0.0232f, 0.0006f, 0.1102f},
-      {0.0435f, 0.0012f, 0.2074f}, {0.0776f, 0.0022f, 0.3713f}, {0.1344f, 0.0040f, 0.6456f},
-      {0.2148f, 0.0073f, 1.0391f}, {0.2839f, 0.0116f, 1.3856f}, {0.3285f, 0.0168f, 1.6230f},
-      {0.3483f, 0.0230f, 1.7471f}, {0.3481f, 0.0298f, 1.7826f}, {0.3362f, 0.0380f, 1.7721f},
-      {0.3187f, 0.0480f, 1.7441f}, {0.2908f, 0.0600f, 1.6692f}, {0.2511f, 0.0739f, 1.5281f},
-      {0.1954f, 0.0910f, 1.2876f}, {0.1421f, 0.1126f, 1.0419f}, {0.0956f, 0.1390f, 0.8130f},
-      {0.0580f, 0.1693f, 0.6162f}, {0.0320f, 0.2080f, 0.4652f}, {0.0147f, 0.2586f, 0.3533f},
-      {0.0049f, 0.3230f, 0.2720f}, {0.0024f, 0.4073f, 0.2123f}, {0.0093f, 0.5030f, 0.1582f},
-      {0.0291f, 0.6082f, 0.1117f}, {0.0633f, 0.7100f, 0.0782f}, {0.1096f, 0.7932f, 0.0573f},
-      {0.1655f, 0.8620f, 0.0422f}, {0.2257f, 0.9149f, 0.0298f}, {0.2904f, 0.9540f, 0.0203f},
-      {0.3597f, 0.9803f, 0.0134f}, {0.4334f, 0.9950f, 0.0087f}, {0.5121f, 1.0000f, 0.0057f},
-      {0.5945f, 0.9950f, 0.0039f}, {0.6784f, 0.9786f, 0.0027f}, {0.7621f, 0.9520f, 0.0021f},
-      {0.8425f, 0.9154f, 0.0018f}, {0.9163f, 0.8700f, 0.0017f}, {0.9786f, 0.8163f, 0.0014f},
-      {1.0263f, 0.7570f, 0.0011f}, {1.0567f, 0.6949f, 0.0010f}, {1.0622f, 0.6310f, 0.0008f},
-      {1.0456f, 0.5668f, 0.0006f}, {1.0026f, 0.5030f, 0.0003f}, {0.9384f, 0.4412f, 0.0002f},
-      {0.8544f, 0.3810f, 0.0002f}, {0.7514f, 0.3210f, 0.0001f}, {0.6424f, 0.2650f, 0.0000f},
-      {0.5419f, 0.2170f, 0.0000f}, {0.4479f, 0.1750f, 0.0000f}, {0.3608f, 0.1382f, 0.0000f},
-      {0.2835f, 0.1070f, 0.0000f}, {0.2187f, 0.0816f, 0.0000f}, {0.1649f, 0.0610f, 0.0000f},
-      {0.1212f, 0.0446f, 0.0000f}, {0.0874f, 0.0320f, 0.0000f}, {0.0636f, 0.0232f, 0.0000f},
-      {0.0468f, 0.0170f, 0.0000f}, {0.0329f, 0.0119f, 0.0000f}, {0.0227f, 0.0082f, 0.0000f},
-      {0.0158f, 0.0057f, 0.0000f}, {0.0114f, 0.0041f, 0.0000f}, {0.0081f, 0.0029f, 0.0000f},
-      {0.0058f, 0.0021f, 0.0000f}, {0.0041f, 0.0015f, 0.0000f}, {0.0029f, 0.0010f, 0.0000f},
-      {0.0020f, 0.0007f, 0.0000f}, {0.0014f, 0.0005f, 0.0000f}, {0.0010f, 0.0004f, 0.0000f},
-      {0.0007f, 0.0002f, 0.0000f}, {0.0005f, 0.0002f, 0.0000f}, {0.0003f, 0.0001f, 0.0000f},
-      {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
-      {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}};
-
   float lambda_nm = stack_load_float(stack, wavelength);
   float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f);  // scaled 0..80
   int i = float_to_int(ii);
diff --git a/intern/cycles/kernel/tables.h b/intern/cycles/kernel/tables.h
new file mode 100644
index 00000000000..768033d4ffe
--- /dev/null
+++ b/intern/cycles/kernel/tables.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* clang-format off */
+
+ccl_inline_constant float blackbody_table_r[][3] = {
+  {2.52432244e+03f, -1.06185848e-03f, 3.11067539e+00f},
+  {3.37763626e+03f, -4.34581697e-04f, 1.64843306e+00f},
+  {4.10671449e+03f, -8.61949938e-05f, 6.41423749e-01f},
+  {4.66849800e+03f, 2.85655028e-05f, 1.29075375e-01f},
+  {4.60124770e+03f, 2.89727618e-05f, 1.48001316e-01f},
+  {3.78765709e+03f, 9.36026367e-06f, 3.98995841e-01f}
+};
+
+ccl_inline_constant float blackbody_table_g[][3] = {
+  {-7.50343014e+02f, 3.15679613e-04f, 4.73464526e-01f},
+  {-1.00402363e+03f, 1.29189794e-04f, 9.08181524e-01f},
+  {-1.22075471e+03f, 2.56245413e-05f, 1.20753416e+00f},
+  {-1.42546105e+03f, -4.01730887e-05f, 1.44002695e+00f},
+  {-1.18134453e+03f, -2.18913373e-05f, 1.30656109e+00f},
+  {-5.00279505e+02f, -4.59745390e-06f, 1.09090465e+00f}
+};
+
+ccl_inline_constant float blackbody_table_b[][4] = {
+  {0.0f, 0.0f, 0.0f, 0.0f}, /* zeros should be optimized by compiler */
+  {0.0f, 0.0f, 0.0f, 0.0f},
+  {0.0f, 0.0f, 0.0f, 0.0f},
+  {-2.02524603e-11f, 1.79435860e-07f, -2.60561875e-04f, -1.41761141e-02f},
+  {-2.22463426e-13f, -1.55078698e-08f, 3.81675160e-04f, -7.30646033e-01f},
+  {6.72595954e-13f, -2.73059993e-08f, 4.24068546e-04f, -7.52204323e-01f}
+};
+
+ccl_inline_constant float cie_colour_match[][3] = { /* {xBar, yBar, zBar}; row = (lambda_nm - 380) / 5, 380..780 nm */
+  {0.0014f, 0.0000f, 0.0065f}, {0.0022f, 0.0001f, 0.0105f}, {0.0042f, 0.0001f, 0.0201f},
+  {0.0076f, 0.0002f, 0.0362f}, {0.0143f, 0.0004f, 0.0679f}, {0.0232f, 0.0006f, 0.1102f},
+  {0.0435f, 0.0012f, 0.2074f}, {0.0776f, 0.0022f, 0.3713f}, {0.1344f, 0.0040f, 0.6456f},
+  {0.2148f, 0.0073f, 1.0391f}, {0.2839f, 0.0116f, 1.3856f}, {0.3285f, 0.0168f, 1.6230f},
+  {0.3483f, 0.0230f, 1.7471f}, {0.3481f, 0.0298f, 1.7826f}, {0.3362f, 0.0380f, 1.7721f},
+  {0.3187f, 0.0480f, 1.7441f}, {0.2908f, 0.0600f, 1.6692f}, {0.2511f, 0.0739f, 1.5281f},
+  {0.1954f, 0.0910f, 1.2876f}, {0.1421f, 0.1126f, 1.0419f}, {0.0956f, 0.1390f, 0.8130f},
+  {0.0580f, 0.1693f, 0.6162f}, {0.0320f, 0.2080f, 0.4652f}, {0.0147f, 0.2586f, 0.3533f},
+  {0.0049f, 0.3230f, 0.2720f}, {0.0024f, 0.4073f, 0.2123f}, {0.0093f, 0.5030f, 0.1582f},
+  {0.0291f, 0.6082f, 0.1117f}, {0.0633f, 0.7100f, 0.0782f}, {0.1096f, 0.7932f, 0.0573f},
+  {0.1655f, 0.8620f, 0.0422f}, {0.2257f, 0.9149f, 0.0298f}, {0.2904f, 0.9540f, 0.0203f},
+  {0.3597f, 0.9803f, 0.0134f}, {0.4334f, 0.9950f, 0.0087f}, {0.5121f, 1.0000f, 0.0057f},
+  {0.5945f, 0.9950f, 0.0039f}, {0.6784f, 0.9786f, 0.0027f}, {0.7621f, 0.9520f, 0.0021f},
+  {0.8425f, 0.9154f, 0.0018f}, {0.9163f, 0.8700f, 0.0017f}, {0.9786f, 0.8163f, 0.0014f},
+  {1.0263f, 0.7570f, 0.0011f}, {1.0567f, 0.6949f, 0.0010f}, {1.0622f, 0.6310f, 0.0008f},
+  {1.0456f, 0.5668f, 0.0006f}, {1.0026f, 0.5030f, 0.0003f}, {0.9384f, 0.4412f, 0.0002f},
+  {0.8544f, 0.3810f, 0.0002f}, {0.7514f, 0.3210f, 0.0001f}, {0.6424f, 0.2650f, 0.0000f},
+  {0.5419f, 0.2170f, 0.0000f}, {0.4479f, 0.1750f, 0.0000f}, {0.3608f, 0.1382f, 0.0000f},
+  {0.2835f, 0.1070f, 0.0000f}, {0.2187f, 0.0816f, 0.0000f}, {0.1649f, 0.0610f, 0.0000f},
+  {0.1212f, 0.0446f, 0.0000f}, {0.0874f, 0.0320f, 0.0000f}, {0.0636f, 0.0232f, 0.0000f},
+  {0.0468f, 0.0170f, 0.0000f}, {0.0329f, 0.0119f, 0.0000f}, {0.0227f, 0.0082f, 0.0000f},
+  {0.0158f, 0.0057f, 0.0000f}, {0.0114f, 0.0041f, 0.0000f}, {0.0081f, 0.0029f, 0.0000f},
+  {0.0058f, 0.0021f, 0.0000f}, {0.0041f, 0.0015f, 0.0000f}, {0.0029f, 0.0010f, 0.0000f},
+  {0.0020f, 0.0007f, 0.0000f}, {0.0014f, 0.0005f, 0.0000f}, {0.0010f, 0.0004f, 0.0000f},
+  {0.0007f, 0.0002f, 0.0000f}, {0.0005f, 0.0002f, 0.0000f}, {0.0003f, 0.0001f, 0.0000f},
+  {0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
+  {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}
+};
+
+/* clang-format on */
+\ No newline at end of file
diff --git a/intern/cycles/kernel/textures.h b/intern/cycles/kernel/textures.h
index 464ecb183cb..2e3ae29a19a 100644
--- a/intern/cycles/kernel/textures.h
+++ b/intern/cycles/kernel/textures.h
@@ -40,11 +40,11 @@ KERNEL_TEX(DecomposedTransform, __camera_motion)
 
 /* triangles */
 KERNEL_TEX(uint, __tri_shader)
-KERNEL_TEX(float4, __tri_vnormal)
+KERNEL_TEX(packed_float3, __tri_vnormal)
 KERNEL_TEX(uint4, __tri_vindex)
 KERNEL_TEX(uint, __tri_patch)
 KERNEL_TEX(float2, __tri_patch_uv)
-KERNEL_TEX(float4, __tri_verts)
+KERNEL_TEX(packed_float3, __tri_verts)
 
 /* curves */
 KERNEL_TEX(KernelCurve, __curves)
@@ -58,7 +58,8 @@ KERNEL_TEX(uint, __patches)
 KERNEL_TEX(uint4, __attributes_map)
 KERNEL_TEX(float, __attributes_float)
 KERNEL_TEX(float2, __attributes_float2)
-KERNEL_TEX(float4, __attributes_float3)
+KERNEL_TEX(packed_float3, __attributes_float3)
+KERNEL_TEX(float4, __attributes_float4)
 KERNEL_TEX(uchar4, __attributes_uchar4)
 
 /* lights */
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 2f6cadf7496..4a730dbfaaa 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -36,13 +36,6 @@
 #  define __KERNEL_CPU__
 #endif
 
-/* TODO(sergey): This is only to make it possible to include this header
- * from outside of the kernel. but this could be done somewhat cleaner?
- */
-#ifndef ccl_addr_space
-#  define ccl_addr_space
-#endif
-
 CCL_NAMESPACE_BEGIN
 
 /* Constants */
@@ -489,6 +482,16 @@ enum PanoramaType {
   PANORAMA_NUM_TYPES,
 };
 
+/* Direct Light Sampling */
+
+enum DirectLightSamplingType {
+  DIRECT_LIGHT_SAMPLING_MIS = 0,
+  DIRECT_LIGHT_SAMPLING_FORWARD = 1,
+  DIRECT_LIGHT_SAMPLING_NEE = 2,
+
+  DIRECT_LIGHT_SAMPLING_NUM,
+};
+
 /* Differential */
 
 typedef struct differential3 {
@@ -1201,8 +1204,11 @@ typedef struct KernelIntegrator {
   /* Closure filter. */
   int filter_closures;
 
+  /* MIS debugging */
+  int direct_light_sampling_type;
+
   /* padding */
-  int pad1, pad2, pad3;
+  int pad1, pad2;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
@@ -1426,6 +1432,7 @@ typedef struct KernelWorkTile {
 
   uint start_sample;
   uint num_samples;
+  uint sample_offset;
 
   int offset;
   uint stride;
diff --git a/intern/cycles/scene/attribute.cpp b/intern/cycles/scene/attribute.cpp
index 3401eea307f..6d15f3325f7 100644
--- a/intern/cycles/scene/attribute.cpp
+++ b/intern/cycles/scene/attribute.cpp
@@ -404,6 +404,10 @@ AttrKernelDataType Attribute::kernel_type(const Attribute &attr)
     return AttrKernelDataType::FLOAT2;
   }
 
+  if (attr.type == TypeFloat4 || attr.type == TypeRGBA || attr.type == TypeDesc::TypeMatrix) {
+    return AttrKernelDataType::FLOAT4;
+  }
+
   return AttrKernelDataType::FLOAT3;
 }
 
@@ -585,7 +589,7 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
         attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE);
         break;
       case ATTR_STD_MOTION_VERTEX_POSITION:
-        attr = add(name, TypeDesc::TypePoint, ATTR_ELEMENT_CURVE_KEY_MOTION);
+        attr = add(name, TypeDesc::TypeFloat4, ATTR_ELEMENT_CURVE_KEY_MOTION);
         break;
       case ATTR_STD_CURVE_INTERCEPT:
         attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY);
diff --git a/intern/cycles/scene/attribute.h b/intern/cycles/scene/attribute.h
index 4a25a900c14..612a0b7c80d 100644
--- a/intern/cycles/scene/attribute.h
+++ b/intern/cycles/scene/attribute.h
@@ -47,12 +47,7 @@ struct Transform;
  *
  * The values of this enumeration are also used as flags to detect changes in AttributeSet. */
 
-enum AttrKernelDataType {
-  FLOAT = 0,
-  FLOAT2 = 1,
-  FLOAT3 = 2,
-  UCHAR4 = 3,
-};
+enum AttrKernelDataType { FLOAT = 0, FLOAT2 = 1, FLOAT3 = 2, FLOAT4 = 3, UCHAR4 = 4, NUM = 5 };
 
 /* Attribute
  *
diff --git a/intern/cycles/scene/geometry.cpp b/intern/cycles/scene/geometry.cpp
index 8a3fc522d22..bf426fc49f6 100644
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@@ -551,6 +551,7 @@ static void update_attribute_element_size(Geometry *geom,
                                           size_t *attr_float_size,
                                           size_t *attr_float2_size,
                                           size_t *attr_float3_size,
+                                          size_t *attr_float4_size,
                                           size_t *attr_uchar4_size)
 {
   if (mattr) {
@@ -569,7 +570,10 @@ static void update_attribute_element_size(Geometry *geom,
       *attr_float2_size += size;
     }
     else if (mattr->type == TypeDesc::TypeMatrix) {
-      *attr_float3_size += size * 4;
+      *attr_float4_size += size * 4;
+    }
+    else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) {
+      *attr_float4_size += size;
     }
     else {
       *attr_float3_size += size;
@@ -582,8 +586,10 @@ void GeometryManager::update_attribute_element_offset(Geometry *geom,
                                                       size_t &attr_float_offset,
                                                       device_vector<float2> &attr_float2,
                                                       size_t &attr_float2_offset,
-                                                      device_vector<float4> &attr_float3,
+                                                      device_vector<packed_float3> &attr_float3,
                                                       size_t &attr_float3_offset,
+                                                      device_vector<float4> &attr_float4,
+                                                      size_t &attr_float4_offset,
                                                       device_vector<uchar4> &attr_uchar4,
                                                       size_t &attr_uchar4_offset,
                                                       Attribute *mattr,
@@ -646,18 +652,30 @@ void GeometryManager::update_attribute_element_offset(Geometry *geom,
     }
     else if (mattr->type == TypeDesc::TypeMatrix) {
       Transform *tfm = mattr->data_transform();
-      offset = attr_float3_offset;
+      offset = attr_float4_offset;
 
-      assert(attr_float3.size() >= offset + size * 3);
+      assert(attr_float4.size() >= offset + size * 3);
       if (mattr->modified) {
         for (size_t k = 0; k < size * 3; k++) {
-          attr_float3[offset + k] = (&tfm->x)[k];
+          attr_float4[offset + k] = (&tfm->x)[k];
         }
       }
-      attr_float3_offset += size * 3;
+      attr_float4_offset += size * 3;
     }
-    else {
+    else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) {
       float4 *data = mattr->data_float4();
+      offset = attr_float4_offset;
+
+      assert(attr_float4.size() >= offset + size);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size; k++) {
+          attr_float4[offset + k] = data[k];
+        }
+      }
+      attr_float4_offset += size;
+    }
+    else {
+      float3 *data = mattr->data_float3();
       offset = attr_float3_offset;
 
       assert(attr_float3.size() >= offset + size);
@@ -783,6 +801,7 @@ void GeometryManager::device_update_attributes(Device *device,
   size_t attr_float_size = 0;
   size_t attr_float2_size = 0;
   size_t attr_float3_size = 0;
+  size_t attr_float4_size = 0;
   size_t attr_uchar4_size = 0;
 
   for (size_t i = 0; i < scene->geometry.size(); i++) {
@@ -797,6 +816,7 @@ void GeometryManager::device_update_attributes(Device *device,
                                     &attr_float_size,
                                     &attr_float2_size,
                                     &attr_float3_size,
+                                    &attr_float4_size,
                                     &attr_uchar4_size);
 
       if (geom->is_mesh()) {
@@ -809,6 +829,7 @@ void GeometryManager::device_update_attributes(Device *device,
                                       &attr_float_size,
                                       &attr_float2_size,
                                       &attr_float3_size,
+                                      &attr_float4_size,
                                       &attr_uchar4_size);
       }
     }
@@ -824,6 +845,7 @@ void GeometryManager::device_update_attributes(Device *device,
                                     &attr_float_size,
                                     &attr_float2_size,
                                     &attr_float3_size,
+                                    &attr_float4_size,
                                     &attr_uchar4_size);
     }
   }
@@ -831,19 +853,22 @@ void GeometryManager::device_update_attributes(Device *device,
   dscene->attributes_float.alloc(attr_float_size);
   dscene->attributes_float2.alloc(attr_float2_size);
   dscene->attributes_float3.alloc(attr_float3_size);
+  dscene->attributes_float4.alloc(attr_float4_size);
   dscene->attributes_uchar4.alloc(attr_uchar4_size);
 
   /* The order of those flags needs to match that of AttrKernelDataType. */
-  const bool attributes_need_realloc[4] = {
+  const bool attributes_need_realloc[AttrKernelDataType::NUM] = {
       dscene->attributes_float.need_realloc(),
       dscene->attributes_float2.need_realloc(),
       dscene->attributes_float3.need_realloc(),
+      dscene->attributes_float4.need_realloc(),
       dscene->attributes_uchar4.need_realloc(),
   };
 
   size_t attr_float_offset = 0;
   size_t attr_float2_offset = 0;
   size_t attr_float3_offset = 0;
+  size_t attr_float4_offset = 0;
   size_t attr_uchar4_offset = 0;
 
   /* Fill in attributes. */
@@ -868,6 +893,8 @@ void GeometryManager::device_update_attributes(Device *device,
                                       attr_float2_offset,
                                       dscene->attributes_float3,
                                       attr_float3_offset,
+                                      dscene->attributes_float4,
+                                      attr_float4_offset,
                                       dscene->attributes_uchar4,
                                       attr_uchar4_offset,
                                       attr,
@@ -891,6 +918,8 @@ void GeometryManager::device_update_attributes(Device *device,
                                         attr_float2_offset,
                                         dscene->attributes_float3,
                                         attr_float3_offset,
+                                        dscene->attributes_float4,
+                                        attr_float4_offset,
                                         dscene->attributes_uchar4,
                                         attr_uchar4_offset,
                                         subd_attr,
@@ -923,6 +952,8 @@ void GeometryManager::device_update_attributes(Device *device,
                                       attr_float2_offset,
                                       dscene->attributes_float3,
                                       attr_float3_offset,
+                                      dscene->attributes_float4,
+                                      attr_float4_offset,
                                       dscene->attributes_uchar4,
                                       attr_uchar4_offset,
                                       attr,
@@ -954,6 +985,7 @@ void GeometryManager::device_update_attributes(Device *device,
   dscene->attributes_float.copy_to_device_if_modified();
   dscene->attributes_float2.copy_to_device_if_modified();
   dscene->attributes_float3.copy_to_device_if_modified();
+  dscene->attributes_float4.copy_to_device_if_modified();
   dscene->attributes_uchar4.copy_to_device_if_modified();
 
   if (progress.get_cancel())
@@ -1080,9 +1112,9 @@ void GeometryManager::device_update_mesh(Device *,
     /* normals */
     progress.set_status("Updating Mesh", "Computing normals");
 
-    float4 *tri_verts = dscene->tri_verts.alloc(tri_size * 3);
+    packed_float3 *tri_verts = dscene->tri_verts.alloc(tri_size * 3);
     uint *tri_shader = dscene->tri_shader.alloc(tri_size);
-    float4 *vnormal = dscene->tri_vnormal.alloc(vert_size);
+    packed_float3 *vnormal = dscene->tri_vnormal.alloc(vert_size);
     uint4 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
     uint *tri_patch = dscene->tri_patch.alloc(tri_size);
     float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
@@ -1293,18 +1325,21 @@ enum {
   ATTR_FLOAT_MODIFIED = (1 << 2),
   ATTR_FLOAT2_MODIFIED = (1 << 3),
   ATTR_FLOAT3_MODIFIED = (1 << 4),
-  ATTR_UCHAR4_MODIFIED = (1 << 5),
+  ATTR_FLOAT4_MODIFIED = (1 << 5),
+  ATTR_UCHAR4_MODIFIED = (1 << 6),
 
-  CURVE_DATA_NEED_REALLOC = (1 << 6),
-  MESH_DATA_NEED_REALLOC = (1 << 7),
+  CURVE_DATA_NEED_REALLOC = (1 << 7),
+  MESH_DATA_NEED_REALLOC = (1 << 8),
 
-  ATTR_FLOAT_NEEDS_REALLOC = (1 << 8),
-  ATTR_FLOAT2_NEEDS_REALLOC = (1 << 9),
-  ATTR_FLOAT3_NEEDS_REALLOC = (1 << 10),
-  ATTR_UCHAR4_NEEDS_REALLOC = (1 << 11),
+  ATTR_FLOAT_NEEDS_REALLOC = (1 << 9),
+  ATTR_FLOAT2_NEEDS_REALLOC = (1 << 10),
+  ATTR_FLOAT3_NEEDS_REALLOC = (1 << 11),
+  ATTR_FLOAT4_NEEDS_REALLOC = (1 << 12),
+  ATTR_UCHAR4_NEEDS_REALLOC = (1 << 13),
 
   ATTRS_NEED_REALLOC = (ATTR_FLOAT_NEEDS_REALLOC | ATTR_FLOAT2_NEEDS_REALLOC |
-                        ATTR_FLOAT3_NEEDS_REALLOC | ATTR_UCHAR4_NEEDS_REALLOC),
+                        ATTR_FLOAT3_NEEDS_REALLOC | ATTR_FLOAT4_NEEDS_REALLOC |
+                        ATTR_UCHAR4_NEEDS_REALLOC),
   DEVICE_MESH_DATA_NEEDS_REALLOC = (MESH_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC),
   DEVICE_CURVE_DATA_NEEDS_REALLOC = (CURVE_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC),
 };
@@ -1332,10 +1367,17 @@ static void update_device_flags_attribute(uint32_t &device_update_flags,
         device_update_flags |= ATTR_FLOAT3_MODIFIED;
         break;
       }
+      case AttrKernelDataType::FLOAT4: {
+        device_update_flags |= ATTR_FLOAT4_MODIFIED;
+        break;
+      }
       case AttrKernelDataType::UCHAR4: {
         device_update_flags |= ATTR_UCHAR4_MODIFIED;
         break;
       }
+      case AttrKernelDataType::NUM: {
+        break;
+      }
     }
   }
 }
@@ -1352,6 +1394,9 @@ static void update_attribute_realloc_flags(uint32_t &device_update_flags,
   if (attributes.modified(AttrKernelDataType::FLOAT3)) {
     device_update_flags |= ATTR_FLOAT3_NEEDS_REALLOC;
   }
+  if (attributes.modified(AttrKernelDataType::FLOAT4)) {
+    device_update_flags |= ATTR_FLOAT4_NEEDS_REALLOC;
+  }
   if (attributes.modified(AttrKernelDataType::UCHAR4)) {
     device_update_flags |= ATTR_UCHAR4_NEEDS_REALLOC;
   }
@@ -1553,6 +1598,14 @@ void GeometryManager::device_update_preprocess(Device *device, Scene *scene, Pro
     dscene->attributes_float3.tag_modified();
   }
 
+  if (device_update_flags & ATTR_FLOAT4_NEEDS_REALLOC) {
+    dscene->attributes_map.tag_realloc();
+    dscene->attributes_float4.tag_realloc();
+  }
+  else if (device_update_flags & ATTR_FLOAT4_MODIFIED) {
+    dscene->attributes_float4.tag_modified();
+  }
+
   if (device_update_flags & ATTR_UCHAR4_NEEDS_REALLOC) {
     dscene->attributes_map.tag_realloc();
     dscene->attributes_uchar4.tag_realloc();
@@ -2014,6 +2067,7 @@ void GeometryManager::device_update(Device *device,
   dscene->attributes_float.clear_modified();
   dscene->attributes_float2.clear_modified();
   dscene->attributes_float3.clear_modified();
+  dscene->attributes_float4.clear_modified();
   dscene->attributes_uchar4.clear_modified();
 }
 
@@ -2041,6 +2095,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc
   dscene->attributes_float.free_if_need_realloc(force_free);
   dscene->attributes_float2.free_if_need_realloc(force_free);
   dscene->attributes_float3.free_if_need_realloc(force_free);
+  dscene->attributes_float4.free_if_need_realloc(force_free);
   dscene->attributes_uchar4.free_if_need_realloc(force_free);
 
   /* Signal for shaders like displacement not to do ray tracing. */
diff --git a/intern/cycles/scene/geometry.h b/intern/cycles/scene/geometry.h
index 335bcdcd0b7..91799d7fde8 100644
--- a/intern/cycles/scene/geometry.h
+++ b/intern/cycles/scene/geometry.h
@@ -257,8 +257,10 @@ class GeometryManager {
                                               size_t &attr_float_offset,
                                               device_vector<float2> &attr_float2,
                                               size_t &attr_float2_offset,
-                                              device_vector<float4> &attr_float3,
+                                              device_vector<packed_float3> &attr_float3,
                                               size_t &attr_float3_offset,
+                                              device_vector<float4> &attr_float4,
+                                              size_t &attr_float4_offset,
                                               device_vector<uchar4> &attr_uchar4,
                                               size_t &attr_uchar4_offset,
                                               Attribute *mattr,
diff --git a/intern/cycles/scene/image.cpp b/intern/cycles/scene/image.cpp
index 80091e01b8c..8bb2d87fd1e 100644
--- a/intern/cycles/scene/image.cpp
+++ b/intern/cycles/scene/image.cpp
@@ -303,7 +303,6 @@ ImageManager::ImageManager(const DeviceInfo &info)
   animation_frame = 0;
 
   /* Set image limits */
-  features.has_half_float = info.has_half_images;
   features.has_nanovdb = info.has_nanovdb;
 }
 
@@ -357,8 +356,6 @@ void ImageManager::load_image_metadata(Image *img)
 
   metadata.detect_colorspace();
 
-  assert(features.has_half_float ||
-         (metadata.type != IMAGE_DATA_TYPE_HALF4 && metadata.type != IMAGE_DATA_TYPE_HALF));
   assert(features.has_nanovdb || (metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
                                   metadata.type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3));
 
diff --git a/intern/cycles/scene/image.h b/intern/cycles/scene/image.h
index 6447b028ebf..7cf09dd6d8f 100644
--- a/intern/cycles/scene/image.h
+++ b/intern/cycles/scene/image.h
@@ -100,7 +100,6 @@ class ImageMetaData {
 /* Information about supported features that Image loaders can use. */
 class ImageDeviceFeatures {
  public:
-  bool has_half_float;
   bool has_nanovdb;
 };
 
diff --git a/intern/cycles/scene/image_oiio.cpp b/intern/cycles/scene/image_oiio.cpp
index feafae035a1..4cea7fbfb01 100644
--- a/intern/cycles/scene/image_oiio.cpp
+++ b/intern/cycles/scene/image_oiio.cpp
@@ -30,7 +30,8 @@ OIIOImageLoader::~OIIOImageLoader()
 {
 }
 
-bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMetaData &metadata)
+bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures & /*features*/,
+                                    ImageMetaData &metadata)
 {
   /* Perform preliminary checks, with meaningful logging. */
   if (!path_exists(filepath.string())) {
@@ -76,7 +77,7 @@ bool OIIOImageLoader::load_metadata(const ImageDeviceFeatures &features, ImageMe
   }
 
   /* check if it's half float */
-  if (spec.format == TypeDesc::HALF && features.has_half_float) {
+  if (spec.format == TypeDesc::HALF) {
     is_half = true;
   }
 
diff --git a/intern/cycles/scene/integrator.cpp b/intern/cycles/scene/integrator.cpp
index a97833a8d32..31e645c1f3a 100644
--- a/intern/cycles/scene/integrator.cpp
+++ b/intern/cycles/scene/integrator.cpp
@@ -54,6 +54,18 @@ NODE_DEFINE(Integrator)
   SOCKET_INT(transparent_min_bounce, "Transparent Min Bounce", 0);
   SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
 
+#ifdef WITH_CYCLES_DEBUG
+  static NodeEnum direct_light_sampling_type_enum;
+  direct_light_sampling_type_enum.insert("multiple_importance_sampling",
+                                         DIRECT_LIGHT_SAMPLING_MIS);
+  direct_light_sampling_type_enum.insert("forward_path_tracing", DIRECT_LIGHT_SAMPLING_FORWARD);
+  direct_light_sampling_type_enum.insert("next_event_estimation", DIRECT_LIGHT_SAMPLING_NEE);
+  SOCKET_ENUM(direct_light_sampling_type,
+              "Direct Light Sampling Type",
+              direct_light_sampling_type_enum,
+              DIRECT_LIGHT_SAMPLING_MIS);
+#endif
+
   SOCKET_INT(ao_bounces, "AO Bounces", 0);
   SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f);
   SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX);
@@ -173,6 +185,12 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
   kintegrator->ao_bounces_factor = ao_factor;
   kintegrator->ao_additive_factor = ao_additive_factor;
 
+#ifdef WITH_CYCLES_DEBUG
+  kintegrator->direct_light_sampling_type = direct_light_sampling_type;
+#else
+  kintegrator->direct_light_sampling_type = DIRECT_LIGHT_SAMPLING_MIS;
+#endif
+
   /* Transparent Shadows
    * We only need to enable transparent shadows, if we actually have
    * transparent shaders in the scene. Otherwise we can disable it
diff --git a/intern/cycles/scene/integrator.h b/intern/cycles/scene/integrator.h
index 464d96ca01b..52f1b296a20 100644
--- a/intern/cycles/scene/integrator.h
+++ b/intern/cycles/scene/integrator.h
@@ -41,6 +41,10 @@ class Integrator : public Node {
   NODE_SOCKET_API(int, max_transmission_bounce)
   NODE_SOCKET_API(int, max_volume_bounce)
 
+#ifdef WITH_CYCLES_DEBUG
+  NODE_SOCKET_API(DirectLightSamplingType, direct_light_sampling_type)
+#endif
+
   NODE_SOCKET_API(int, transparent_min_bounce)
   NODE_SOCKET_API(int, transparent_max_bounce)
 
diff --git a/intern/cycles/scene/mesh.cpp b/intern/cycles/scene/mesh.cpp
index f47dab30869..e65b8462e34 100644
--- a/intern/cycles/scene/mesh.cpp
+++ b/intern/cycles/scene/mesh.cpp
@@ -707,7 +707,7 @@ void Mesh::pack_shaders(Scene *scene, uint *tri_shader)
   }
 }
 
-void Mesh::pack_normals(float4 *vnormal)
+void Mesh::pack_normals(packed_float3 *vnormal)
 {
   Attribute *attr_vN = attributes.find(ATTR_STD_VERTEX_NORMAL);
   if (attr_vN == NULL) {
@@ -727,11 +727,14 @@ void Mesh::pack_normals(float4 *vnormal)
     if (do_transform)
       vNi = safe_normalize(transform_direction(&ntfm, vNi));
 
-    vnormal[i] = make_float4(vNi.x, vNi.y, vNi.z, 0.0f);
+    vnormal[i] = make_float3(vNi.x, vNi.y, vNi.z);
   }
 }
 
-void Mesh::pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, float2 *tri_patch_uv)
+void Mesh::pack_verts(packed_float3 *tri_verts,
+                      uint4 *tri_vindex,
+                      uint *tri_patch,
+                      float2 *tri_patch_uv)
 {
   size_t verts_size = verts.size();
 
@@ -752,9 +755,9 @@ void Mesh::pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, flo
 
     tri_patch[i] = (!get_num_subd_faces()) ? -1 : (triangle_patch[i] * 8 + patch_offset);
 
-    tri_verts[i * 3] = float3_to_float4(verts[t.v[0]]);
-    tri_verts[i * 3 + 1] = float3_to_float4(verts[t.v[1]]);
-    tri_verts[i * 3 + 2] = float3_to_float4(verts[t.v[2]]);
+    tri_verts[i * 3] = verts[t.v[0]];
+    tri_verts[i * 3 + 1] = verts[t.v[1]];
+    tri_verts[i * 3 + 2] = verts[t.v[2]];
   }
 }
 
diff --git a/intern/cycles/scene/mesh.h b/intern/cycles/scene/mesh.h
index d13b3003164..254672d0620 100644
--- a/intern/cycles/scene/mesh.h
+++ b/intern/cycles/scene/mesh.h
@@ -223,8 +223,11 @@ class Mesh : public Geometry {
   void get_uv_tiles(ustring map, unordered_set<int> &tiles) override;
 
   void pack_shaders(Scene *scene, uint *shader);
-  void pack_normals(float4 *vnormal);
-  void pack_verts(float4 *tri_verts, uint4 *tri_vindex, uint *tri_patch, float2 *tri_patch_uv);
+  void pack_normals(packed_float3 *vnormal);
+  void pack_verts(packed_float3 *tri_verts,
+                  uint4 *tri_vindex,
+                  uint *tri_patch,
+                  float2 *tri_patch_uv);
   void pack_patches(uint *patch_data);
 
   PrimitiveType primitive_type() const override;
diff --git a/intern/cycles/scene/mesh_subdivision.cpp b/intern/cycles/scene/mesh_subdivision.cpp
index a0c0bc68f8b..35f15cfafbc 100644
--- a/intern/cycles/scene/mesh_subdivision.cpp
+++ b/intern/cycles/scene/mesh_subdivision.cpp
@@ -331,7 +331,8 @@ struct OsdPatch : Patch {
 
   void eval(float3 *P, float3 *dPdu, float3 *dPdv, float3 *N, float u, float v)
   {
-    const Far::PatchTable::PatchHandle *handle = osd_data->patch_map->FindPatch(patch_index, u, v);
+    const Far::PatchTable::PatchHandle *handle = osd_data->patch_map->FindPatch(
+        patch_index, (double)u, (double)v);
     assert(handle);
 
     float p_weights[20], du_weights[20], dv_weights[20];
diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp
index ef0ee0c6625..4230abe9a1b 100644
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -74,6 +74,7 @@ DeviceScene::DeviceScene(Device *device)
       attributes_float(device, "__attributes_float", MEM_GLOBAL),
       attributes_float2(device, "__attributes_float2", MEM_GLOBAL),
       attributes_float3(device, "__attributes_float3", MEM_GLOBAL),
+      attributes_float4(device, "__attributes_float4", MEM_GLOBAL),
       attributes_uchar4(device, "__attributes_uchar4", MEM_GLOBAL),
       light_distribution(device, "__light_distribution", MEM_GLOBAL),
       lights(device, "__lights", MEM_GLOBAL),
diff --git a/intern/cycles/scene/scene.h b/intern/cycles/scene/scene.h
index fa7fc54602a..4af05349dd3 100644
--- a/intern/cycles/scene/scene.h
+++ b/intern/cycles/scene/scene.h
@@ -81,9 +81,9 @@ class DeviceScene {
   device_vector<float2> prim_time;
 
   /* mesh */
-  device_vector<float4> tri_verts;
+  device_vector<packed_float3> tri_verts;
   device_vector<uint> tri_shader;
-  device_vector<float4> tri_vnormal;
+  device_vector<packed_float3> tri_vnormal;
   device_vector<uint4> tri_vindex;
   device_vector<uint> tri_patch;
   device_vector<float2> tri_patch_uv;
@@ -108,7 +108,8 @@ class DeviceScene {
   device_vector<uint4> attributes_map;
   device_vector<float> attributes_float;
   device_vector<float2> attributes_float2;
-  device_vector<float4> attributes_float3;
+  device_vector<packed_float3> attributes_float3;
+  device_vector<float4> attributes_float4;
   device_vector<uchar4> attributes_uchar4;
 
   /* lights */
diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp
index 8a9ef56b0ae..8c20807a52b 100644
--- a/intern/cycles/scene/shader_nodes.cpp
+++ b/intern/cycles/scene/shader_nodes.cpp
@@ -34,6 +34,8 @@
 #include "util/log.h"
 #include "util/transform.h"
 
+#include "kernel/tables.h"
+
 #include "kernel/svm/color_util.h"
 #include "kernel/svm/mapping_util.h"
 #include "kernel/svm/math_util.h"
diff --git a/intern/cycles/session/session.cpp b/intern/cycles/session/session.cpp
index 530baa8cafb..af5c6b3f1fd 100644
--- a/intern/cycles/session/session.cpp
+++ b/intern/cycles/session/session.cpp
@@ -262,6 +262,7 @@ RenderWork Session::run_update_for_next_iteration()
   }
 
   render_scheduler_.set_num_samples(params.samples);
+  render_scheduler_.set_start_sample(params.sample_offset);
   render_scheduler_.set_time_limit(params.time_limit);
 
   while (have_tiles) {
@@ -409,7 +410,7 @@ void Session::do_delayed_reset()
 
   /* Tile and work scheduling. */
   tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size());
-  render_scheduler_.reset(buffer_params_, params.samples);
+  render_scheduler_.reset(buffer_params_, params.samples, params.sample_offset);
 
   /* Passes. */
   /* When multiple tiles are used SAMPLE_COUNT pass is used to keep track of possible partial
diff --git a/intern/cycles/session/session.h b/intern/cycles/session/session.h
index 1ec0c6e9bb1..3f73593f008 100644
--- a/intern/cycles/session/session.h
+++ b/intern/cycles/session/session.h
@@ -54,6 +54,7 @@ class SessionParams {
 
   bool experimental;
   int samples;
+  int sample_offset;
   int pixel_size;
   int threads;
 
@@ -75,6 +76,7 @@ class SessionParams {
 
     experimental = false;
     samples = 1024;
+    sample_offset = 0;
     pixel_size = 1;
     threads = 0;
     time_limit = 0.0;
diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h
index faba411c769..afc3fd019df 100644
--- a/intern/cycles/util/atomic.h
+++ b/intern/cycles/util/atomic.h
@@ -63,6 +63,62 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile float *dest,
 
 #  endif /* __KERNEL_CUDA__ */
 
+#  ifdef __KERNEL_METAL__
+
+// global address space versions
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source,
+                                                   const float operand)
+{
+  volatile ccl_global atomic_int *source = (ccl_global atomic_int *)_source;
+  union {
+    int int_value;
+    float float_value;
+  } new_value, prev_value;
+  prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
+  do {
+    new_value.float_value = prev_value.float_value + operand;
+  } while (!atomic_compare_exchange_weak_explicit(source,
+                                                  &prev_value.int_value,
+                                                  new_value.int_value,
+                                                  memory_order_relaxed,
+                                                  memory_order_relaxed));
+
+  return new_value.float_value;
+}
+
+#    define atomic_fetch_and_add_uint32(p, x) \
+      atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
+#    define atomic_fetch_and_sub_uint32(p, x) \
+      atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
+#    define atomic_fetch_and_inc_uint32(p) \
+      atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
+#    define atomic_fetch_and_dec_uint32(p) \
+      atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
+#    define atomic_fetch_and_or_uint32(p, x) \
+      atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
+
+/* Swap *dest to new_val if it holds old_val; returns the value seen (old_val on success).
+ * The weak CAS may fail spuriously, so retry while the value read back still equals old_val. */
+ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
+                                                      const float old_val,
+                                                      const float new_val)
+{
+  int prev_value = __float_as_int(old_val);
+  while (!atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest, &prev_value,
+             __float_as_int(new_val), memory_order_relaxed, memory_order_relaxed) &&
+         prev_value == __float_as_int(old_val)) {
+  }
+  return __int_as_float(prev_value);
+}
+
+#    define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
+#    define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)
+
+#    define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
+#    define ccl_barrier(flags) threadgroup_barrier(flags)
+
+#  endif /* __KERNEL_METAL__ */
+
 #endif /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/debug.cpp b/intern/cycles/util/debug.cpp
index 7d5b6d4e54e..717e55a2c9a 100644
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -64,6 +64,11 @@ DebugFlags::HIP::HIP() : adaptive_compile(false)
   reset();
 }
 
+DebugFlags::Metal::Metal() : adaptive_compile(false)
+{
+  reset();
+}
+
 void DebugFlags::CUDA::reset()
 {
   if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
@@ -76,6 +81,12 @@ void DebugFlags::HIP::reset()
     adaptive_compile = true;
 }
 
+void DebugFlags::Metal::reset()
+{
+  if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
+    adaptive_compile = true;
+}
+
 DebugFlags::OptiX::OptiX()
 {
   reset();
@@ -97,6 +108,7 @@ void DebugFlags::reset()
   cpu.reset();
   cuda.reset();
   optix.reset();
+  metal.reset();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/debug.h b/intern/cycles/util/debug.h
index 548c67600e5..1e431fde68a 100644
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -116,6 +116,17 @@ class DebugFlags {
     bool use_debug;
   };
 
+  /* Descriptor of Metal feature-set to be used. */
+  struct Metal {
+    Metal();
+
+    /* Reset flags to their defaults. */
+    void reset();
+
+    /* Whether adaptive feature based runtime compile is enabled or not. */
+    bool adaptive_compile;
+  };
+
   /* Get instance of debug flags registry. */
   static DebugFlags &get()
   {
@@ -138,6 +149,9 @@ class DebugFlags {
   /* Requested HIP flags. */
   HIP hip;
 
+  /* Requested Metal flags. */
+  Metal metal;
+
  private:
   DebugFlags();
 
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index a778bef52b2..a2e8d83adb7 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -44,6 +44,7 @@
 #  if defined(_WIN32) && !defined(FREE_WINDOWS)
 #    define ccl_device_inline static __forceinline
 #    define ccl_device_forceinline static __forceinline
+#    define ccl_device_inline_method __forceinline
 #    define ccl_align(...) __declspec(align(__VA_ARGS__))
 #    ifdef __KERNEL_64_BIT__
 #      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
@@ -58,6 +59,7 @@
 #  else /* _WIN32 && !FREE_WINDOWS */
 #    define ccl_device_inline static inline __attribute__((always_inline))
 #    define ccl_device_forceinline static inline __attribute__((always_inline))
+#    define ccl_device_inline_method __attribute__((always_inline))
 #    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
 #    ifndef FREE_WINDOWS64
 #      define __forceinline inline __attribute__((always_inline))
@@ -70,7 +72,7 @@
 
 /* Address spaces for GPU. */
 #  define ccl_global
-#  define ccl_static_constant static const
+#  define ccl_inline_constant inline constexpr
 #  define ccl_constant const
 #  define ccl_private
 
diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h
index 016975e3c25..555f17259bd 100644
--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -28,8 +28,27 @@ CCL_NAMESPACE_BEGIN
 
 /* Half Floats */
 
+#if defined(__KERNEL_METAL__)
+
+ccl_device_inline float half_to_float(half h_in)
+{
+  union {
+    half h;
+    uint16_t s;
+  } val;
+  val.h = h_in;
+
+  /* Rebuild the float bit pattern: sign, re-biased exponent, then mantissa. */
+  float f;
+  *((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) |
+                             ((val.s & 0x03FF) << 13);
+  return f;
+}
+
+#else
+
 /* CUDA has its own half data type, no need to define then */
-#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
+#  if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__)
 /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
  * unsigned shorts. */
 class half {
@@ -53,11 +72,12 @@ class half {
  private:
   unsigned short v;
 };
-#endif
+#  endif
 
 struct half4 {
   half x, y, z, w;
 };
+#endif
 
 /* Conversion to/from half float for image textures
  *
@@ -66,7 +86,9 @@ struct half4 {
 
 ccl_device_inline half float_to_half_image(float f)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __float2half(f);
 #else
   const uint u = __float_as_uint(f);
@@ -92,7 +114,9 @@ ccl_device_inline half float_to_half_image(float f)
 
 ccl_device_inline float half_to_float_image(half h)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half_to_float(h);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __half2float(h);
 #else
   const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
@@ -125,7 +149,9 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h)
 
 ccl_device_inline half float_to_half_display(const float f)
 {
-#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
+#if defined(__KERNEL_METAL__)
+  return half(f);
+#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
   return __float2half(f);
 #else
   const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
diff --git a/intern/cycles/util/ies.cpp b/intern/cycles/util/ies.cpp
index 5e879478df5..e924d660407 100644
--- a/intern/cycles/util/ies.cpp
+++ b/intern/cycles/util/ies.cpp
@@ -23,10 +23,10 @@
 
 CCL_NAMESPACE_BEGIN
 
-// NOTE: For some reason gcc-7.2 does not instantiate this versio of allocator
-// gere (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8.
+// NOTE: For some reason gcc-7.2 does not instantiate this version of the
+// allocator here (used in IESTextParser). Works fine for gcc-6, gcc-7.3 and gcc-8.
 //
-// TODO(sergey): Get to the root of this issue, or confirm this i a compiler
+// TODO(sergey): Get to the root of this issue, or confirm this is a compiler
 // issue.
 template class GuardedAllocator<char>;
 
diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h
index e4c7df6e44a..6cfeb1aa917 100644
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -30,9 +30,11 @@
 #  include <hip/hip_vector_types.h>
 #endif
 
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
+#if !defined(__KERNEL_METAL__)
+#  include <float.h>
+#  include <math.h>
+#  include <stdio.h>
+#endif /* !defined(__KERNEL_METAL__) */
 
 #include "util/types.h"
 
@@ -174,6 +176,7 @@ ccl_device_inline float max4(float a, float b, float c, float d)
   return max(max(a, b), max(c, d));
 }
 
+#if !defined(__KERNEL_METAL__)
 /* Int/Float conversion */
 
 ccl_device_inline int as_int(uint i)
@@ -206,7 +209,7 @@ ccl_device_inline uint as_uint(float f)
   return u.i;
 }
 
-#ifndef __HIP__
+#  ifndef __HIP__
 ccl_device_inline int __float_as_int(float f)
 {
   union {
@@ -246,28 +249,33 @@ ccl_device_inline float __uint_as_float(uint i)
   u.i = i;
   return u.f;
 }
-#endif
+#  endif
 
 ccl_device_inline int4 __float4_as_int4(float4 f)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(f.m128));
-#else
+#  else
   return make_int4(
       __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 __int4_as_float4(int4 i)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_castsi128_ps(i.m128));
-#else
+#  else
   return make_float4(
       __int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-#endif
+#  endif
 }
+#endif /* !defined(__KERNEL_METAL__) */
 
+#if defined(__KERNEL_METAL__)
+#  define isnan_safe(v) isnan(v)
+#  define isfinite_safe(v) isfinite(v)
+#else
 template<typename T> ccl_device_inline uint pointer_pack_to_uint_0(T *ptr)
 {
   return ((uint64_t)ptr) & 0xFFFFFFFF;
@@ -311,12 +319,14 @@ ccl_device_inline bool isfinite_safe(float f)
   unsigned int x = __float_as_uint(f);
   return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f * f)) && !((x << 1) > 0xff000000u);
 }
+#endif
 
 ccl_device_inline float ensure_finite(float v)
 {
   return isfinite_safe(v) ? v : 0.0f;
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int clamp(int a, int mn, int mx)
 {
   return min(max(a, mn), mx);
@@ -346,15 +356,17 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
   return result;
 }
 
-#ifndef __KERNEL_CUDA__
+#endif /* !defined(__KERNEL_METAL__) */
+
+#if defined(__KERNEL_CUDA__)
 ccl_device_inline float saturatef(float a)
 {
-  return clamp(a, 0.0f, 1.0f);
+  return __saturatef(a);
 }
-#else
+#elif !defined(__KERNEL_METAL__)
 ccl_device_inline float saturatef(float a)
 {
-  return __saturatef(a);
+  return clamp(a, 0.0f, 1.0f);
 }
 #endif /* __KERNEL_CUDA__ */
 
@@ -491,6 +503,7 @@ CCL_NAMESPACE_END
 
 CCL_NAMESPACE_BEGIN
 
+#if !defined(__KERNEL_METAL__)
 /* Interpolation */
 
 template<class A, class B> A lerp(const A &a, const A &b, const B &t)
@@ -498,6 +511,8 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
   return (A)(a * ((B)1 - t) + b * t);
 }
 
+#endif /* !defined(__KERNEL_METAL__) */
+
 /* Triangle */
 
 ccl_device_inline float triangle_area(ccl_private const float3 &v1,
@@ -627,7 +642,11 @@ ccl_device_inline float safe_sqrtf(float f)
 
 ccl_device_inline float inversesqrtf(float f)
 {
+#if defined(__KERNEL_METAL__)
+  return (f > 0.0f) ? rsqrt(f) : 0.0f;
+#else
   return (f > 0.0f) ? 1.0f / sqrtf(f) : 0.0f;
+#endif
 }
 
 ccl_device float safe_asinf(float a)
@@ -715,10 +734,30 @@ ccl_device float bits_to_01(uint bits)
   return bits * (1.0f / (float)0xFFFFFFFF);
 }
 
+#if !defined(__KERNEL_GPU__)
+#  if defined(__GNUC__)
+#    define popcount(x) __builtin_popcount(x)
+#  else
+ccl_device_inline uint popcount(uint x)
+{
+  /* SWAR bit count. TODO(Stefan): use a pop-count intrinsic on Windows, keep this as fallback. */
+  uint i = x;
+  i = i - ((i >> 1) & 0x55555555);
+  i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+  i = (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
+  return i;
+}
+#  endif
+#elif !defined(__KERNEL_METAL__)
+#  define popcount(x) __popc(x)
+#endif
+
 ccl_device_inline uint count_leading_zeros(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __clz(x);
+#elif defined(__KERNEL_METAL__)
+  return clz(x);
 #else
   assert(x != 0);
 #  ifdef _MSC_VER
@@ -735,6 +774,8 @@ ccl_device_inline uint count_trailing_zeros(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return (__ffs(x) - 1);
+#elif defined(__KERNEL_METAL__)
+  return ctz(x);
 #else
   assert(x != 0);
 #  ifdef _MSC_VER
@@ -751,6 +792,8 @@ ccl_device_inline uint find_first_set(uint x)
 {
 #if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__)
   return __ffs(x);
+#elif defined(__KERNEL_METAL__)
+  return (x != 0) ? ctz(x) + 1 : 0;
 #else
 #  ifdef _MSC_VER
   return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
@@ -801,7 +844,7 @@ ccl_device_inline float2 map_to_sphere(const float3 co)
  * https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
  */
 
-ccl_device_inline float compare_floats(float a, float b, float abs_diff, int ulp_diff)
+ccl_device_inline bool compare_floats(float a, float b, float abs_diff, int ulp_diff)
 {
   if (fabsf(a - b) < abs_diff) {
     return true;
@@ -849,6 +892,8 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
   return x;
 #elif defined(__KERNEL_CUDA__)
   return __brev(x);
+#elif defined(__KERNEL_METAL__)
+  return reverse_bits(x);
 #elif __has_builtin(__builtin_bitreverse32)
   return __builtin_bitreverse32(x);
 #else
diff --git a/intern/cycles/util/math_float2.h b/intern/cycles/util/math_float2.h
index 87141d5bc37..8ff75c6c20a 100644
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float2 operator-(const float2 &a);
 ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
 ccl_device_inline float2 operator*(const float2 &a, float f);
@@ -63,6 +64,7 @@ ccl_device_inline float2 fabs(const float2 &a);
 ccl_device_inline float2 as_float2(const float4 &a);
 ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
 ccl_device_inline float2 floor(const float2 &a);
+#endif /* !__KERNEL_METAL__ */
 
 ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
 
@@ -80,6 +82,7 @@ ccl_device_inline float2 one_float2()
   return make_float2(1.0f, 1.0f);
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float2 operator-(const float2 &a)
 {
   return make_float2(-a.x, -a.y);
@@ -259,6 +262,8 @@ ccl_device_inline float2 floor(const float2 &a)
   return make_float2(floorf(a.x), floorf(a.y));
 }
 
+#endif /* !__KERNEL_METAL__ */
+
 ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
 {
   return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/math_float3.h b/intern/cycles/util/math_float3.h
index 81550c5d03c..1a0213f2a6d 100644
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float3 operator-(const float3 &a);
 ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
 ccl_device_inline float3 operator*(const float3 &a, const float f);
@@ -62,19 +63,20 @@ ccl_device_inline float3 rcp(const float3 &a);
 ccl_device_inline float3 sqrt(const float3 &a);
 ccl_device_inline float3 floor(const float3 &a);
 ccl_device_inline float3 ceil(const float3 &a);
+ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
+#endif /* !defined(__KERNEL_METAL__) */
 
 ccl_device_inline float min3(float3 a);
 ccl_device_inline float max3(float3 a);
 ccl_device_inline float len(const float3 a);
 ccl_device_inline float len_squared(const float3 a);
 
-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
 ccl_device_inline float3 project(const float3 v, const float3 v_proj);
 
 ccl_device_inline float3 saturate3(float3 a);
 ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, float *t);
+ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
+ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
 ccl_device_inline float3 safe_divide_float3_float3(const float3 a, const float3 b);
 ccl_device_inline float3 safe_divide_float3_float(const float3 a, const float b);
 ccl_device_inline float3 interp(float3 a, float3 b, float t);
@@ -103,49 +105,58 @@ ccl_device_inline float3 one_float3()
   return make_float3(1.0f, 1.0f, 1.0f);
 }
 
+#if defined(__KERNEL_METAL__)
+
+ccl_device_inline float3 rcp(float3 a)
+{
+  return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
+}
+
+#else
+
 ccl_device_inline float3 operator-(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-#else
+#  else
   return make_float3(-a.x, -a.y, -a.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_mul_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator*(const float3 &a, const float f)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
-#else
+#  else
   return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator*(const float f, const float3 &a)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-#else
+#  else
   return make_float3(a.x * f, a.y * f, a.z * f);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator/(const float f, const float3 &a)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
-#else
+#  else
   return make_float3(f / a.x, f / a.y, f / a.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator/(const float3 &a, const float f)
@@ -156,11 +167,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
 
 ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return float3(_mm_div_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator+(const float3 &a, const float f)
@@ -170,11 +181,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)
 
 ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_add_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator-(const float3 &a, const float f)
@@ -184,11 +195,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)
 
 ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_sub_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
@@ -222,13 +233,39 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
   return a = a * invf;
 }
 
+#  if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
+{
+  a = float3(a) * b;
+  return a;
+}
+
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
+{
+  a = float3(a) * f;
+  return a;
+}
+
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
+{
+  a = float3(a) / b;
+  return a;
+}
+
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
+{
+  a = float3(a) / f;
+  return a;
+}
+#  endif /* !(__KERNEL_METAL__ || __KERNEL_CUDA__) */
+
 ccl_device_inline bool operator==(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-#else
+#  else
   return (a.x == b.x && a.y == b.y && a.z == b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
@@ -243,20 +280,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)
 
 ccl_device_inline float dot(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-#else
+#  else
   return a.x * b.x + a.y * b.y + a.z * b.z;
-#endif
+#  endif
 }
 
 ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
   return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-#else
+#  else
   return a.x * b.x + a.y * b.y;
-#endif
+#  endif
 }
 
 ccl_device_inline float3 cross(const float3 &a, const float3 &b)
@@ -267,30 +304,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)
 
 ccl_device_inline float3 normalize(const float3 &a)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
   __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
   return float3(_mm_div_ps(a.m128, norm));
-#else
+#  else
   return a / len(a);
-#endif
+#  endif
 }
 
 ccl_device_inline float3 min(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_min_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 max(const float3 &a, const float3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_max_ps(a.m128, b.m128));
-#else
+#  else
   return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
@@ -300,43 +337,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
 
 ccl_device_inline float3 fabs(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
-#  ifdef __KERNEL_NEON__
+#  ifdef __KERNEL_SSE__
+#    ifdef __KERNEL_NEON__
   return float3(vabsq_f32(a.m128));
-#  else
+#    else
   __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
   return float3(_mm_and_ps(a.m128, mask));
-#  endif
-#else
+#    endif
+#  else
   return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 sqrt(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_sqrt_ps(a));
-#else
+#  else
   return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 floor(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_floor_ps(a));
-#else
+#  else
   return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 ceil(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float3(_mm_ceil_ps(a));
-#else
+#  else
   return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
-#endif
+#  endif
 }
 
 ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
@@ -346,13 +383,14 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
 
 ccl_device_inline float3 rcp(const float3 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   /* Don't use _mm_rcp_ps due to poor precision. */
   return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+#  else
   return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
-#endif
+#  endif
 }
+#endif /* !__KERNEL_METAL__ */
 
 ccl_device_inline float min3(float3 a)
 {
@@ -378,6 +416,7 @@ ccl_device_inline float len_squared(const float3 a)
   return dot(a, a);
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
 {
   float3 unit_normal = normalize(normal);
@@ -399,6 +438,7 @@ ccl_device_inline float3 faceforward(const float3 vector,
 {
   return (dot(reference, incident) < 0.0f) ? vector : -vector;
 }
+#endif
 
 ccl_device_inline float3 project(const float3 v, const float3 v_proj)
 {
@@ -479,7 +519,11 @@ ccl_device_inline float average(const float3 a)
 
 ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
 {
+#if defined(__KERNEL_METAL__)
+  return all(a == b);
+#else
   return a == b;
+#endif
 }
 
 ccl_device_inline float3 pow3(float3 v, float e)
diff --git a/intern/cycles/util/math_float4.h b/intern/cycles/util/math_float4.h
index c76959ee7ff..1203a10cca4 100644
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -27,6 +27,7 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float4 operator-(const float4 &a);
 ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
 ccl_device_inline float4 operator*(const float4 &a, float f);
@@ -65,6 +66,7 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
 ccl_device_inline float4 fabs(const float4 &a);
 ccl_device_inline float4 floor(const float4 &a);
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
+#endif /* !__KERNEL_METAL__ */
 
 ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
 
@@ -110,32 +112,33 @@ ccl_device_inline float4 one_float4()
   return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
 }
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline float4 operator-(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
   return float4(_mm_xor_ps(a.m128, mask));
-#else
+#  else
   return make_float4(-a.x, -a.y, -a.z, -a.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_mul_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator*(const float4 &a, float f)
 {
-#if defined(__KERNEL_SSE__)
+#  if defined(__KERNEL_SSE__)
   return a * make_float4(f);
-#else
+#  else
   return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator*(float f, const float4 &a)
@@ -150,11 +153,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)
 
 ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_div_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator+(const float4 &a, const float f)
@@ -164,11 +167,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)
 
 ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_add_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator-(const float4 &a, const float f)
@@ -178,11 +181,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)
 
 ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_sub_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
@@ -212,38 +215,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
 
 ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
-#else
+#  else
   return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
-#else
+#  else
   return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
-#else
+#  else
   return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline bool operator==(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-#else
+#  else
   return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float distance(const float4 &a, const float4 &b)
@@ -253,16 +256,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
 
 ccl_device_inline float dot(const float4 &a, const float4 &b)
 {
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
   __m128 t = vmulq_f32(a, b);
   return vaddvq_f32(t);
-#  else
+#    else
   return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-#  endif
-#else
+#    endif
+#  else
   return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float len_squared(const float4 &a)
@@ -272,21 +275,21 @@ ccl_device_inline float len_squared(const float4 &a)
 
 ccl_device_inline float4 rcp(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   /* Don't use _mm_rcp_ps due to poor precision. */
   return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-#else
+#  else
   return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 sqrt(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_sqrt_ps(a.m128));
-#else
+#  else
   return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 sqr(const float4 &a)
@@ -296,39 +299,39 @@ ccl_device_inline float4 sqr(const float4 &a)
 
 ccl_device_inline float4 cross(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
          (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-#else
+#  else
   return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-#endif
+#  endif
 }
 
 ccl_device_inline bool is_zero(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return a == make_float4(0.0f);
-#else
+#  else
   return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-#endif
+#  endif
 }
 
 ccl_device_inline float4 reduce_add(const float4 &a)
 {
-#if defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
   return float4(vdupq_n_f32(vaddvq_f32(a)));
-#  elif defined(__KERNEL_SSE3__)
+#    elif defined(__KERNEL_SSE3__)
   float4 h(_mm_hadd_ps(a.m128, a.m128));
   return float4(_mm_hadd_ps(h.m128, h.m128));
-#  else
+#    else
   float4 h(shuffle<1, 0, 3, 2>(a) + a);
   return shuffle<2, 3, 0, 1>(h) + h;
-#  endif
-#else
+#    endif
+#  else
   float sum = (a.x + a.y) + (a.z + a.w);
   return make_float4(sum, sum, sum, sum);
-#endif
+#  endif
 }
 
 ccl_device_inline float average(const float4 &a)
@@ -354,20 +357,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)
 
 ccl_device_inline float4 min(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_min_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 max(const float4 &a, const float4 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_max_ps(a.m128, b.m128));
-#else
+#  else
   return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
@@ -377,24 +380,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
 
 ccl_device_inline float4 fabs(const float4 &a)
 {
-#if defined(__KERNEL_SSE__)
-#  if defined(__KERNEL_NEON__)
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
   return float4(vabsq_f32(a));
-#  else
+#    else
   return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-#  endif
-#else
+#    endif
+#  else
   return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 floor(const float4 &a)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return float4(_mm_floor_ps(a));
-#else
+#  else
   return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-#endif
+#  endif
 }
 
 ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
@@ -402,6 +405,8 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
   return a + t * (b - a);
 }
 
+#endif /* !__KERNEL_METAL__ */
+
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4 &b)
diff --git a/intern/cycles/util/math_int2.h b/intern/cycles/util/math_int2.h
index 5b04be92152..39dc3b28f11 100644
--- a/intern/cycles/util/math_int2.h
+++ b/intern/cycles/util/math_int2.h
@@ -27,17 +27,20 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline bool operator==(const int2 a, const int2 b);
 ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
 ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
 ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
+#endif /* !__KERNEL_METAL__ */
 
 /*******************************************************************************
  * Definition.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline bool operator==(const int2 a, const int2 b)
 {
   return (a.x == b.x && a.y == b.y);
@@ -67,6 +70,7 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
 {
   return make_int2(a.x / b.x, a.y / b.y);
 }
+#endif /* !__KERNEL_METAL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/math_int3.h b/intern/cycles/util/math_int3.h
index 128f2cb53b8..a09c68ef49a 100644
--- a/intern/cycles/util/math_int3.h
+++ b/intern/cycles/util/math_int3.h
@@ -27,49 +27,52 @@ CCL_NAMESPACE_BEGIN
  * Declaration.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int3 min(int3 a, int3 b);
 ccl_device_inline int3 max(int3 a, int3 b);
 ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
 ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
+#endif /* !__KERNEL_METAL__ */
 
 /*******************************************************************************
  * Definition.
  */
 
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline int3 min(int3 a, int3 b)
 {
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
   return int3(_mm_min_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline int3 max(int3 a, int3 b)
 {
-#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#  if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
   return int3(_mm_max_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-#endif
+#  endif
 }
 
 ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return min(max(a, make_int3(mn)), make_int3(mx));
-#else
+#  else
   return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-#endif
+#  endif
 }
 
 ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return min(max(a, mn), make_int3(mx));
-#else
+#  else
   return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-#endif
+#  endif
 }
 
 ccl_device_inline bool operator==(const int3 &a, const int3 &b)
@@ -89,21 +92,22 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)
 
 ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int3(_mm_add_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-#endif
+#  endif
 }
 
 ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
 {
-#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE__
   return int3(_mm_sub_epi32(a.m128, b.m128));
-#else
+#  else
   return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-#endif
+#  endif
 }
+#endif /* !__KERNEL_METAL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/util/math_intersect.h b/intern/cycles/util/math_intersect.h
index 0c431a36afb..54ce3ab4b66 100644
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -88,29 +88,16 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
 ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
                                                    float3 ray_dir,
                                                    float ray_t,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                                                   const ssef *ssef_verts,
-#else
                                                    const float3 tri_a,
                                                    const float3 tri_b,
                                                    const float3 tri_c,
-#endif
                                                    ccl_private float *isect_u,
                                                    ccl_private float *isect_v,
                                                    ccl_private float *isect_t)
 {
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  typedef ssef float3;
-  const float3 tri_a(ssef_verts[0]);
-  const float3 tri_b(ssef_verts[1]);
-  const float3 tri_c(ssef_verts[2]);
-  const float3 P(ray_P);
-  const float3 dir(ray_dir);
-#else
-#  define dot3(a, b) dot(a, b)
+#define dot3(a, b) dot(a, b)
   const float3 P = ray_P;
   const float3 dir = ray_dir;
-#endif
 
   /* Calculate vertices relative to ray origin. */
   const float3 v0 = tri_c - P;
@@ -123,43 +110,16 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
   const float3 e2 = v1 - v2;
 
   /* Perform edge tests. */
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const float3 crossU = cross(v2 + v0, e0);
-  const float3 crossV = cross(v0 + v1, e1);
-  const float3 crossW = cross(v1 + v2, e2);
-
-  ssef crossX(crossU);
-  ssef crossY(crossV);
-  ssef crossZ(crossW);
-  ssef zero = _mm_setzero_ps();
-  _MM_TRANSPOSE4_PS(crossX, crossY, crossZ, zero);
-
-  const ssef dirX(ray_dir.x);
-  const ssef dirY(ray_dir.y);
-  const ssef dirZ(ray_dir.z);
-
-  ssef UVWW = madd(crossX, dirX, madd(crossY, dirY, crossZ * dirZ));
-#else  /* __KERNEL_SSE2__ */
   const float U = dot(cross(v2 + v0, e0), ray_dir);
   const float V = dot(cross(v0 + v1, e1), ray_dir);
   const float W = dot(cross(v1 + v2, e2), ray_dir);
-#endif /* __KERNEL_SSE2__ */
 
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  int uvw_sign = movemask(UVWW) & 0x7;
-  if (uvw_sign != 0) {
-    if (uvw_sign != 0x7) {
-      return false;
-    }
-  }
-#else
   const float minUVW = min(U, min(V, W));
   const float maxUVW = max(U, max(V, W));
 
   if (minUVW < 0.0f && maxUVW > 0.0f) {
     return false;
   }
-#endif
 
   /* Calculate geometry normal and denominator. */
   const float3 Ng1 = cross(e1, e0);
@@ -180,14 +140,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
   }
 
   const float inv_den = 1.0f / den;
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  UVWW *= inv_den;
-  _mm_store_ss(isect_u, UVWW);
-  _mm_store_ss(isect_v, shuffle<1, 1, 3, 3>(UVWW));
-#else
   *isect_u = U * inv_den;
   *isect_v = V * inv_den;
-#endif
   *isect_t = T * inv_den;
   return true;
 
diff --git a/intern/cycles/util/math_matrix.h b/intern/cycles/util/math_matrix.h
index bff7ddb4cee..c1be71517e3 100644
--- a/intern/cycles/util/math_matrix.h
+++ b/intern/cycles/util/math_matrix.h
@@ -162,7 +162,7 @@ ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A,
 {
   for (int row = 0; row < n; row++) {
     for (int col = 0; col <= row; col++) {
-      MATHS(A, row, col, 1) += v[row] * v[col] * weight;
+      atomic_add_and_fetch_float(&MATHS(A, row, col, 1), v[row] * v[col] * weight);
     }
   }
 }
diff --git a/intern/cycles/util/path.cpp b/intern/cycles/util/path.cpp
index 5704c4ef8ef..aad790482d5 100644
--- a/intern/cycles/util/path.cpp
+++ b/intern/cycles/util/path.cpp
@@ -313,7 +313,7 @@ static char *path_specials(const string &sub)
   if (env_shader_path != NULL && sub == "shader") {
     return env_shader_path;
   }
-  else if (env_shader_path != NULL && sub == "source") {
+  else if (env_source_path != NULL && sub == "source") {
     return env_source_path;
   }
   return NULL;
@@ -541,7 +541,7 @@ static string path_make_compatible(const string &path)
   if ((path.size() >= 3) && (path[0] == DIR_SEP) && (path[1] == DIR_SEP)) {
     result = path_cleanup_unc(result);
   }
-  /* Make sure volume-only path ends up wit ha directory separator. */
+  /* Make sure volume-only path ends up with a directory separator. */
   if (result.size() == 2 && result[1] == ':') {
     result += DIR_SEP;
   }
diff --git a/intern/cycles/util/progress.h b/intern/cycles/util/progress.h
index f2d80e49ab8..15bd26d34bf 100644
--- a/intern/cycles/util/progress.h
+++ b/intern/cycles/util/progress.h
@@ -207,7 +207,7 @@ class Progress {
     if (total_pixel_samples > 0) {
       return ((double)pixel_samples) / (double)total_pixel_samples;
     }
-    return 0.0f;
+    return 0.0;
   }
 
   void add_samples(uint64_t pixel_samples_, int tile_sample)
diff --git a/intern/cycles/util/ssef.h b/intern/cycles/util/ssef.h
index ea5e78b54d2..fc496e55a0c 100644
--- a/intern/cycles/util/ssef.h
+++ b/intern/cycles/util/ssef.h
@@ -906,7 +906,7 @@ __forceinline void store4f_nt(void *ptr, const ssef &v)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-/// Euclidian Space Operators
+/// Euclidean Space Operators
 ////////////////////////////////////////////////////////////////////////////////
 
 __forceinline float dot(const ssef &a, const ssef &b)
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index 7bfe747fcfb..1d78dfd1385 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -53,6 +53,15 @@ typedef struct DecomposedTransform {
 
 /* Functions */
 
+#ifdef __KERNEL_METAL__
+/* transform_point specialized for ccl_global */
+ccl_device_inline float3 transform_point(ccl_global const Transform *t, const float3 a)
+{
+  ccl_global const float3x3 &b(*(ccl_global const float3x3 *)t);
+  return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
+}
+#endif
+
 ccl_device_inline float3 transform_point(ccl_private const Transform *t, const float3 a)
 {
   /* TODO(sergey): Disabled for now, causes crashes in certain cases. */
@@ -73,6 +82,9 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f
   tmp += w;
 
   return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+  ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+  return (a * b).xyz + make_float3(t->x.w, t->y.w, t->z.w);
 #else
   float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z + t->x.w,
                          a.x * t->y.x + a.y * t->y.y + a.z * t->y.z + t->y.w,
@@ -99,6 +111,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con
   tmp = madd(shuffle<2>(aa), z, tmp);
 
   return float3(tmp.m128);
+#elif defined(__KERNEL_METAL__)
+  ccl_private const float3x3 &b(*(ccl_private const float3x3 *)t);
+  return (a * b).xyz;
 #else
   float3 c = make_float3(a.x * t->x.x + a.y * t->x.y + a.z * t->x.z,
                          a.x * t->y.x + a.y * t->y.y + a.z * t->y.z,
@@ -450,8 +465,8 @@ ccl_device_inline void transform_compose(ccl_private Transform *tfm,
 }
 
 /* Interpolate from array of decomposed transforms. */
-ccl_device void transform_motion_array_interpolate(Transform *tfm,
-                                                   const DecomposedTransform *motion,
+ccl_device void transform_motion_array_interpolate(ccl_private Transform *tfm,
+                                                   ccl_global const DecomposedTransform *motion,
                                                    uint numsteps,
                                                    float time)
 {
@@ -460,8 +475,8 @@ ccl_device void transform_motion_array_interpolate(Transform *tfm,
   int step = min((int)(time * maxstep), maxstep - 1);
   float t = time * maxstep - step;
 
-  const DecomposedTransform *a = motion + step;
-  const DecomposedTransform *b = motion + step + 1;
+  ccl_global const DecomposedTransform *a = motion + step;
+  ccl_global const DecomposedTransform *b = motion + step + 1;
 
   /* Interpolate rotation, translation and scale. */
   DecomposedTransform decomp;
diff --git a/intern/cycles/util/types.h b/intern/cycles/util/types.h
index 697dc2b44ea..58a6d134819 100644
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -17,7 +17,9 @@
 #ifndef __UTIL_TYPES_H__
 #define __UTIL_TYPES_H__
 
-#include <stdlib.h>
+#if !defined(__KERNEL_METAL__)
+#  include <stdlib.h>
+#endif
 
 /* Standard Integer Types */
 
diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h
index f990367e7b8..cafcfebf526 100644
--- a/intern/cycles/util/types_float3.h
+++ b/intern/cycles/util/types_float3.h
@@ -55,6 +55,41 @@ ccl_device_inline float3 make_float3(float x, float y, float z);
 ccl_device_inline void print_float3(const char *label, const float3 &a);
 #endif /* __KERNEL_GPU__ */
 
+/* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
+ * CPU SIMD instructions can be used. */
+#if defined(__KERNEL_METAL__)
+/* Metal has native packed_float3. */
+#elif defined(__KERNEL_CUDA__)
+/* CUDA float3 is already packed. */
+typedef float3 packed_float3;
+#else
+/* HIP float3 is not packed (https://github.com/ROCm-Developer-Tools/HIP/issues/706). */
+struct packed_float3 {
+  ccl_device_inline_method packed_float3(){};
+
+  ccl_device_inline_method packed_float3(const float3 &a) : x(a.x), y(a.y), z(a.z)
+  {
+  }
+
+  ccl_device_inline_method operator float3() const
+  {
+    return make_float3(x, y, z);
+  }
+
+  ccl_device_inline_method packed_float3 &operator=(const float3 &a)
+  {
+    x = a.x;
+    y = a.y;
+    z = a.z;
+    return *this;
+  }
+
+  float x, y, z;
+};
+#endif
+
+static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes");
+
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_TYPES_FLOAT3_H__ */
diff --git a/intern/ghost/GHOST_C-api.h b/intern/ghost/GHOST_C-api.h
index 784febe8581..98094cc0669 100644
--- a/intern/ghost/GHOST_C-api.h
+++ b/intern/ghost/GHOST_C-api.h
@@ -729,13 +729,6 @@ extern GHOST_TSuccess GHOST_ReleaseOpenGLContext(GHOST_ContextHandle contexthand
 extern unsigned int GHOST_GetContextDefaultOpenGLFramebuffer(GHOST_ContextHandle contexthandle);
 
 /**
- * Returns whether a context is rendered upside down compared to OpenGL. This only needs to be
- * called if there's a non-OpenGL context, which is really the exception.
- * So generally, this does not need to be called.
- */
-extern int GHOST_isUpsideDownContext(GHOST_ContextHandle contexthandle);
-
-/**
  * Get the OpenGL frame-buffer handle that serves as a default frame-buffer.
  */
 extern unsigned int GHOST_GetDefaultOpenGLFramebuffer(GHOST_WindowHandle windwHandle);
diff --git a/intern/ghost/GHOST_Types.h b/intern/ghost/GHOST_Types.h
index 2c8014a08cc..ce0185bc7d0 100644
--- a/intern/ghost/GHOST_Types.h
+++ b/intern/ghost/GHOST_Types.h
@@ -654,8 +654,8 @@ enum {
   GHOST_kXrContextDebug = (1 << 0),
   GHOST_kXrContextDebugTime = (1 << 1),
 #  ifdef WIN32
-  /* Needed to avoid issues with the SteamVR OpenGL graphics binding (use DirectX fallback
-     instead). */
+  /* Needed to avoid issues with the SteamVR OpenGL graphics binding
+   * (use DirectX fallback instead). */
   GHOST_kXrContextGpuNVIDIA = (1 << 2),
 #  endif
 };
diff --git a/intern/ghost/intern/GHOST_SystemCocoa.mm b/intern/ghost/intern/GHOST_SystemCocoa.mm
index 204bbdaec50..b92c3e73a88 100644
--- a/intern/ghost/intern/GHOST_SystemCocoa.mm
+++ b/intern/ghost/intern/GHOST_SystemCocoa.mm
@@ -1245,7 +1245,7 @@ GHOST_TSuccess GHOST_SystemCocoa::handleDraggingEvent(GHOST_TEventType eventType
 
             /* Convert the image in a RGBA 32bit format */
             /* As Core Graphics does not support contexts with non premutliplied alpha,
-             we need to get alpha key values in a separate batch */
+             * we need to get alpha key values in a separate batch */
 
             /* First get RGB values w/o Alpha to avoid pre-multiplication,
              * 32bit but last byte is unused */
@@ -1479,8 +1479,8 @@ GHOST_TSuccess GHOST_SystemCocoa::handleMouseEvent(void *eventPtr)
   CocoaWindow *cocoawindow;
 
   /* [event window] returns other windows if mouse-over, that's OSX input standard
-     however, if mouse exits window(s), the windows become inactive, until you click.
-     We then fall back to the active window from ghost */
+   * however, if mouse exits window(s), the windows become inactive, until you click.
+   * We then fall back to the active window from ghost. */
   window = (GHOST_WindowCocoa *)m_windowManager->getWindowAssociatedWithOSWindow(
       (void *)[event window]);
   if (!window) {
diff --git a/intern/ghost/intern/GHOST_XrAction.cpp b/intern/ghost/intern/GHOST_XrAction.cpp
index 704b1ce9fac..f51f98c9b3d 100644
--- a/intern/ghost/intern/GHOST_XrAction.cpp
+++ b/intern/ghost/intern/GHOST_XrAction.cpp
@@ -216,8 +216,9 @@ GHOST_XrAction::GHOST_XrAction(XrInstance instance,
 
   XrActionCreateInfo action_info{XR_TYPE_ACTION_CREATE_INFO};
   strcpy(action_info.actionName, info.name);
-  strcpy(action_info.localizedActionName, info.name); /* Just use same name for localized. This can
-                                                         be changed in the future if necessary. */
+
+  /* Just use same name for localized. This can be changed in the future if necessary. */
+  strcpy(action_info.localizedActionName, info.name);
 
   switch (info.type) {
     case GHOST_kXrActionTypeBooleanInput:
diff --git a/intern/ghost/intern/GHOST_XrControllerModel.cpp b/intern/ghost/intern/GHOST_XrControllerModel.cpp
index ae15bf11aa0..27f92ffe7c5 100644
--- a/intern/ghost/intern/GHOST_XrControllerModel.cpp
+++ b/intern/ghost/intern/GHOST_XrControllerModel.cpp
@@ -97,8 +97,8 @@ static void read_vertices(const tinygltf::Accessor &accessor,
   validate_accessor(accessor, buffer_view, buffer, stride, packed_size);
 
   /* Resize the vertices vector, if necessary, to include room for the attribute data.
-     If there are multiple attributes for a primitive, the first one will resize, and the
-     subsequent will not need to. */
+   * If there are multiple attributes for a primitive, the first one will resize, and the
+   * subsequent will not need to. */
   primitive.vertices.resize(accessor.count);
 
   /* Copy the attribute value over from the glTF buffer into the appropriate vertex field. */
@@ -147,9 +147,9 @@ static void read_indices(const tinygltf::Accessor &accessor,
                          const tinygltf::Buffer &buffer,
                          GHOST_XrPrimitive &primitive)
 {
-  if (buffer_view.target != TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER &&
-      buffer_view.target != 0) { /* Allow 0 (not specified) even though spec doesn't seem to allow
-                                    this (BoomBox GLB fails). */
+
+  /* Allow 0 (not specified) even though spec doesn't seem to allow this (BoomBox GLB fails). */
+  if (buffer_view.target != TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER && buffer_view.target != 0) {
     throw GHOST_XrException(
         "glTF: Accessor for indices uses bufferview with invalid 'target' type.");
   }
@@ -164,8 +164,8 @@ static void read_indices(const tinygltf::Accessor &accessor,
 
   validate_accessor(accessor, buffer_view, buffer, component_size_bytes, component_size_bytes);
 
-  if ((accessor.count % 3) != 0) { /* Since only triangles are supported, enforce that the number
-                                      of indices is divisible by 3. */
+  /* Since only triangles are supported, enforce that the number of indices is divisible by 3. */
+  if ((accessor.count % 3) != 0) {
     throw GHOST_XrException("glTF: Unexpected number of indices for triangle primitive");
   }
 
diff --git a/intern/guardedalloc/MEM_guardedalloc.h b/intern/guardedalloc/MEM_guardedalloc.h
index 713b1fac788..874abb88ff5 100644
--- a/intern/guardedalloc/MEM_guardedalloc.h
+++ b/intern/guardedalloc/MEM_guardedalloc.h
@@ -78,7 +78,8 @@ extern short (*MEM_testN)(void *vmemh);
 
 /**
  * Duplicates a block of memory, and returns a pointer to the
- * newly allocated block. */
+ * newly allocated block.
+ * NULL-safe; will return NULL when receiving a NULL pointer. */
 extern void *(*MEM_dupallocN)(const void *vmemh) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT;
 
 /**
diff --git a/intern/locale/boost_locale_wrapper.cpp b/intern/locale/boost_locale_wrapper.cpp
index ede9377b38f..444b51b5e04 100644
--- a/intern/locale/boost_locale_wrapper.cpp
+++ b/intern/locale/boost_locale_wrapper.cpp
@@ -26,8 +26,8 @@ static std::string messages_path;
 static std::string default_domain;
 static std::string locale_str;
 
-/* Note: We cannot use short stuff like boost::locale::gettext, because those return
- * std::basic_string objects, which c_ptr()-returned char* is no more valid
+/* NOTE: We cannot use short stuff like `boost::locale::gettext`, because those return
+ * `std::basic_string` objects, whose `c_str()`-returned `char *` is no longer valid
  * once deleted (which happens as soons they are out of scope of this func). */
 typedef boost::locale::message_format<char> char_message_facet;
 static std::locale locale_global;
@@ -63,7 +63,7 @@ static void bl_locale_global_cache()
 
 void bl_locale_init(const char *_messages_path, const char *_default_domain)
 {
-  // Avoid using ICU backend, we do not need its power and it's rather heavy!
+  /* Avoid using ICU backend, we do not need its power and it's rather heavy! */
   boost::locale::localization_backend_manager lman =
       boost::locale::localization_backend_manager::global();
 #if defined(_WIN32)
@@ -81,7 +81,7 @@ void bl_locale_set(const char *locale)
 {
   boost::locale::generator gen;
   std::locale _locale;
-  // Specify location of dictionaries.
+  /* Specify location of dictionaries. */
   gen.add_messages_path(messages_path);
   gen.add_messages_domain(default_domain);
   // gen.set_default_messages_domain(default_domain);
@@ -99,12 +99,12 @@ void bl_locale_set(const char *locale)
 #endif
     }
     std::locale::global(_locale);
-    // Note: boost always uses "C" LC_NUMERIC by default!
+    /* NOTE: boost always uses "C" LC_NUMERIC by default! */
 
     bl_locale_global_cache();
 
-    // Generate the locale string
-    // (useful to know which locale we are actually using in case of "default" one).
+    /* Generate the locale string
+     * (useful to know which locale we are actually using in case of "default" one). */
 #define LOCALE_INFO std::use_facet<boost::locale::info>(_locale)
 
     locale_str = LOCALE_INFO.language();
@@ -117,10 +117,9 @@ void bl_locale_set(const char *locale)
 
 #undef LOCALE_INFO
   }
-  // Extra catch on `std::runtime_error` is needed for macOS/Clang as it seems that exceptions
-  // like `boost::locale::conv::conversion_error` (which inherit from `std::runtime_error`) are
-  // not caught by their ancestor `std::exception`. See
-  // https://developer.blender.org/T88877#1177108 .
+  /* Extra catch on `std::runtime_error` is needed for macOS/Clang as it seems that exceptions
+   * like `boost::locale::conv::conversion_error` (which inherit from `std::runtime_error`) are
+   * not caught by their ancestor `std::exception`. See T88877#1177108. */
   catch (std::runtime_error const &e) {
     std::cout << "bl_locale_set(" << locale << "): " << e.what() << " \n";
   }
diff --git a/intern/sky/include/sky_model.h b/intern/sky/include/sky_model.h
index 983b90fed35..752b5c13785 100644
--- a/intern/sky/include/sky_model.h
+++ b/intern/sky/include/sky_model.h
@@ -133,7 +133,7 @@ function which generates skydome states for different solar emission spectra
 and solar radii: 'arhosekskymodelstate_alienworld_alloc_init()'.
 
 See the notes about the "Alien World" functionality provided further down for a
-discussion of the usefulness and limits of that second initalisation function.
+discussion of the usefulness and limits of that second initialisation function.
 Sky model states that have been initialized with either function behave in a
 completely identical fashion during use and cleanup.
 
@@ -368,7 +368,7 @@ SKY_ArHosekSkyModelState *SKY_arhosekskymodelstate_alloc_init(const double solar
     with a sun of a surface temperature given in 'kelvin'. The parameter
     'solar_intensity' controls the overall brightness of the sky, relative
     to the solar irradiance on Earth. A value of 1.0 yields a sky dome that
-    is, on average over the wavelenghts covered in the model (!), as bright
+    is, on average over the wavelengths covered in the model (!), as bright
     as the terrestrial sky in radiometric terms.
 
     Which means that the solar radius has to be adjusted, since the