24 files changed, 1502 insertions, 95 deletions
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h
index de916be24e7..4e30563e21b 100644
--- a/intern/cycles/kernel/integrator/integrator_init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -16,11 +16,14 @@
 
 #pragma once
 
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_camera.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_random.h"
+#include "kernel/camera/camera.h"
+
+#include "kernel/film/accumulate.h"
+#include "kernel/film/adaptive_sampling.h"
+
+#include "kernel/integrator/path_state.h"
+
+#include "kernel/sample/pattern.h"
 
 #include "kernel/geom/geom.h"
 
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h
index 5bab6b2e2fd..f0ba77bd9a6 100644
--- a/intern/cycles/kernel/integrator/integrator_init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -16,12 +16,15 @@
 
 #pragma once
 
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_camera.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_shadow_catcher.h"
+#include "kernel/camera/camera.h"
+
+#include "kernel/film/accumulate.h"
+#include "kernel/film/adaptive_sampling.h"
+
+#include "kernel/integrator/path_state.h"
+#include "kernel/integrator/shadow_catcher.h"
+
+#include "kernel/sample/pattern.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h
index c1315d48694..7fb88fc2804 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -16,11 +16,14 @@
 
 #pragma once
 
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_shadow_catcher.h"
+#include "kernel/camera/projection.h"
+
+#include "kernel/integrator/path_state.h"
+#include "kernel/integrator/shadow_catcher.h"
+
+#include "kernel/light/light.h"
+
+#include "kernel/util/differential.h"
 
 #include "kernel/geom/geom.h"
 
@@ -156,9 +159,11 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg, IntegratorState s
   if (path_state_ao_bounce(kg, state)) {
     ray.t = kernel_data.integrator.ao_bounces_distance;
 
-    const float object_ao_distance = kernel_tex_fetch(__objects, last_isect_object).ao_distance;
-    if (object_ao_distance != 0.0f) {
-      ray.t = object_ao_distance;
+    if (last_isect_object != OBJECT_NONE) {
+      const float object_ao_distance = kernel_tex_fetch(__objects, last_isect_object).ao_distance;
+      if (object_ao_distance != 0.0f) {
+        ray.t = object_ao_distance;
+      }
     }
   }
 
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/intersect_shadow.h
index 90422445fad..90422445fad 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/intersect_shadow.h
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h b/intern/cycles/kernel/integrator/intersect_subsurface.h
index b575e7fd1e6..27b8e1e5f5a 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
+++ b/intern/cycles/kernel/integrator/intersect_subsurface.h
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "kernel/integrator/integrator_subsurface.h"
+#include "kernel/integrator/subsurface.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/intersect_volume_stack.h
index 7def3e2f3f3..1c91318ff9c 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -18,8 +18,8 @@
 
 #include "kernel/bvh/bvh.h"
 #include "kernel/geom/geom.h"
-#include "kernel/integrator/integrator_volume_stack.h"
-#include "kernel/kernel_shader.h"
+#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/volume_stack.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/megakernel.h
index 21a483a792b..d8cc794dc7a 100644
--- a/intern/cycles/kernel/integrator/integrator_megakernel.h
+++ b/intern/cycles/kernel/integrator/megakernel.h
@@ -16,16 +16,16 @@
 
 #pragma once
 
-#include "kernel/integrator/integrator_init_from_camera.h"
-#include "kernel/integrator/integrator_intersect_closest.h"
-#include "kernel/integrator/integrator_intersect_shadow.h"
-#include "kernel/integrator/integrator_intersect_subsurface.h"
-#include "kernel/integrator/integrator_intersect_volume_stack.h"
-#include "kernel/integrator/integrator_shade_background.h"
-#include "kernel/integrator/integrator_shade_light.h"
-#include "kernel/integrator/integrator_shade_shadow.h"
-#include "kernel/integrator/integrator_shade_surface.h"
-#include "kernel/integrator/integrator_shade_volume.h"
+#include "kernel/integrator/init_from_camera.h"
+#include "kernel/integrator/intersect_closest.h"
+#include "kernel/integrator/intersect_shadow.h"
+#include "kernel/integrator/intersect_subsurface.h"
+#include "kernel/integrator/intersect_volume_stack.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_surface.h"
+#include "kernel/integrator/shade_volume.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/path_state.h b/intern/cycles/kernel/integrator/path_state.h
new file mode 100644
index 00000000000..8311b97dedb
--- /dev/null
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/sample/pattern.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Initialize queues by zeroing `queued_kernel`, so that this path is considered terminated; on
+ * CPU the `shadow` and `ao` shadow-path states are zeroed as well. Used for early outputs in the
+ * camera ray initialization, as well as initialization of split states for shadow catcher. */
+ccl_device_inline void path_state_init_queues(IntegratorState state)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+#ifdef __KERNEL_CPU__
+  INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
+  INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
+#endif
+}
+
+/* Bare-minimum setup of the path state: store the render pixel index and reset the queues. This
+ * is what early outputs during integrator initialization need in order to work. */
+ccl_device_inline void path_state_init(IntegratorState state,
+                                       ccl_global const KernelWorkTile *ccl_restrict tile,
+                                       const int x,
+                                       const int y)
+{
+  /* Index of pixel (x, y), computed from the offset and stride of the tile. */
+  const uint pixel_index = (uint)tile->offset + x + y * tile->stride;
+  INTEGRATOR_STATE_WRITE(state, path, render_pixel_index) = pixel_index;
+
+  path_state_init_queues(state);
+}
+
+/* Initialize the remaining path state (bounces, RNG, flags, MIS, throughput) for integration. */
+ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
+                                                  IntegratorState state,
+                                                  const int sample,
+                                                  const uint rng_hash)
+{
+  INTEGRATOR_STATE_WRITE(state, path, sample) = sample;
+  INTEGRATOR_STATE_WRITE(state, path, bounce) = 0;
+  INTEGRATOR_STATE_WRITE(state, path, diffuse_bounce) = 0;
+  INTEGRATOR_STATE_WRITE(state, path, glossy_bounce) = 0;
+  INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = 0;
+  INTEGRATOR_STATE_WRITE(state, path, transparent_bounce) = 0;
+  INTEGRATOR_STATE_WRITE(state, path, volume_bounce) = 0;
+  INTEGRATOR_STATE_WRITE(state, path, volume_bounds_bounce) = 0;
+  INTEGRATOR_STATE_WRITE(state, path, rng_hash) = rng_hash;
+  INTEGRATOR_STATE_WRITE(state, path, rng_offset) = PRNG_BASE_NUM;
+  INTEGRATOR_STATE_WRITE(state, path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
+                                              PATH_RAY_TRANSPARENT_BACKGROUND;
+  INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = FLT_MAX;
+  INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
+  /* Volume stack starts with the background volume shader, followed by a SHADER_NONE entry. */
+  if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+    INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, 0, object) = OBJECT_NONE;
+    INTEGRATOR_STATE_ARRAY_WRITE(
+        state, volume_stack, 0, shader) = kernel_data.background.volume_shader;
+    INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, 1, object) = OBJECT_NONE;
+    INTEGRATOR_STATE_ARRAY_WRITE(state, volume_stack, 1, shader) = SHADER_NONE;
+  }
+  /* Denoising features start with a throughput of one and a dedicated path flag. */
+#ifdef __DENOISING_FEATURES__
+  if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) {
+    INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_DENOISING_FEATURES;
+    INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) = one_float3();
+  }
+#endif
+}
+
+ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state, int label)
+{
+  uint32_t flag = INTEGRATOR_STATE(state, path, flag);
+
+  /* Update flags, bounce counters and RNG offset for the scattering event given by `label`.
+   * A transparent bounce keeps the flags of the previous ray and has its own bounce limit. */
+  if (label & LABEL_TRANSPARENT) {
+    uint32_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce) + 1;
+
+    flag |= PATH_RAY_TRANSPARENT;
+    if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+      flag |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
+    }
+
+    if (!kernel_data.integrator.transparent_shadows)
+      flag |= PATH_RAY_MIS_SKIP;
+
+    INTEGRATOR_STATE_WRITE(state, path, flag) = flag;
+    INTEGRATOR_STATE_WRITE(state, path, transparent_bounce) = transparent_bounce;
+    /* Advance the RNG offset to the dimensions of the next bounce. */
+    INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;
+    return;
+  }
+
+  uint32_t bounce = INTEGRATOR_STATE(state, path, bounce) + 1;
+  if (bounce >= kernel_data.integrator.max_bounce) {
+    flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+  }
+  /* Clear visibility and MIS skip bits, the branches below set the ones matching `label`. */
+  flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
+
+#ifdef __VOLUME__
+  if (label & LABEL_VOLUME_SCATTER) {
+    /* Volume scatter. */
+    flag |= PATH_RAY_VOLUME_SCATTER;
+    flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+    if (bounce == 1) {
+      flag |= PATH_RAY_VOLUME_PASS;
+    }
+
+    const int volume_bounce = INTEGRATOR_STATE(state, path, volume_bounce) + 1;
+    INTEGRATOR_STATE_WRITE(state, path, volume_bounce) = volume_bounce;
+    if (volume_bounce >= kernel_data.integrator.max_volume_bounce) {
+      flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+    }
+  }
+  else
+#endif
+  {
+    /* Surface reflection/transmission. */
+    if (label & LABEL_REFLECT) {
+      flag |= PATH_RAY_REFLECT;
+      flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+
+      if (label & LABEL_DIFFUSE) {
+        const int diffuse_bounce = INTEGRATOR_STATE(state, path, diffuse_bounce) + 1;
+        INTEGRATOR_STATE_WRITE(state, path, diffuse_bounce) = diffuse_bounce;
+        if (diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
+          flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+        }
+      }
+      else {
+        const int glossy_bounce = INTEGRATOR_STATE(state, path, glossy_bounce) + 1;
+        INTEGRATOR_STATE_WRITE(state, path, glossy_bounce) = glossy_bounce;
+        if (glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
+          flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+        }
+      }
+    }
+    else {
+      kernel_assert(label & LABEL_TRANSMIT);
+
+      flag |= PATH_RAY_TRANSMIT;
+
+      if (!(label & LABEL_TRANSMIT_TRANSPARENT)) {
+        flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+      }
+
+      const int transmission_bounce = INTEGRATOR_STATE(state, path, transmission_bounce) + 1;
+      INTEGRATOR_STATE_WRITE(state, path, transmission_bounce) = transmission_bounce;
+      if (transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
+        flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+      }
+    }
+
+    /* Diffuse/glossy/singular. */
+    if (label & LABEL_DIFFUSE) {
+      flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
+    }
+    else if (label & LABEL_GLOSSY) {
+      flag |= PATH_RAY_GLOSSY;
+    }
+    else {
+      kernel_assert(label & LABEL_SINGULAR);
+      flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+    }
+
+    /* Render pass categories, decided by the first bounce. */
+    if (bounce == 1) {
+      flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
+    }
+  }
+
+  INTEGRATOR_STATE_WRITE(state, path, flag) = flag;
+  INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce;
+
+  /* Advance the RNG offset to the dimensions of the next bounce. */
+  INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;
+}
+
+#ifdef __VOLUME__
+/* Step through a volume bounding mesh without counting a transparent bounce. Returns false after
+ * more than VOLUME_BOUNDS_MAX steps, a sanity check in case self intersection gets us stuck. */
+ccl_device_inline bool path_state_volume_next(IntegratorState state)
+{
+  const uint32_t num_bounds_bounces = INTEGRATOR_STATE(state, path, volume_bounds_bounce) + 1;
+  INTEGRATOR_STATE_WRITE(state, path, volume_bounds_bounce) = num_bounds_bounces;
+  if (num_bounds_bounces > VOLUME_BOUNDS_MAX) {
+    return false;
+  }
+
+  /* The first step keeps the RNG offset, later ones advance it to the next bounce. */
+  if (num_bounds_bounces > 1) {
+    INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;
+  }
+
+  return true;
+}
+#endif
+
+/* Compute the ray visibility bits from the flags of the path. */
+ccl_device_inline uint path_state_ray_visibility(ConstIntegratorState state)
+{
+  const uint32_t path_flags = INTEGRATOR_STATE(state, path, flag);
+  uint32_t ray_visibility = path_flags & PATH_RAY_ALL_VISIBILITY;
+
+  /* Diffuse and glossy visibility only apply to reflection, drop them for transmission. */
+  if (ray_visibility & PATH_RAY_TRANSMIT) {
+    ray_visibility &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
+  }
+
+  /* TODO: volume scatter is not supported as its own ray visibility yet, use diffuse. */
+  if (path_flags & PATH_RAY_VOLUME_SCATTER) {
+    ray_visibility |= PATH_RAY_DIFFUSE;
+  }
+
+  ray_visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flags, ray_visibility);
+
+  return ray_visibility;
+}
+
+/* Probability with which the path is continued by Russian roulette (RR). */
+ccl_device_inline float path_state_continuation_probability(KernelGlobals kg,
+                                                            ConstIntegratorState state,
+                                                            const uint32_t path_flag)
+{
+  /* Do at least the specified number of (transparent) bounces without RR. */
+  bool within_min_bounces;
+  if (path_flag & PATH_RAY_TRANSPARENT) {
+    const uint32_t num_bounces = INTEGRATOR_STATE(state, path, transparent_bounce);
+    within_min_bounces = (num_bounces <= kernel_data.integrator.transparent_min_bounce);
+  }
+  else {
+    const uint32_t num_bounces = INTEGRATOR_STATE(state, path, bounce);
+    within_min_bounces = (num_bounces <= kernel_data.integrator.min_bounce);
+  }
+  if (within_min_bounces) {
+    return 1.0f;
+  }
+
+  /* Probabilistic termination: the sqrt() roughly matches a typical view transform and makes
+   * path termination happen a bit later on average. */
+  return min(sqrtf(max3(fabs(INTEGRATOR_STATE(state, path, throughput)))), 1.0f);
+}
+
+/* True when bounces, minus transmission and one glossy bounce, go past `ao_bounces`. */
+ccl_device_inline bool path_state_ao_bounce(KernelGlobals kg, ConstIntegratorState state)
+{
+  if (kernel_data.integrator.ao_bounces == 0) {
+    return false;
+  }
+  const int num_counted = INTEGRATOR_STATE(state, path, bounce) -
+                          INTEGRATOR_STATE(state, path, transmission_bounce) -
+                          (INTEGRATOR_STATE(state, path, glossy_bounce) > 0) + 1;
+  return num_counted > kernel_data.integrator.ao_bounces;
+}
+
+/* Random Number Sampling Utility Functions
+ *
+ * For each random number in each step of the path we must have a unique
+ * dimension to avoid using the same sequence twice.
+ *
+ * For branches in the path we must be careful not to reuse the same number
+ * in a sequence and offset accordingly.
+ */
+
+/* RNG state loaded onto stack: a snapshot of the RNG fields stored in the (shadow) path state. */
+typedef struct RNGState {
+  uint rng_hash;
+  uint rng_offset;
+  int sample;
+} RNGState;
+/* Load the RNG hash, offset and sample of the path state into `rng_state`. */
+ccl_device_inline void path_state_rng_load(ConstIntegratorState state,
+                                           ccl_private RNGState *rng_state)
+{
+  rng_state->rng_hash = INTEGRATOR_STATE(state, path, rng_hash);
+  rng_state->rng_offset = INTEGRATOR_STATE(state, path, rng_offset);
+  rng_state->sample = INTEGRATOR_STATE(state, path, sample);
+}
+/* Load the RNG hash, offset and sample of the shadow path state into `rng_state`. */
+ccl_device_inline void shadow_path_state_rng_load(ConstIntegratorShadowState state,
+                                                  ccl_private RNGState *rng_state)
+{
+  rng_state->rng_hash = INTEGRATOR_STATE(state, shadow_path, rng_hash);
+  rng_state->rng_offset = INTEGRATOR_STATE(state, shadow_path, rng_offset);
+  rng_state->sample = INTEGRATOR_STATE(state, shadow_path, sample);
+}
+/* Get a 1D random number for the given dimension, which is offset by the state's `rng_offset`. */
+ccl_device_inline float path_state_rng_1D(KernelGlobals kg,
+                                          ccl_private const RNGState *rng_state,
+                                          int dimension)
+{
+  const uint rng_dimension = rng_state->rng_offset + dimension;
+  return path_rng_1D(kg, rng_state->rng_hash, rng_state->sample, rng_dimension);
+}
+/* Get a 2D random number in `fx`/`fy` for the given dimension, offset by `rng_offset`. */
+ccl_device_inline void path_state_rng_2D(KernelGlobals kg,
+                                         ccl_private const RNGState *rng_state,
+                                         int dimension,
+                                         ccl_private float *fx,
+                                         ccl_private float *fy)
+{
+  const uint rng_dimension = rng_state->rng_offset + dimension;
+  path_rng_2D(kg, rng_state->rng_hash, rng_state->sample, rng_dimension, fx, fy);
+}
+
+ccl_device_inline float path_state_rng_1D_hash(KernelGlobals kg,
+                                               ccl_private const RNGState *rng_state,
+                                               uint hash)
+{
+  /* Re-hash `rng_hash` with `hash` and sample at the plain RNG offset instead of using a
+   * dedicated dimension. This is not great, but avoids adding more dimensions to each bounce
+   * which would reduce the quality of the dimensions already in use. */
+  return path_rng_1D(
+      kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset);
+}
+
+/* Like path_state_rng_1D(), but for branch number `branch` out of `num_branches`: every branch
+ * of a sample uses its own sample index, `sample * num_branches + branch`. */
+ccl_device_inline float path_branched_rng_1D(KernelGlobals kg,
+                                             ccl_private const RNGState *rng_state,
+                                             int branch,
+                                             int num_branches,
+                                             int dimension)
+{
+  const int branch_sample = rng_state->sample * num_branches + branch;
+  return path_rng_1D(kg, rng_state->rng_hash, branch_sample, rng_state->rng_offset + dimension);
+}
+
+/* Like path_state_rng_2D(), but for branch number `branch` out of `num_branches`: every branch
+ * of a sample uses its own sample index, `sample * num_branches + branch`. */
+ccl_device_inline void path_branched_rng_2D(KernelGlobals kg,
+                                            ccl_private const RNGState *rng_state,
+                                            int branch,
+                                            int num_branches,
+                                            int dimension,
+                                            ccl_private float *fx,
+                                            ccl_private float *fy)
+{
+  const int branch_sample = rng_state->sample * num_branches + branch;
+  const uint rng_dimension = rng_state->rng_offset + dimension;
+
+  path_rng_2D(kg, rng_state->rng_hash, branch_sample, rng_dimension, fx, fy);
+}
+
+/* Random number for light termination. It is only sampled when `light_inv_rr_threshold` is
+ * positive, since it might not be needed in many cases; otherwise 0 is returned. */
+ccl_device_inline float path_state_rng_light_termination(KernelGlobals kg,
+                                                         ccl_private const RNGState *state)
+{
+  const bool use_light_termination = (kernel_data.integrator.light_inv_rr_threshold > 0.0f);
+  if (!use_light_termination) {
+    return 0.0f;
+  }
+  return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/shade_background.h
index 287c54d7243..71a590749bd 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -16,10 +16,11 @@
 
 #pragma once
 
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_shader.h"
+#include "kernel/film/accumulate.h"
+#include "kernel/integrator/shader_eval.h"
+#include "kernel/light/light.h"
+#include "kernel/light/sample.h"
+#include "kernel/sample/mis.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_shade_light.h b/intern/cycles/kernel/integrator/shade_light.h
index 4f0f5a39756..7dad3b4e49d 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_shader.h"
+#include "kernel/film/accumulate.h"
+#include "kernel/integrator/shader_eval.h"
+#include "kernel/light/light.h"
+#include "kernel/light/sample.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h
index a82254e1dea..1de890aae29 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -16,10 +16,9 @@
 
 #pragma once
 
-#include "kernel/integrator/integrator_shade_volume.h"
-#include "kernel/integrator/integrator_volume_stack.h"
-
-#include "kernel/kernel_shader.h"
+#include "kernel/integrator/shade_volume.h"
+#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/volume_stack.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 3724b05c6b0..cce591eb219 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -16,15 +16,18 @@
 
 #pragma once
 
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_passes.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shader.h"
+#include "kernel/film/accumulate.h"
+#include "kernel/film/passes.h"
 
-#include "kernel/integrator/integrator_subsurface.h"
-#include "kernel/integrator/integrator_volume_stack.h"
+#include "kernel/integrator/path_state.h"
+#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/subsurface.h"
+#include "kernel/integrator/volume_stack.h"
+
+#include "kernel/light/light.h"
+#include "kernel/light/sample.h"
+
+#include "kernel/sample/mis.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -325,17 +328,25 @@ ccl_device_forceinline bool integrate_surface_volume_only_bounce(IntegratorState
 #endif
 
 #if defined(__AO__)
-ccl_device_forceinline void integrate_surface_ao_pass(
-    KernelGlobals kg,
-    IntegratorState state,
-    ccl_private const ShaderData *ccl_restrict sd,
-    ccl_private const RNGState *ccl_restrict rng_state,
-    ccl_global float *ccl_restrict render_buffer)
+ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 ccl_private const ShaderData *ccl_restrict sd,
+                                                 ccl_private const RNGState *ccl_restrict
+                                                     rng_state,
+                                                 ccl_global float *ccl_restrict render_buffer)
 {
+  if (!(kernel_data.kernel_features & KERNEL_FEATURE_AO_ADDITIVE) &&
+      !(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
+    return;
+  }
+
   float bsdf_u, bsdf_v;
   path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
-  const float3 ao_N = shader_bsdf_ao_normal(kg, sd);
+  float3 ao_N;
+  const float3 ao_weight = shader_bsdf_ao(
+      kg, sd, kernel_data.integrator.ao_additive_factor, &ao_N);
+
   float3 ao_D;
   float ao_pdf;
   sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
@@ -379,6 +390,10 @@ ccl_device_forceinline void integrate_surface_ao_pass(
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
+
+  if (kernel_data.kernel_features & KERNEL_FEATURE_AO_ADDITIVE) {
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unshadowed_throughput) = ao_weight;
+  }
 }
 #endif /* defined(__AO__) */
 
@@ -487,10 +502,9 @@ ccl_device bool integrate_surface(KernelGlobals kg,
 
 #if defined(__AO__)
     /* Ambient occlusion pass. */
-    if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
-        (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_CAMERA)) {
+    if (kernel_data.kernel_features & KERNEL_FEATURE_AO) {
       PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
-      integrate_surface_ao_pass(kg, state, &sd, &rng_state, render_buffer);
+      integrate_surface_ao(kg, state, &sd, &rng_state, render_buffer);
     }
 #endif
 
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index d0aabb550c0..f455152dcf9 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -16,15 +16,18 @@
 
 #pragma once
 
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_passes.h"
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shader.h"
-
-#include "kernel/integrator/integrator_intersect_closest.h"
-#include "kernel/integrator/integrator_volume_stack.h"
+#include "kernel/film/accumulate.h"
+#include "kernel/film/passes.h"
+
+#include "kernel/integrator/intersect_closest.h"
+#include "kernel/integrator/path_state.h"
+#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/volume_stack.h"
+
+#include "kernel/light/light.h"
+#include "kernel/light/sample.h"
+
+#include "kernel/sample/mis.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/shader_eval.h b/intern/cycles/kernel/integrator/shader_eval.h
new file mode 100644
index 00000000000..68f1ef8c118
--- /dev/null
+++ b/intern/cycles/kernel/integrator/shader_eval.h
@@ -0,0 +1,869 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Functions to evaluate shaders and use the resulting shader closures. */
+
+#pragma once
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/emissive.h"
+
+#include "kernel/film/accumulate.h"
+
+#include "kernel/svm/svm.h"
+
+#ifdef __OSL__
+#  include "kernel/osl/shader.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Merging */
+
+#if defined(__VOLUME__)
+ccl_device_inline void shader_merge_volume_closures(ccl_private ShaderData *sd)
+{
+  /* Merge closures with identical type and g to save closure space with stacked volumes. */
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private ShaderClosure *sci = &sd->closure[i];
+
+    if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+      continue; /* Only Henyey-Greenstein closures are merge targets. */
+    }
+
+    for (int j = i + 1; j < sd->num_closure; j++) {
+      ccl_private ShaderClosure *scj = &sd->closure[j];
+      if (sci->type != scj->type) {
+        continue;
+      }
+
+      ccl_private const HenyeyGreensteinVolume *hgi = (ccl_private const HenyeyGreensteinVolume *)
+          sci;
+      ccl_private const HenyeyGreensteinVolume *hgj = (ccl_private const HenyeyGreensteinVolume *)
+          scj;
+      if (!(hgi->g == hgj->g)) {
+        continue; /* Different g: keep both closures. */
+      }
+
+      sci->weight += scj->weight; /* Fold closure j into closure i. */
+      sci->sample_weight += scj->sample_weight;
+
+      int size = sd->num_closure - (j + 1); /* Number of closures stored after slot j. */
+      if (size > 0) {
+        for (int k = 0; k < size; k++) {
+          scj[k] = scj[k + 1]; /* Shift the following closures down one slot. */
+        }
+      }
+
+      sd->num_closure--;
+      kernel_assert(sd->num_closure >= 0);
+      j--; /* Slot j now holds the next closure, so test it again. */
+    }
+  }
+}
+
+ccl_device_inline void shader_copy_volume_phases(ccl_private ShaderVolumePhases *ccl_restrict
+                                                     phases,
+                                                 ccl_private const ShaderData *ccl_restrict sd)
+{ /* Copy the Henyey-Greenstein closures of sd (weight, sample_weight, g) into phases. */
+  phases->num_closure = 0;
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *from_sc = &sd->closure[i];
+    ccl_private const HenyeyGreensteinVolume *from_hg =
+        (ccl_private const HenyeyGreensteinVolume *)from_sc; /* Only read if the type matches. */
+
+    if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+      ccl_private ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
+
+      to_sc->weight = from_sc->weight;
+      to_sc->sample_weight = from_sc->sample_weight;
+      to_sc->g = from_hg->g;
+      phases->num_closure++;
+      if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
+        break; /* Stop copying once MAX_VOLUME_CLOSURE entries are stored. */
+      }
+    }
+  }
+}
+#endif /* __VOLUME__ */
+
+ccl_device_inline void shader_prepare_surface_closures(KernelGlobals kg,
+                                                       ConstIntegratorState state,
+                                                       ccl_private ShaderData *sd)
+{
+  /* Defensive sampling.
+   *
+   * We can likely also do defensive sampling at deeper bounces, particularly
+   * for cases like a perfect mirror but possibly also others. This will need
+   * a good heuristic. */
+  if (INTEGRATOR_STATE(state, path, bounce) + INTEGRATOR_STATE(state, path, transparent_bounce) ==
+          0 &&
+      sd->num_closure > 1) { /* Only with zero bounces so far and more than one closure. */
+    float sum = 0.0f; /* Total sample weight of all BSDF and BSSRDF closures. */
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sc->sample_weight = max(sc->sample_weight, 0.125f * sum); /* Floor at 1/8 of the total. */
+      }
+    }
+  }
+
+  /* Filter glossy.
+   *
+   * Blurring of BSDF after bounces, for rays that have a small likelihood
+   * of following this particular path (diffuse, rough glossy). */
+  if (kernel_data.integrator.filter_glossy != FLT_MAX) { /* FLT_MAX skips filtering entirely. */
+    float blur_pdf = kernel_data.integrator.filter_glossy *
+                     INTEGRATOR_STATE(state, path, min_ray_pdf);
+
+    if (blur_pdf < 1.0f) {
+      float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
+
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+        if (CLOSURE_IS_BSDF(sc->type)) {
+          bsdf_blur(kg, sc, blur_roughness); /* Applied to BSDF closures only, not BSSRDF. */
+        }
+      }
+    }
+  }
+}
+
+/* BSDF */
+
+ccl_device_inline bool shader_bsdf_is_transmission(ccl_private const ShaderData *sd,
+                                                   const float3 omega_in)
+{
+  return dot(sd->N, omega_in) < 0.0f; /* True when omega_in faces away from sd->N. */
+}
+
+ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags)
+{ /* True if `type` is in a closure category that light_shader_flags excludes. */
+  if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
+    return false; /* Early out: no SHADER_EXCLUDE_ANY bit is set. */
+  }
+  if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
+    if (CLOSURE_IS_BSDF_DIFFUSE(type)) {
+      return true;
+    }
+  }
+  if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
+    if (CLOSURE_IS_BSDF_GLOSSY(type)) {
+      return true;
+    }
+  }
+  if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
+    if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+ccl_device_inline float _shader_bsdf_multi_eval(KernelGlobals kg,
+                                                ccl_private ShaderData *sd,
+                                                const float3 omega_in,
+                                                const bool is_transmission,
+                                                ccl_private const ShaderClosure *skip_sc,
+                                                ccl_private BsdfEval *result_eval,
+                                                float sum_pdf,
+                                                float sum_sample_weight,
+                                                const uint light_shader_flags)
+{
+  /* This is the Veach one-sample model with balance heuristic,
+   * some PDF factors drop out when using balance heuristic weighting. */
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (sc == skip_sc) {
+      continue; /* skip_sc is left out of this loop; pass NULL to skip nothing. */
+    }
+
+    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+      if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) {
+        float bsdf_pdf = 0.0f;
+        float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
+
+        if (bsdf_pdf != 0.0f) { /* Closures with zero pdf add nothing to the result. */
+          const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
+          bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
+          sum_pdf += bsdf_pdf * sc->sample_weight;
+        }
+      }
+
+      sum_sample_weight += sc->sample_weight; /* Counted for BSSRDF and excluded BSDFs too. */
+    }
+  }
+
+  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; /* Weighted mean pdf. */
+}
+
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_inline
+#endif
+    float
+    shader_bsdf_eval(KernelGlobals kg,
+                     ccl_private ShaderData *sd,
+                     const float3 omega_in,
+                     const bool is_transmission,
+                     ccl_private BsdfEval *bsdf_eval,
+                     const uint light_shader_flags)
+{ /* Evaluate every closure of sd for omega_in into bsdf_eval; returns the combined pdf. */
+  bsdf_eval_init(bsdf_eval, false, zero_float3()); /* Reset the output before accumulating. */
+
+  return _shader_bsdf_multi_eval(
+      kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
+}
+
+/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
+ccl_device_inline ccl_private const ShaderClosure *shader_bsdf_bssrdf_pick(
+    ccl_private const ShaderData *ccl_restrict sd, ccl_private float *randu)
+{
+  int sampled = 0; /* Closure 0 is returned when nothing is picked below. */
+
+  if (sd->num_closure > 1) {
+    /* Pick a BSDF or BSSRDF based on sample weights. */
+    float sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+
+    float r = (*randu) * sum; /* Random position along the running sum of sample weights. */
+    float partial_sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        float next_sum = partial_sum + sc->sample_weight;
+
+        if (r < next_sum) { /* r falls in [partial_sum, next_sum): pick this closure. */
+          sampled = i;
+
+          /* Rescale to reuse for direction sample, to better preserve stratification. */
+          *randu = (r - partial_sum) / sc->sample_weight;
+          break;
+        }
+
+        partial_sum = next_sum;
+      }
+    }
+  }
+
+  return &sd->closure[sampled];
+}
+
+/* Return weight for picked BSSRDF, scaled by total over picked sample weight. */
+ccl_device_inline float3
+shader_bssrdf_sample_weight(ccl_private const ShaderData *ccl_restrict sd,
+                            ccl_private const ShaderClosure *ccl_restrict bssrdf_sc)
+{
+  float3 weight = bssrdf_sc->weight;
+
+  if (sd->num_closure > 1) { /* With a single closure the weight is returned unscaled. */
+    float sum = 0.0f;
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+    weight *= sum / bssrdf_sc->sample_weight;
+  }
+
+  return weight;
+}
+
+/* Sample direction for picked BSDF, and return evaluation and pdf for all
+ * BSDFs combined using MIS. */
+ccl_device int shader_bsdf_sample_closure(KernelGlobals kg,
+                                          ccl_private ShaderData *sd,
+                                          ccl_private const ShaderClosure *sc,
+                                          float randu,
+                                          float randv,
+                                          ccl_private BsdfEval *bsdf_eval,
+                                          ccl_private float3 *omega_in,
+                                          ccl_private differential3 *domega_in,
+                                          ccl_private float *pdf)
+{
+  /* BSSRDF should already have been handled elsewhere. */
+  kernel_assert(CLOSURE_IS_BSDF(sc->type));
+
+  int label;
+  float3 eval = zero_float3();
+
+  *pdf = 0.0f;
+  label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+
+  if (*pdf != 0.0f) { /* bsdf_eval is left untouched when the sample has zero pdf. */
+    const bool is_diffuse = CLOSURE_IS_BSDF_DIFFUSE(sc->type);
+    bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
+
+    if (sd->num_closure > 1) { /* Add the other closures; sc is skipped and seeds the sums. */
+      const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
+      float sweight = sc->sample_weight;
+      *pdf = _shader_bsdf_multi_eval(
+          kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
+    }
+  }
+
+  return label;
+}
+
+ccl_device float shader_bsdf_average_roughness(ccl_private const ShaderData *sd)
+{ /* Weighted mean roughness over the BSDF closures of sd; 0 when the weights sum to zero. */
+  float roughness = 0.0f;
+  float sum_weight = 0.0f;
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF(sc->type)) {
+      /* sqrt once to undo the squaring from multiplying roughness on the
+       * two axes, and once for the squared roughness convention. */
+      float weight = fabsf(average(sc->weight)); /* Each closure is weighted by |mean weight|. */
+      roughness += weight * sqrtf(safe_sqrtf(bsdf_get_roughness_squared(sc)));
+      sum_weight += weight;
+    }
+  }
+
+  return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
+}
+
+ccl_device float3 shader_bsdf_transparency(KernelGlobals kg, ccl_private const ShaderData *sd)
+{ /* Transparency of sd, selected from sd->flag; kg is not used here. */
+  if (sd->flag & SD_HAS_ONLY_VOLUME) {
+    return one_float3(); /* SD_HAS_ONLY_VOLUME takes precedence over SD_TRANSPARENT. */
+  }
+  else if (sd->flag & SD_TRANSPARENT) {
+    return sd->closure_transparent_extinction;
+  }
+  else {
+    return zero_float3();
+  }
+}
+
+ccl_device void shader_bsdf_disable_transparency(KernelGlobals kg, ccl_private ShaderData *sd)
+{ /* Zero out every transparent BSDF closure and clear SD_TRANSPARENT; kg is not used here. */
+  if (sd->flag & SD_TRANSPARENT) {
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+
+      if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+        sc->sample_weight = 0.0f; /* The closure stays in the array with zero weights. */
+        sc->weight = zero_float3();
+      }
+    }
+
+    sd->flag &= ~SD_TRANSPARENT;
+  }
+}
+
+ccl_device float3 shader_bsdf_alpha(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd); /* Alpha = 1 - transparency. */
+
+  alpha = max(alpha, zero_float3()); /* Clamp between zero_float3() and one_float3(). */
+  alpha = min(alpha, one_float3());
+
+  return alpha;
+}
+
+ccl_device float3 shader_bsdf_diffuse(KernelGlobals kg, ccl_private const ShaderData *sd)
+{ /* Sum of the weights of all diffuse BSDF and BSSRDF closures; kg is not used here. */
+  float3 eval = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type))
+      eval += sc->weight; /* BSSRDF closures are counted together with diffuse here. */
+  }
+
+  return eval;
+}
+
+ccl_device float3 shader_bsdf_glossy(KernelGlobals kg, ccl_private const ShaderData *sd)
+{ /* Sum of the weights of all glossy BSDF closures; kg is not used here. */
+  float3 eval = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
+      eval += sc->weight;
+  }
+
+  return eval;
+}
+
+ccl_device float3 shader_bsdf_transmission(KernelGlobals kg, ccl_private const ShaderData *sd)
+{ /* Sum of the weights of all transmission BSDF closures; kg is not used here. */
+  float3 eval = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
+      eval += sc->weight;
+  }
+
+  return eval;
+}
+
+ccl_device float3 shader_bsdf_average_normal(KernelGlobals kg, ccl_private const ShaderData *sd)
+{ /* Normalized weighted sum of BSDF/BSSRDF closure normals; kg is not used here. */
+  float3 N = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+      N += sc->N * fabsf(average(sc->weight)); /* Each normal is weighted by |mean weight|. */
+  }
+
+  return (is_zero(N)) ? sd->N : normalize(N); /* Fall back to sd->N when the sum is zero. */
+}
+
+ccl_device float3 shader_bsdf_ao(KernelGlobals kg,
+                                 ccl_private const ShaderData *sd,
+                                 const float ao_factor,
+                                 ccl_private float3 *N_)
+{ /* Returns the diffuse closure weights scaled by ao_factor; writes their mean normal to N_. */
+  float3 eval = zero_float3();
+  float3 N = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { /* Only diffuse BSDF closures contribute. */
+      ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
+      eval += sc->weight * ao_factor;
+      N += bsdf->N * fabsf(average(sc->weight));
+    }
+  }
+
+  *N_ = (is_zero(N)) ? sd->N : normalize(N); /* Fall back to sd->N when the sum is zero. */
+  return eval;
+}
+
+#ifdef __SUBSURFACE__
+ccl_device float3 shader_bssrdf_normal(ccl_private const ShaderData *sd)
+{ /* Normalized weighted sum of the normals of all BSSRDF closures in sd. */
+  float3 N = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSSRDF(sc->type)) {
+      ccl_private const Bssrdf *bssrdf = (ccl_private const Bssrdf *)sc;
+      float avg_weight = fabsf(average(sc->weight));
+
+      N += bssrdf->N * avg_weight;
+    }
+  }
+
+  return (is_zero(N)) ? sd->N : normalize(N); /* Fall back to sd->N when the sum is zero. */
+}
+#endif /* __SUBSURFACE__ */
+
+/* Constant emission optimization */
+
+ccl_device bool shader_constant_emission_eval(KernelGlobals kg,
+                                              int shader,
+                                              ccl_private float3 *eval)
+{ /* Writes the stored constant emission to *eval and returns true if the shader has one. */
+  int shader_index = shader & SHADER_MASK; /* Mask the shader value down to its index. */
+  int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags;
+
+  if (shader_flag & SD_HAS_CONSTANT_EMISSION) {
+    *eval = make_float3(kernel_tex_fetch(__shaders, shader_index).constant_emission[0],
+                        kernel_tex_fetch(__shaders, shader_index).constant_emission[1],
+                        kernel_tex_fetch(__shaders, shader_index).constant_emission[2]);
+
+    return true;
+  }
+
+  return false; /* *eval is left unmodified in this case. */
+}
+
+/* Background */
+
+ccl_device float3 shader_background_eval(ccl_private const ShaderData *sd)
+{ /* Returns sd->closure_emission_background, or zero when SD_EMISSION is not set. */
+  if (sd->flag & SD_EMISSION) {
+    return sd->closure_emission_background;
+  }
+  else {
+    return zero_float3();
+  }
+}
+
+/* Emission */
+
+ccl_device float3 shader_emissive_eval(ccl_private const ShaderData *sd)
+{ /* Emission of sd: emissive_simple_eval(Ng, I) times the stored emission, else zero. */
+  if (sd->flag & SD_EMISSION) {
+    return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
+  }
+  else {
+    return zero_float3();
+  }
+}
+
+/* Holdout */
+
+ccl_device float3 shader_holdout_apply(KernelGlobals kg, ccl_private ShaderData *sd)
+{
+  float3 weight = zero_float3(); /* Holdout weight returned to the caller. */
+
+  /* For objects marked as holdout, preserve transparency and remove all other
+   * closures, replacing them with a holdout weight. */
+  if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+    if ((sd->flag & SD_TRANSPARENT) && !(sd->flag & SD_HAS_ONLY_VOLUME)) {
+      weight = one_float3() - sd->closure_transparent_extinction;
+
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+        if (!CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+          sc->type = NBUILTIN_CLOSURES; /* NOTE(review): presumably marks it unused - confirm. */
+        }
+      }
+
+      sd->flag &= ~(SD_CLOSURE_FLAGS - (SD_TRANSPARENT | SD_BSDF)); /* Keeps these two flags. */
+    }
+    else {
+      weight = one_float3(); /* Closures are not modified on this path. */
+    }
+  }
+  else {
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_HOLDOUT(sc->type)) {
+        weight += sc->weight; /* Object not marked holdout: sum explicit holdout closures. */
+      }
+    }
+  }
+
+  return weight;
+}
+
+/* Surface Evaluation */
+
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
+ccl_device void shader_eval_surface(KernelGlobals kg,
+                                    ConstIntegratorGenericState state,
+                                    ccl_private ShaderData *ccl_restrict sd,
+                                    ccl_global float *ccl_restrict buffer,
+                                    uint32_t path_flag)
+{
+  /* If path is being terminated, we are tracing a shadow ray or evaluating
+   * emission, then we don't need to store closures. The emission and shadow
+   * shader data also do not have a closure array to save GPU memory. */
+  int max_closures;
+  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+    max_closures = 0;
+  }
+  else {
+    max_closures = kernel_data.max_closures;
+  }
+
+  sd->num_closure = 0; /* Start from an empty closure list. */
+  sd->num_closure_left = max_closures;
+
+#ifdef __OSL__
+  if (kg->osl) { /* OSL is used when kg->osl is set; otherwise the block below runs. */
+    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
+      OSLShader::eval_background(kg, state, sd, path_flag);
+    }
+    else {
+      OSLShader::eval_surface(kg, state, sd, path_flag);
+    }
+  }
+  else
+#endif
+  {
+#ifdef __SVM__
+    svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(kg, state, sd, buffer, path_flag);
+#else
+    if (sd->object == OBJECT_NONE) { /* No SVM: constant 0.8 grey emission without an object. */
+      sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
+      sd->flag |= SD_EMISSION;
+    }
+    else { /* No SVM: constant 0.8 grey diffuse BSDF on objects. */
+      ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
+          sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f));
+      if (bsdf != NULL) {
+        bsdf->N = sd->N;
+        sd->flag |= bsdf_diffuse_setup(bsdf);
+      }
+    }
+#endif
+  }
+}
+
+/* Volume */
+
+#ifdef __VOLUME__
+
+ccl_device_inline float _shader_volume_phase_multi_eval(
+    ccl_private const ShaderData *sd,
+    ccl_private const ShaderVolumePhases *phases,
+    const float3 omega_in,
+    int skip_phase,
+    ccl_private BsdfEval *result_eval,
+    float sum_pdf,
+    float sum_sample_weight)
+{ /* Accumulate all phase closures except skip_phase into result_eval; returns combined pdf. */
+  for (int i = 0; i < phases->num_closure; i++) {
+    if (i == skip_phase)
+      continue; /* A negative skip_phase never matches, so nothing is skipped. */
+
+    ccl_private const ShaderVolumeClosure *svc = &phases->closure[i];
+    float phase_pdf = 0.0f;
+    float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
+
+    if (phase_pdf != 0.0f) {
+      bsdf_eval_accum(result_eval, false, eval, 1.0f);
+      sum_pdf += phase_pdf * svc->sample_weight;
+    }
+
+    sum_sample_weight += svc->sample_weight; /* Counted even when phase_pdf is zero. */
+  }
+
+  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f; /* Weighted mean pdf. */
+}
+
+ccl_device float shader_volume_phase_eval(KernelGlobals kg,
+                                          ccl_private const ShaderData *sd,
+                                          ccl_private const ShaderVolumePhases *phases,
+                                          const float3 omega_in,
+                                          ccl_private BsdfEval *phase_eval)
+{ /* Evaluate every phase closure for omega_in into phase_eval; returns the combined pdf. */
+  bsdf_eval_init(phase_eval, false, zero_float3()); /* Reset the output before accumulating. */
+
+  return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
+}
+
+ccl_device int shader_volume_phase_sample(KernelGlobals kg,
+                                          ccl_private const ShaderData *sd,
+                                          ccl_private const ShaderVolumePhases *phases,
+                                          float randu,
+                                          float randv,
+                                          ccl_private BsdfEval *phase_eval,
+                                          ccl_private float3 *omega_in,
+                                          ccl_private differential3 *domega_in,
+                                          ccl_private float *pdf)
+{
+  int sampled = 0; /* Closure 0 is used when there is at most one phase closure. */
+
+  if (phases->num_closure > 1) {
+    /* Pick a phase closure based on sample weights. */
+    float sum = 0.0f;
+
+    for (sampled = 0; sampled < phases->num_closure; sampled++) {
+      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
+      sum += svc->sample_weight;
+    }
+
+    float r = randu * sum; /* Random position along the running sum of sample weights. */
+    float partial_sum = 0.0f;
+
+    for (sampled = 0; sampled < phases->num_closure; sampled++) {
+      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
+      float next_sum = partial_sum + svc->sample_weight;
+
+      if (r <= next_sum) {
+        /* Rescale to reuse for BSDF direction sample. */
+        randu = (r - partial_sum) / svc->sample_weight; /* NOTE(review): 0/0 if both are 0. */
+        break;
+      }
+
+      partial_sum = next_sum;
+    }
+
+    if (sampled == phases->num_closure) { /* The loop ended without picking a closure. */
+      *pdf = 0.0f;
+      return LABEL_NONE;
+    }
+  }
+
+  /* TODO: this isn't quite correct, we don't weight anisotropy properly
+   * depending on color channels, even if this is perhaps not a common case. */
+  ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
+  int label;
+  float3 eval = zero_float3();
+
+  *pdf = 0.0f;
+  label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
+
+  if (*pdf != 0.0f) { /* phase_eval is left untouched when the sample has zero pdf. */
+    bsdf_eval_init(phase_eval, false, eval);
+  }
+
+  return label;
+}
+
+ccl_device int shader_phase_sample_closure(KernelGlobals kg,
+                                           ccl_private const ShaderData *sd,
+                                           ccl_private const ShaderVolumeClosure *sc,
+                                           float randu,
+                                           float randv,
+                                           ccl_private BsdfEval *phase_eval,
+                                           ccl_private float3 *omega_in,
+                                           ccl_private differential3 *domega_in,
+                                           ccl_private float *pdf)
+{ /* Sample the given phase closure sc; returns the label from volume_phase_sample(). */
+  int label;
+  float3 eval = zero_float3();
+
+  *pdf = 0.0f;
+  label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+
+  if (*pdf != 0.0f) /* phase_eval is left untouched when the sample has zero pdf. */
+    bsdf_eval_init(phase_eval, false, eval);
+
+  return label;
+}
+
+/* Volume Evaluation */
+
+template<const bool shadow, typename StackReadOp, typename ConstIntegratorGenericState>
+ccl_device_inline void shader_eval_volume(KernelGlobals kg,
+                                          ConstIntegratorGenericState state,
+                                          ccl_private ShaderData *ccl_restrict sd,
+                                          const uint32_t path_flag,
+                                          StackReadOp stack_read)
+{
+  /* If path is being terminated, we are tracing a shadow ray or evaluating
+   * emission, then we don't need to store closures. The emission and shadow
+   * shader data also do not have a closure array to save GPU memory. */
+  int max_closures;
+  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+    max_closures = 0;
+  }
+  else {
+    max_closures = kernel_data.max_closures;
+  }
+
+  /* Reset closures once at the start, we will be accumulating the closures
+   * for all volumes in the stack into a single array of closures. */
+  sd->num_closure = 0;
+  sd->num_closure_left = max_closures;
+  sd->flag = 0;
+  sd->object_flag = 0;
+
+  for (int i = 0;; i++) { /* stack_read(i) is called until an entry has SHADER_NONE. */
+    const VolumeStack entry = stack_read(i);
+    if (entry.shader == SHADER_NONE) {
+      break;
+    }
+
+    /* Setup shader-data from stack. It's mostly setup already in
+     * shader_setup_from_volume, this switching should be quick. */
+    sd->object = entry.object;
+    sd->lamp = LAMP_NONE;
+    sd->shader = entry.shader;
+
+    sd->flag &= ~SD_SHADER_FLAGS; /* Replace the previous entry's shader flags. */
+    sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+    sd->object_flag &= ~SD_OBJECT_FLAGS;
+
+    if (sd->object != OBJECT_NONE) {
+      sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
+
+#  ifdef __OBJECT_MOTION__
+      /* TODO: this is inefficient for motion blur, we should be
+       * caching matrices instead of recomputing them each step. */
+      shader_setup_object_transforms(kg, sd, sd->time);
+#  endif
+    }
+
+    /* Evaluate shader. */
+#  ifdef __SVM__
+#    ifdef __OSL__
+    if (kg->osl) {
+      OSLShader::eval_volume(kg, state, sd, path_flag);
+    }
+    else
+#    endif
+    {
+      svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
+          kg, state, sd, NULL, path_flag);
+    }
+#  endif
+
+    /* Merge closures to avoid exceeding number of closures limit. */
+    if (!shadow) { /* The `shadow` template argument disables merging. */
+      if (i > 0) { /* Merging is skipped for the first stack entry. */
+        shader_merge_volume_closures(sd);
+      }
+    }
+  }
+}
+
+#endif /* __VOLUME__ */
+
+/* Displacement Evaluation */
+
+template<typename ConstIntegratorGenericState>
+ccl_device void shader_eval_displacement(KernelGlobals kg,
+                                         ConstIntegratorGenericState state,
+                                         ccl_private ShaderData *sd)
+{
+  sd->num_closure = 0; /* Closure count and remaining closure budget are both set to zero. */
+  sd->num_closure_left = 0;
+
+  /* This will modify sd->P. */
+#ifdef __SVM__
+#  ifdef __OSL__
+  if (kg->osl)
+    OSLShader::eval_displacement(kg, state, sd);
+  else
+#  endif
+  {
+    svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
+        kg, state, sd, NULL, 0); /* NULL and 0 are passed where the surface path passes more. */
+  }
+#endif
+}
+
+/* Cryptomatte */
+
+ccl_device float shader_cryptomatte_id(KernelGlobals kg, int shader)
+{ /* Look up the cryptomatte_id stored in the __shaders entry for this shader. */
+  return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shadow_catcher.h b/intern/cycles/kernel/integrator/shadow_catcher.h
new file mode 100644
index 00000000000..7beae235dbc
--- /dev/null
+++ b/intern/cycles/kernel/integrator/shadow_catcher.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/path_state.h"
+#include "kernel/integrator/state_util.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Check whether current surface bounce is where path is to be split for the shadow catcher. */
+ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals kg,
+                                                                  IntegratorState state,
+                                                                  const int object_flag)
+{
+#ifdef __SHADOW_CATCHER__
+  if (!kernel_data.integrator.has_shadow_catcher) {
+    return false;
+  }
+
+  /* Check the flag first, avoiding fetches from global memory. */
+  if ((object_flag & SD_OBJECT_SHADOW_CATCHER) == 0) {
+    return false;
+  }
+  if (object_flag & SD_OBJECT_HOLDOUT_MASK) {
+    return false; /* No split on objects that carry a holdout flag. */
+  }
+
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+
+  if ((path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) == 0) {
+    /* Split only on primary rays, secondary bounces are to treat shadow catcher as a regular
+     * object. */
+    return false;
+  }
+
+  if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+    return false; /* No split on a path flagged PATH_RAY_SHADOW_CATCHER_PASS. */
+  }
+
+  return true;
+#else
+  (void)object_flag; /* Without __SHADOW_CATCHER__ this always returns false. */
+  return false;
+#endif
+}
+
+/* Check whether the current path can still split. */
+ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
+                                                            ConstIntegratorState state)
+{
+  if (INTEGRATOR_PATH_IS_TERMINATED) {
+    return false; /* A terminated path can no longer split. */
+  }
+
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+
+  if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
+    /* Shadow catcher was already hit and the state was split. No further split is allowed. */
+    return false;
+  }
+
+  return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0; /* Flag must still be set. */
+}
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. Returns true when the state was split. */
+ccl_device_inline bool kernel_shadow_catcher_split(KernelGlobals kg,
+                                                   IntegratorState state,
+                                                   const int object_flags)
+{
+#ifdef __SHADOW_CATCHER__
+
+  if (!kernel_shadow_catcher_is_path_split_bounce(kg, state, object_flags)) {
+    return false; /* The state is left unmodified when this bounce is not a split bounce. */
+  }
+
+  /* The split is to be done. Mark the current state as such, so that it stops contributing to the
+   * shadow catcher matte pass, but keeps contributing to the combined pass. */
+  INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+  /* Split new state from the current one. This new state will only track contribution of shadow
+   * catcher objects ignoring non-catcher objects. */
+  integrator_state_shadow_catcher_split(kg, state);
+
+  return true;
+#else
+  (void)object_flags; /* Without __SHADOW_CATCHER__ this always returns false. */
+  return false;
+#endif
+}
+
+#ifdef __SHADOW_CATCHER__
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(const uint32_t path_flag)
+{ /* True while PATH_RAY_SHADOW_CATCHER_HIT is not set in path_flag. */
+  return (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) == 0;
+}
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(const uint32_t path_flag)
+{ /* True when PATH_RAY_SHADOW_CATCHER_PASS is set in path_flag. */
+  return path_flag & PATH_RAY_SHADOW_CATCHER_PASS;
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h
index bc35b644ee1..1fbadde2642 100644
--- a/intern/cycles/kernel/integrator/integrator_shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -42,7 +42,10 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
 KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput for shadow pass. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
+KERNEL_STRUCT_MEMBER(shadow_path,
+                     float3,
+                     unshadowed_throughput,
+                     KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
 /* Ratio of throughput to distinguish diffuse and glossy render passes. */
 KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
 /* Number of intersections found by ray-tracing. */
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/state.h
index 09b399ff1b8..86dac0a65cf 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -40,9 +40,9 @@
  * INTEGRATOR_STATE_NULL: use to pass empty state to other functions.
  */
 
-#include "kernel/kernel_types.h"
+#include "kernel/types.h"
 
-#include "util/util_types.h"
+#include "util/types.h"
 
 #pragma once
 
@@ -64,7 +64,7 @@ typedef struct IntegratorShadowStateCPU {
   } \
   name[cpu_size];
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
-#include "kernel/integrator/integrator_shadow_state_template.h"
+#include "kernel/integrator/shadow_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
@@ -83,7 +83,7 @@ typedef struct IntegratorStateCPU {
   } \
   name[cpu_size];
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
-#include "kernel/integrator/integrator_state_template.h"
+#include "kernel/integrator/state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
@@ -118,9 +118,9 @@ typedef struct IntegratorStateGPU {
   name[gpu_size];
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
 
-#include "kernel/integrator/integrator_state_template.h"
+#include "kernel/integrator/state_template.h"
 
-#include "kernel/integrator/integrator_shadow_state_template.h"
+#include "kernel/integrator/shadow_state_template.h"
 
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index 1569bf68e24..38a2b396847 100644
--- a/intern/cycles/kernel/integrator/integrator_state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "kernel/kernel_types.h"
-#include "util/util_atomic.h"
+#include "kernel/types.h"
+#include "util/atomic.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/state_template.h
index b1a6fd36fae..b1a6fd36fae 100644
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/state_util.h
index 6e6b7f8a40f..dafe06e7009 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-#include "kernel/integrator/integrator_state.h"
-#include "kernel/kernel_differential.h"
+#include "kernel/integrator/state.h"
+
+#include "kernel/util/differential.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -246,7 +247,7 @@ ccl_device_inline void integrator_state_copy_only(KernelGlobals kg,
 
 #  define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
 
-#  include "kernel/integrator/integrator_state_template.h"
+#  include "kernel/integrator/state_template.h"
 
 #  undef KERNEL_STRUCT_BEGIN
 #  undef KERNEL_STRUCT_MEMBER
@@ -302,7 +303,7 @@ ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg,
 
 #  define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
 
-#  include "kernel/integrator/integrator_shadow_state_template.h"
+#  include "kernel/integrator/shadow_state_template.h"
 
 #  undef KERNEL_STRUCT_BEGIN
 #  undef KERNEL_STRUCT_MEMBER
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/subsurface.h
index e3bf9db80f7..49466112387 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface.h
+++ b/intern/cycles/kernel/integrator/subsurface.h
@@ -16,9 +16,7 @@
 
 #pragma once
 
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_shader.h"
+#include "kernel/camera/projection.h"
 
 #include "kernel/bvh/bvh.h"
 
@@ -28,9 +26,11 @@
 #include "kernel/closure/bssrdf.h"
 #include "kernel/closure/volume.h"
 
-#include "kernel/integrator/integrator_intersect_volume_stack.h"
-#include "kernel/integrator/integrator_subsurface_disk.h"
-#include "kernel/integrator/integrator_subsurface_random_walk.h"
+#include "kernel/integrator/intersect_volume_stack.h"
+#include "kernel/integrator/path_state.h"
+#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/subsurface_disk.h"
+#include "kernel/integrator/subsurface_random_walk.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface_disk.h b/intern/cycles/kernel/integrator/subsurface_disk.h
index e1cce13fb30..6146b8c41fc 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface_disk.h
+++ b/intern/cycles/kernel/integrator/subsurface_disk.h
@@ -119,9 +119,6 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   float sum_weights = 0.0f;
 
   for (int hit = 0; hit < num_eval_hits; hit++) {
-    /* Quickly retrieve P and Ng without setting up ShaderData. */
-    const float3 hit_P = ray.P + ray.D * ss_isect.hits[hit].t;
-
     /* Get geometric normal. */
     const int object = ss_isect.hits[hit].object;
     const int object_flag = kernel_tex_fetch(__object_flag, object);
@@ -131,11 +128,24 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
     }
 
     if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+      /* Transform normal to world space. */
       Transform itfm;
-      object_fetch_transform_motion_test(kg, object, time, &itfm);
+      Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
       hit_Ng = normalize(transform_direction_transposed(&itfm, hit_Ng));
+
+      /* Transform t to world space, except for OptiX where it already is. */
+#ifdef __KERNEL_OPTIX__
+      (void)tfm;
+#else
+      float3 D = transform_direction(&itfm, ray.D);
+      D = normalize(D) * ss_isect.hits[hit].t;
+      ss_isect.hits[hit].t = len(transform_direction(&tfm, D));
+#endif
     }
 
+    /* Quickly retrieve P and Ng without setting up ShaderData. */
+    const float3 hit_P = ray.P + ray.D * ss_isect.hits[hit].t;
+
     /* Probability densities for local frame axes. */
     const float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
     const float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface_random_walk.h b/intern/cycles/kernel/integrator/subsurface_random_walk.h
index 2ab6d0961e3..f0712758174 100644
--- a/intern/cycles/kernel/integrator/integrator_subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "kernel/kernel_projection.h"
+#include "kernel/camera/projection.h"
 
 #include "kernel/bvh/bvh.h"
 
diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/volume_stack.h
index cf69826ffff..cf69826ffff 100644
--- a/intern/cycles/kernel/integrator/integrator_volume_stack.h
+++ b/intern/cycles/kernel/integrator/volume_stack.h