27 files changed, 2755 insertions, 1492 deletions
diff --git a/intern/cycles/kernel/integrator/displacement_shader.h b/intern/cycles/kernel/integrator/displacement_shader.h
new file mode 100644
index 00000000000..839dfe244ac
--- /dev/null
+++ b/intern/cycles/kernel/integrator/displacement_shader.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+/* Functions to evaluate displacement shader. */
+
+#pragma once
+
+#ifdef __SVM__
+#  include "kernel/svm/svm.h"
+#endif
+#ifdef __OSL__
+#  include "kernel/osl/osl.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+template<typename ConstIntegratorGenericState>
+ccl_device void displacement_shader_eval(KernelGlobals kg,
+                                         ConstIntegratorGenericState state,
+                                         ccl_private ShaderData *sd)
+{
+  sd->num_closure = 0;
+  sd->num_closure_left = 0;
+
+  /* Evaluate via OSL when kg->osl is set, otherwise via SVM; this will modify sd->P. */
+#ifdef __OSL__
+  if (kg->osl) {
+    OSLShader::eval_displacement(kg, state, sd);
+  }
+  else
+#endif
+  {
+#ifdef __SVM__
+    svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
+        kg, state, sd, NULL, 0);
+#endif
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/guiding.h b/intern/cycles/kernel/integrator/guiding.h
new file mode 100644
index 00000000000..634bba2a9b4
--- /dev/null
+++ b/intern/cycles/kernel/integrator/guiding.h
@@ -0,0 +1,547 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/film/write.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Utilities: convert a Cycles float3 into the OpenPGL vector and point types. */
+
+#if defined(__PATH_GUIDING__)
+static pgl_vec3f guiding_vec3f(const float3 v)
+{
+  return openpgl::cpp::Vector3(v.x, v.y, v.z);
+}
+
+static pgl_point3f guiding_point3f(const float3 v)
+{
+  return openpgl::cpp::Point3(v.x, v.y, v.z);
+}
+#endif
+
+/* Path recording for guiding. */
+
+/* Record Surface Interactions */
+
+/* Records/Adds a new path segment with the current path vertex on a surface. Does nothing
+ * unless guiding training is enabled. If the path is not terminated this call is usually
+ * followed by a call of guiding_record_surface_bounce. */
+ccl_device_forceinline void guiding_record_surface_segment(KernelGlobals kg,
+                                                           IntegratorState state,
+                                                           ccl_private const ShaderData *sd)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+
+  const pgl_vec3f zero = guiding_vec3f(zero_float3());
+  const pgl_vec3f one = guiding_vec3f(one_float3());
+  /* Start a new segment at sd->P with zero contributions, unit transmittance and eta 1. */
+  state->guiding.path_segment = kg->opgl_path_segment_storage->NextSegment();
+  openpgl::cpp::SetPosition(state->guiding.path_segment, guiding_point3f(sd->P));
+  openpgl::cpp::SetDirectionOut(state->guiding.path_segment, guiding_vec3f(sd->I));
+  openpgl::cpp::SetVolumeScatter(state->guiding.path_segment, false);
+  openpgl::cpp::SetScatteredContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetDirectContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, one);
+  openpgl::cpp::SetEta(state->guiding.path_segment, 1.0f);
+#endif
+}
+
+/* Records the surface scattering event at the current vertex position of the segment. */
+ccl_device_forceinline void guiding_record_surface_bounce(KernelGlobals kg,
+                                                          IntegratorState state,
+                                                          ccl_private const ShaderData *sd,
+                                                          const Spectrum weight,
+                                                          const float pdf,
+                                                          const float3 N,
+                                                          const float3 omega_in,
+                                                          const float2 roughness,
+                                                          const float eta)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  const float min_roughness = safe_sqrtf(fminf(roughness.x, roughness.y));
+  const bool is_delta = (min_roughness == 0.0f); /* Zero roughness is flagged as delta. */
+  const float3 weight_rgb = spectrum_to_rgb(weight);
+  const float3 normal = clamp(N, -one_float3(), one_float3()); /* Components in [-1, 1]. */
+
+  kernel_assert(state->guiding.path_segment != nullptr);
+
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, guiding_vec3f(one_float3()));
+  openpgl::cpp::SetVolumeScatter(state->guiding.path_segment, false);
+  openpgl::cpp::SetNormal(state->guiding.path_segment, guiding_vec3f(normal));
+  openpgl::cpp::SetDirectionIn(state->guiding.path_segment, guiding_vec3f(omega_in));
+  openpgl::cpp::SetPDFDirectionIn(state->guiding.path_segment, pdf);
+  openpgl::cpp::SetScatteringWeight(state->guiding.path_segment, guiding_vec3f(weight_rgb));
+  openpgl::cpp::SetIsDelta(state->guiding.path_segment, is_delta);
+  openpgl::cpp::SetEta(state->guiding.path_segment, eta);
+  openpgl::cpp::SetRoughness(state->guiding.path_segment, min_roughness);
+#endif
+}
+
+/* Records the emission at the current surface intersection (physical or virtual). */
+ccl_device_forceinline void guiding_record_surface_emission(KernelGlobals kg,
+                                                            IntegratorState state,
+                                                            const Spectrum Le,
+                                                            const float mis_weight)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  const float3 Le_rgb = spectrum_to_rgb(Le);
+  /* NOTE(review): unlike guiding_record_volume_emission, path_segment is not null-checked. */
+  openpgl::cpp::SetDirectContribution(state->guiding.path_segment, guiding_vec3f(Le_rgb));
+  openpgl::cpp::SetMiWeight(state->guiding.path_segment, mis_weight);
+#endif
+}
+
+/* Record BSSRDF Interactions */
+
+/* Records/Adds a new path segment where the vertex position is the point of entry
+ * of the sub surface scattering boundary.
+ * If the path is not terminated this call is usually followed by a call of
+ * guiding_record_bssrdf_weight and guiding_record_bssrdf_bounce. */
+ccl_device_forceinline void guiding_record_bssrdf_segment(KernelGlobals kg,
+                                                          IntegratorState state,
+                                                          const float3 P,
+                                                          const float3 I)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  const pgl_vec3f zero = guiding_vec3f(zero_float3());
+  const pgl_vec3f one = guiding_vec3f(one_float3());
+  /* The entry vertex is flagged as a volume scatter event; contributions start at zero. */
+  state->guiding.path_segment = kg->opgl_path_segment_storage->NextSegment();
+  openpgl::cpp::SetPosition(state->guiding.path_segment, guiding_point3f(P));
+  openpgl::cpp::SetDirectionOut(state->guiding.path_segment, guiding_vec3f(I));
+  openpgl::cpp::SetVolumeScatter(state->guiding.path_segment, true);
+  openpgl::cpp::SetScatteredContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetDirectContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, one);
+  openpgl::cpp::SetEta(state->guiding.path_segment, 1.0f);
+#endif
+}
+
+/* Records the transmission of the path at the point of entry while passing
+ * the surface boundary. */
+ccl_device_forceinline void guiding_record_bssrdf_weight(KernelGlobals kg,
+                                                         IntegratorState state,
+                                                         const Spectrum weight,
+                                                         const Spectrum albedo)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+
+  /* Note: albedo is divided out here; it is multiplied in by guiding_record_bssrdf_bounce. */
+  const float3 weight_rgb = spectrum_to_rgb(safe_divide_color(weight, albedo));
+
+  kernel_assert(state->guiding.path_segment != nullptr);
+
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, guiding_vec3f(zero_float3()));
+  openpgl::cpp::SetScatteringWeight(state->guiding.path_segment, guiding_vec3f(weight_rgb));
+  openpgl::cpp::SetIsDelta(state->guiding.path_segment, false);
+  openpgl::cpp::SetEta(state->guiding.path_segment, 1.0f);
+  openpgl::cpp::SetRoughness(state->guiding.path_segment, 1.0f);
+#endif
+}
+
+/* Records the direction the path takes at the point of entry when sampling the SSS contribution.
+ * If not terminated this function is usually followed by a call of
+ * guiding_record_volume_transmission to record the transmittance between the point of entry and
+ * the point of exit. */
+ccl_device_forceinline void guiding_record_bssrdf_bounce(KernelGlobals kg,
+                                                         IntegratorState state,
+                                                         const float pdf,
+                                                         const float3 N,
+                                                         const float3 omega_in,
+                                                         const Spectrum weight,
+                                                         const Spectrum albedo)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  const float3 normal = clamp(N, -one_float3(), one_float3());
+  const float3 weight_rgb = spectrum_to_rgb(weight * albedo); /* Albedo is applied here. */
+
+  kernel_assert(state->guiding.path_segment != nullptr);
+
+  openpgl::cpp::SetVolumeScatter(state->guiding.path_segment, false);
+  openpgl::cpp::SetNormal(state->guiding.path_segment, guiding_vec3f(normal));
+  openpgl::cpp::SetDirectionIn(state->guiding.path_segment, guiding_vec3f(omega_in));
+  openpgl::cpp::SetPDFDirectionIn(state->guiding.path_segment, pdf);
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, guiding_vec3f(weight_rgb));
+#endif
+}
+
+/* Record Volume Interactions */
+
+/* Records/Adds a new path segment with the current path vertex being inside a volume.
+ * If the path is not terminated this call is usually followed by a call of
+ * guiding_record_volume_bounce. */
+ccl_device_forceinline void guiding_record_volume_segment(KernelGlobals kg,
+                                                          IntegratorState state,
+                                                          const float3 P,
+                                                          const float3 I)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  const pgl_vec3f zero = guiding_vec3f(zero_float3());
+  const pgl_vec3f one = guiding_vec3f(one_float3());
+  /* Start a new segment at P with zero contributions, unit transmittance and eta 1. */
+  state->guiding.path_segment = kg->opgl_path_segment_storage->NextSegment();
+
+  openpgl::cpp::SetPosition(state->guiding.path_segment, guiding_point3f(P));
+  openpgl::cpp::SetDirectionOut(state->guiding.path_segment, guiding_vec3f(I));
+  openpgl::cpp::SetVolumeScatter(state->guiding.path_segment, true);
+  openpgl::cpp::SetScatteredContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetDirectContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, one);
+  openpgl::cpp::SetEta(state->guiding.path_segment, 1.0f);
+#endif
+}
+
+/* Records the volume scattering event at the current vertex position of the segment. */
+ccl_device_forceinline void guiding_record_volume_bounce(KernelGlobals kg,
+                                                         IntegratorState state,
+                                                         ccl_private const ShaderData *sd,
+                                                         const Spectrum weight,
+                                                         const float pdf,
+                                                         const float3 omega_in,
+                                                         const float roughness)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  const float3 weight_rgb = spectrum_to_rgb(weight);
+  const float3 normal = make_float3(0.0f, 0.0f, 1.0f); /* A constant +Z normal is recorded. */
+
+  kernel_assert(state->guiding.path_segment != nullptr);
+
+  openpgl::cpp::SetVolumeScatter(state->guiding.path_segment, true);
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, guiding_vec3f(one_float3()));
+  openpgl::cpp::SetNormal(state->guiding.path_segment, guiding_vec3f(normal));
+  openpgl::cpp::SetDirectionIn(state->guiding.path_segment, guiding_vec3f(omega_in));
+  openpgl::cpp::SetPDFDirectionIn(state->guiding.path_segment, pdf);
+  openpgl::cpp::SetScatteringWeight(state->guiding.path_segment, guiding_vec3f(weight_rgb));
+  openpgl::cpp::SetIsDelta(state->guiding.path_segment, false);
+  openpgl::cpp::SetEta(state->guiding.path_segment, 1.f);
+  openpgl::cpp::SetRoughness(state->guiding.path_segment, roughness);
+#endif
+}
+
+/* Records the transmission (a.k.a. transmittance weight) between the current path segment
+ * and the next one, when the path is inside or passes a volume. */
+ccl_device_forceinline void guiding_record_volume_transmission(KernelGlobals kg,
+                                                               IntegratorState state,
+                                                               const float3 transmittance_weight)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+
+  if (state->guiding.path_segment) {
+    /* TODO (sherholz): need to find a better way to avoid this check.
+     * Skip weights with a negative or non-finite component; `isfinite` is false for NaN
+     * too, so no separate NaN test is needed. */
+    const bool is_valid = std::isfinite(transmittance_weight[0]) &&
+                          std::isfinite(transmittance_weight[1]) &&
+                          std::isfinite(transmittance_weight[2]) &&
+                          transmittance_weight[0] >= 0.f && transmittance_weight[1] >= 0.f &&
+                          transmittance_weight[2] >= 0.f;
+    if (is_valid) {
+      openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment,
+                                           guiding_vec3f(transmittance_weight));
+    }
+  }
+#endif
+}
+
+/* Records volume emission, with a MIS weight of 1, at the vertex of the current path segment. */
+ccl_device_forceinline void guiding_record_volume_emission(KernelGlobals kg,
+                                                           IntegratorState state,
+                                                           const Spectrum Le)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+
+  if (state->guiding.path_segment) {
+    const float3 Le_rgb = spectrum_to_rgb(Le);
+
+    openpgl::cpp::SetDirectContribution(state->guiding.path_segment, guiding_vec3f(Le_rgb));
+    openpgl::cpp::SetMiWeight(state->guiding.path_segment, 1.0f);
+  }
+#endif
+}
+
+/* Record Light Interactions */
+
+/* Adds a pseudo path vertex/segment when intersecting a virtual light source
+ * (e.g., area, sphere, or disk light). This call is often followed by
+ * a call of guiding_record_surface_emission, if the intersected light source
+ * emits light in the direction of the path. */
+ccl_device_forceinline void guiding_record_light_surface_segment(
+    KernelGlobals kg, IntegratorState state, ccl_private const Intersection *ccl_restrict isect)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  const pgl_vec3f zero = guiding_vec3f(zero_float3());
+  const pgl_vec3f one = guiding_vec3f(one_float3());
+  const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
+  const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
+  const float3 P = ray_P + isect->t * ray_D; /* Point along the ray at distance isect->t. */
+
+  state->guiding.path_segment = kg->opgl_path_segment_storage->NextSegment();
+  openpgl::cpp::SetPosition(state->guiding.path_segment, guiding_point3f(P));
+  openpgl::cpp::SetDirectionOut(state->guiding.path_segment, guiding_vec3f(-ray_D));
+  openpgl::cpp::SetNormal(state->guiding.path_segment, guiding_vec3f(-ray_D));
+  openpgl::cpp::SetDirectionIn(state->guiding.path_segment, guiding_vec3f(ray_D));
+  openpgl::cpp::SetPDFDirectionIn(state->guiding.path_segment, 1.0f);
+  openpgl::cpp::SetVolumeScatter(state->guiding.path_segment, false);
+  openpgl::cpp::SetScatteredContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetDirectContribution(state->guiding.path_segment, zero);
+  openpgl::cpp::SetTransmittanceWeight(state->guiding.path_segment, one);
+  openpgl::cpp::SetScatteringWeight(state->guiding.path_segment, one);
+  openpgl::cpp::SetEta(state->guiding.path_segment, 1.0f);
+#endif
+}
+
+/* Records/Adds a final path segment when the path leaves the scene and
+ * intersects with a background light (e.g., background color,
+ * distant light, or env map). The vertex for this segment is placed along
+ * the current ray far outside the scene. */
+ccl_device_forceinline void guiding_record_background(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const Spectrum L,
+                                                      const float mis_weight)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+
+  const float3 L_rgb = spectrum_to_rgb(L);
+  const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
+  const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
+  const float3 P = ray_P + (1e6f) * ray_D; /* Vertex placed far along the current ray. */
+  const float3 normal = make_float3(0.0f, 0.0f, 1.0f);
+  /* The segment is built locally and appended with AddSegment() instead of NextSegment(). */
+  openpgl::cpp::PathSegment background_segment;
+  openpgl::cpp::SetPosition(&background_segment, guiding_point3f(P));
+  openpgl::cpp::SetNormal(&background_segment, guiding_vec3f(normal));
+  openpgl::cpp::SetDirectionOut(&background_segment, guiding_vec3f(-ray_D));
+  openpgl::cpp::SetDirectContribution(&background_segment, guiding_vec3f(L_rgb));
+  openpgl::cpp::SetMiWeight(&background_segment, mis_weight);
+  kg->opgl_path_segment_storage->AddSegment(background_segment);
+#endif
+}
+
+/* Records the scattered contribution of a next event estimation
+ * (i.e., a direct light estimate scattered at the current path vertex
+ * towards the previous vertex). */
+ccl_device_forceinline void guiding_record_direct_light(KernelGlobals kg,
+                                                        IntegratorShadowState state)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  if (state->shadow_path.path_segment) {
+    const Spectrum Lo = safe_divide_color(INTEGRATOR_STATE(state, shadow_path, throughput),
+                                          INTEGRATOR_STATE(state, shadow_path, unlit_throughput));
+    /* Lo is the shadow path throughput divided by its unlit throughput. */
+    const float3 Lo_rgb = spectrum_to_rgb(Lo);
+    openpgl::cpp::AddScatteredContribution(state->shadow_path.path_segment, guiding_vec3f(Lo_rgb));
+  }
+#endif
+}
+
+/* Record Russian Roulette */
+/* Records the probability of continuing the path at the current path segment. */
+ccl_device_forceinline void guiding_record_continuation_probability(
+    KernelGlobals kg, IntegratorState state, const float continuation_probability)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  /* Nothing is recorded when there is no current path segment. */
+  if (state->guiding.path_segment) {
+    openpgl::cpp::SetRussianRouletteProbability(state->guiding.path_segment,
+                                                continuation_probability);
+  }
+#endif
+}
+
+/* Path guiding debug render passes. */
+
+/* Write a set of path guiding related debug information (e.g., guiding probability at first
+ * bounce) into separate rendering passes. */
+ccl_device_forceinline void guiding_write_debug_passes(KernelGlobals kg,
+                                                       IntegratorState state,
+                                                       ccl_private const ShaderData *sd,
+                                                       ccl_global float *ccl_restrict
+                                                           render_buffer)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+#  ifdef WITH_CYCLES_DEBUG
+  if (!kernel_data.integrator.train_guiding) {
+    return;
+  }
+  /* Only write while the path's bounce count is still zero. */
+  if (INTEGRATOR_STATE(state, path, bounce) != 0) {
+    return;
+  }
+
+  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
+  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+                                        kernel_data.film.pass_stride;
+  ccl_global float *buffer = render_buffer + render_buffer_offset;
+
+  if (kernel_data.film.pass_guiding_probability != PASS_UNUSED) {
+    float guiding_prob = state->guiding.surface_guiding_sampling_prob;
+    film_write_pass_float(buffer + kernel_data.film.pass_guiding_probability, guiding_prob);
+  }
+
+  if (kernel_data.film.pass_guiding_avg_roughness != PASS_UNUSED) {
+    float avg_roughness = 0.0f;
+    float sum_sample_weight = 0.0f;
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        continue;
+      }
+      avg_roughness += sc->sample_weight * bsdf_get_specular_roughness_squared(sc);
+      sum_sample_weight += sc->sample_weight;
+    }
+    /* Sample-weighted mean; the guard tests the weighted sum rather than the divisor. */
+    avg_roughness = avg_roughness > 0.f ? avg_roughness / sum_sample_weight : 0.f;
+
+    film_write_pass_float(buffer + kernel_data.film.pass_guiding_avg_roughness, avg_roughness);
+  }
+#  endif /* WITH_CYCLES_DEBUG */
+#endif
+}
+
+/* Guided BSDFs */
+
+ccl_device_forceinline bool guiding_bsdf_init(KernelGlobals kg,
+                                              IntegratorState state,
+                                              const float3 P,
+                                              const float3 N,
+                                              ccl_private float &rand)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  if (kg->opgl_surface_sampling_distribution->Init(
+          kg->opgl_guiding_field, guiding_point3f(P), rand, true)) {
+    kg->opgl_surface_sampling_distribution->ApplyCosineProduct(guiding_point3f(N));
+    return true;
+  }
+#endif
+  /* Guided surface sampling is compiled out, or Init() returned false at P. */
+  return false;
+}
+
+ccl_device_forceinline float guiding_bsdf_sample(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const float2 rand_bsdf,
+                                                 ccl_private float3 *omega_in)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  pgl_vec3f wo; /* Presumably written by SamplePDF() below. */
+  const pgl_point2f rand = openpgl::cpp::Point2(rand_bsdf.x, rand_bsdf.y);
+  const float pdf = kg->opgl_surface_sampling_distribution->SamplePDF(rand, wo);
+  *omega_in = make_float3(wo.x, wo.y, wo.z);
+  return pdf;
+#else /* Guided sampling is compiled out. */
+  return 0.0f; /* omega_in is left untouched. */
+#endif
+}
+
+ccl_device_forceinline float guiding_bsdf_pdf(KernelGlobals kg,
+                                              IntegratorState state,
+                                              const float3 omega_in)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  return kg->opgl_surface_sampling_distribution->PDF(guiding_vec3f(omega_in));
+#else /* Guided sampling is compiled out: report a zero PDF. */
+  return 0.0f;
+#endif
+}
+
+/* Guided Volume Phases */
+
+ccl_device_forceinline bool guiding_phase_init(KernelGlobals kg,
+                                               IntegratorState state,
+                                               const float3 P,
+                                               const float3 D,
+                                               const float g,
+                                               ccl_private float &rand)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  /* We do not need to guide almost-delta phase functions (|g| >= 0.99). */
+  if (fabsf(g) >= 0.99f) {
+    return false;
+  }
+
+  if (kg->opgl_volume_sampling_distribution->Init(
+          kg->opgl_guiding_field, guiding_point3f(P), rand, true)) {
+    kg->opgl_volume_sampling_distribution->ApplySingleLobeHenyeyGreensteinProduct(guiding_vec3f(D),
+                                                                                  g);
+    return true;
+  }
+#endif
+  /* Guided volume sampling is compiled out, or Init() returned false at P. */
+  return false;
+}
+
+ccl_device_forceinline float guiding_phase_sample(KernelGlobals kg,
+                                                  IntegratorState state,
+                                                  const float2 rand_phase,
+                                                  ccl_private float3 *omega_in)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  pgl_vec3f wo; /* Presumably written by SamplePDF() below. */
+  const pgl_point2f rand = openpgl::cpp::Point2(rand_phase.x, rand_phase.y);
+  const float pdf = kg->opgl_volume_sampling_distribution->SamplePDF(rand, wo);
+  *omega_in = make_float3(wo.x, wo.y, wo.z);
+  return pdf;
+#else /* Guided sampling is compiled out. */
+  return 0.0f; /* omega_in is left untouched. */
+#endif
+}
+
+ccl_device_forceinline float guiding_phase_pdf(KernelGlobals kg,
+                                               IntegratorState state,
+                                               const float3 omega_in)
+{
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  return kg->opgl_volume_sampling_distribution->PDF(guiding_vec3f(omega_in));
+#else /* Guided sampling is compiled out: report a zero PDF. */
+  return 0.0f;
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h
index bf3f41b52b9..667ba949760 100644
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -5,8 +5,8 @@
 
 #include "kernel/camera/camera.h"
 
-#include "kernel/film/accumulate.h"
 #include "kernel/film/adaptive_sampling.h"
+#include "kernel/film/light_passes.h"
 
 #include "kernel/integrator/path_state.h"
 
@@ -92,12 +92,12 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   path_state_init(state, tile, x, y);
 
   /* Check whether the pixel has converged and should not be sampled anymore. */
-  if (!kernel_need_sample_pixel(kg, state, render_buffer)) {
+  if (!film_need_sample_pixel(kg, state, render_buffer)) {
     return false;
   }
 
   /* Always count the sample, even if the camera sample will reject the ray. */
-  const int sample = kernel_accum_sample(
+  const int sample = film_write_sample(
       kg, state, render_buffer, scheduled_sample, tile->sample_offset);
 
   /* Setup render buffers. */
@@ -112,8 +112,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   int prim = __float_as_uint(primitive[1]);
   if (prim == -1) {
     /* Accumulate transparency for empty pixels. */
-    kernel_accum_transparent(kg, state, 0, 1.0f, buffer);
-    return false;
+    film_write_transparent(kg, state, 0, 1.0f, buffer);
+    return true;
   }
 
   prim += kernel_data.bake.tri_offset;
@@ -121,13 +121,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   /* Random number generator. */
   const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
 
-  float filter_x, filter_y;
-  if (sample == 0) {
-    filter_x = filter_y = 0.5f;
-  }
-  else {
-    path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y);
-  }
+  const float2 rand_filter = (sample == 0) ? make_float2(0.5f, 0.5f) :
+                                             path_rng_2D(kg, rng_hash, sample, PRNG_FILTER);
 
   /* Initialize path state for path integration. */
   path_state_init_integrator(kg, state, sample, rng_hash);
@@ -150,11 +145,17 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
 
   /* Sub-pixel offset. */
   if (sample > 0) {
-    u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
-    v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
+    u = bake_clamp_mirror_repeat(u + dudx * (rand_filter.x - 0.5f) + dudy * (rand_filter.y - 0.5f),
+                                 1.0f);
+    v = bake_clamp_mirror_repeat(v + dvdx * (rand_filter.x - 0.5f) + dvdy * (rand_filter.y - 0.5f),
                                  1.0f - u);
   }
 
+  /* Convert from Blender to Cycles/Embree/OptiX barycentric convention. */
+  const float tmp = u;
+  u = v;
+  v = 1.0f - tmp - v;
+
   /* Position and normal on triangle. */
   const int object = kernel_data.bake.object_index;
   float3 P, Ng;
@@ -199,18 +200,61 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
 
     /* Fast path for position and normal passes not affected by shaders. */
     if (kernel_data.film.pass_position != PASS_UNUSED) {
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_position, P);
+      film_write_pass_float3(buffer + kernel_data.film.pass_position, P);
       return true;
     }
     else if (kernel_data.film.pass_normal != PASS_UNUSED && !(shader_flags & SD_HAS_BUMP)) {
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, N);
+      film_write_pass_float3(buffer + kernel_data.film.pass_normal, N);
       return true;
     }
 
     /* Setup ray. */
     Ray ray ccl_optional_struct_init;
-    ray.P = P + N;
-    ray.D = -N;
+
+    if (kernel_data.bake.use_camera) {
+      float3 D = camera_direction_from_point(kg, P);
+
+      const float DN = dot(D, N);
+
+      /* Nudge camera direction, so that the faces facing away from the camera still have
+       * somewhat usable shading. (Otherwise, glossy faces would be simply black.)
+       *
+       * The surface normal offset affects smooth surfaces. Lower values will make
+       * smooth surfaces more faceted, but higher values may show up from the camera
+       * at grazing angles.
+       *
+       * This value can actually be pretty high before it's noticeably wrong. */
+      const float surface_normal_offset = 0.2f;
+
+      /* Keep the ray direction at least `surface_normal_offset` "above" the smooth normal. */
+      if (DN <= surface_normal_offset) {
+        D -= N * (DN - surface_normal_offset);
+        D = normalize(D);
+      }
+
+      /* On the backside, just lerp towards the surface normal for the ray direction,
+       * as DN goes from 0.0 to -1.0. */
+      if (DN <= 0.0f) {
+        D = normalize(mix(D, N, -DN));
+      }
+
+      /* We don't want to bake the back face, so make sure the ray direction never
+       * goes behind the geometry (flat) normal. This is a fail-safe, and should rarely happen. */
+      const float true_normal_epsilon = 0.00001f;
+
+      if (dot(D, Ng) <= true_normal_epsilon) {
+        D -= Ng * (dot(D, Ng) - true_normal_epsilon);
+        D = normalize(D);
+      }
+
+      ray.P = P + D;
+      ray.D = -D;
+    }
+    else {
+      ray.P = P + N;
+      ray.D = -N;
+    }
+
     ray.tmin = 0.0f;
     ray.tmax = FLT_MAX;
     ray.time = 0.5f;
diff --git a/intern/cycles/kernel/integrator/init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h
index e89ab3991c7..8df3e1b9fb3 100644
--- a/intern/cycles/kernel/integrator/init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -5,8 +5,8 @@
 
 #include "kernel/camera/camera.h"
 
-#include "kernel/film/accumulate.h"
 #include "kernel/film/adaptive_sampling.h"
+#include "kernel/film/light_passes.h"
 
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/shadow_catcher.h"
@@ -23,31 +23,21 @@ ccl_device_inline void integrate_camera_sample(KernelGlobals kg,
                                                ccl_private Ray *ray)
 {
   /* Filter sampling. */
-  float filter_u, filter_v;
-
-  if (sample == 0) {
-    filter_u = 0.5f;
-    filter_v = 0.5f;
-  }
-  else {
-    path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v);
-  }
+  const float2 rand_filter = (sample == 0) ? make_float2(0.5f, 0.5f) :
+                                             path_rng_2D(kg, rng_hash, sample, PRNG_FILTER);
 
   /* Depth of field sampling. */
-  float lens_u = 0.0f, lens_v = 0.0f;
-  if (kernel_data.cam.aperturesize > 0.0f) {
-    path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v);
-  }
+  const float2 rand_lens = (kernel_data.cam.aperturesize > 0.0f) ?
+                               path_rng_2D(kg, rng_hash, sample, PRNG_LENS) :
+                               zero_float2();
 
   /* Motion blur time sampling. */
-  float time = 0.0f;
-#ifdef __CAMERA_MOTION__
-  if (kernel_data.cam.shuttertime != -1.0f)
-    time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME);
-#endif
+  const float rand_time = (kernel_data.cam.shuttertime != -1.0f) ?
+                              path_rng_1D(kg, rng_hash, sample, PRNG_TIME) :
+                              0.0f;
 
   /* Generate camera ray. */
-  camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+  camera_sample(kg, x, y, rand_filter.x, rand_filter.y, rand_lens.x, rand_lens.y, rand_time, ray);
 }
 
 /* Return false to indicate that this pixel is finished.
@@ -67,7 +57,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
   path_state_init(state, tile, x, y);
 
   /* Check whether the pixel has converged and should not be sampled anymore. */
-  if (!kernel_need_sample_pixel(kg, state, render_buffer)) {
+  if (!film_need_sample_pixel(kg, state, render_buffer)) {
     return false;
   }
 
@@ -76,7 +66,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
    * This logic allows to both count actual number of samples per pixel, and to add samples to this
    * pixel after it was converged and samples were added somewhere else (in which case the
    * `scheduled_sample` will be different from actual number of samples in this pixel). */
-  const int sample = kernel_accum_sample(
+  const int sample = film_write_sample(
       kg, state, render_buffer, scheduled_sample, tile->sample_offset);
 
   /* Initialize random number seed for path. */
diff --git a/intern/cycles/kernel/integrator/intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h
index 60299f2cb2f..b9a81e25bcc 100644
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -5,13 +5,14 @@
 
 #include "kernel/camera/projection.h"
 
+#include "kernel/film/light_passes.h"
+
+#include "kernel/integrator/guiding.h"
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/shadow_catcher.h"
 
 #include "kernel/light/light.h"
 
-#include "kernel/util/differential.h"
-
 #include "kernel/geom/geom.h"
 
 #include "kernel/bvh/bvh.h"
@@ -48,13 +49,15 @@ ccl_device_forceinline bool integrator_intersect_terminate(KernelGlobals kg,
    * surfaces in front of emission do we need to evaluate the shader, since we
    * perform MIS as part of indirect rays. */
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-  const float probability = path_state_continuation_probability(kg, state, path_flag);
-  INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = probability;
+  const float continuation_probability = path_state_continuation_probability(kg, state, path_flag);
+  INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = continuation_probability;
+
+  guiding_record_continuation_probability(kg, state, continuation_probability);
 
-  if (probability != 1.0f) {
+  if (continuation_probability != 1.0f) {
     const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
 
-    if (probability == 0.0f || terminate >= probability) {
+    if (continuation_probability == 0.0f || terminate >= continuation_probability) {
       if (shader_flags & SD_HAS_EMISSION) {
         /* Mark path to be terminated right after shader evaluation on the surface. */
         INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
@@ -87,7 +90,7 @@ ccl_device_forceinline void integrator_split_shadow_catcher(
     return;
   }
 
-  kernel_write_shadow_catcher_bounce_data(kg, state, render_buffer);
+  film_write_shadow_catcher_bounce_data(kg, state, render_buffer);
 
   /* Mark state as having done a shadow catcher split so that it stops contributing to
    * the shadow catcher matte pass, but keeps contributing to the combined pass. */
diff --git a/intern/cycles/kernel/integrator/intersect_shadow.h b/intern/cycles/kernel/integrator/intersect_shadow.h
index 1b48b360858..25ff3d5b23f 100644
--- a/intern/cycles/kernel/integrator/intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/intersect_shadow.h
@@ -51,7 +51,7 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k
 }
 
 #ifdef __TRANSPARENT_SHADOWS__
-#  if defined(__KERNEL_CPU__)
+#  ifndef __KERNEL_GPU__
 ccl_device int shadow_intersections_compare(const void *a, const void *b)
 {
   const Intersection *isect_a = (const Intersection *)a;
diff --git a/intern/cycles/kernel/integrator/intersect_volume_stack.h b/intern/cycles/kernel/integrator/intersect_volume_stack.h
index 9ba4a0a3964..c2490581e4d 100644
--- a/intern/cycles/kernel/integrator/intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -5,7 +5,6 @@
 
 #include "kernel/bvh/bvh.h"
 #include "kernel/geom/geom.h"
-#include "kernel/integrator/shader_eval.h"
 #include "kernel/integrator/volume_stack.h"
 
 CCL_NAMESPACE_BEGIN
@@ -38,8 +37,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
 
 #ifdef __VOLUME_RECORD_ALL__
   Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
-  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
+  uint num_hits = scene_intersect_volume(kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
   if (num_hits > 0) {
     Intersection *isect = hits;
 
@@ -108,8 +106,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
 
 #ifdef __VOLUME_RECORD_ALL__
   Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
-  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
+  uint num_hits = scene_intersect_volume(kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
   if (num_hits > 0) {
     int enclosed_volumes[MAX_VOLUME_STACK_SIZE];
     Intersection *isect = hits;
diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h
index f5d2bcfe9f2..142977f1ac7 100644
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -186,7 +186,7 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
     triangle_vertices_and_normals(kg, sd_vtx->prim, verts, normals);
 
     /* Compute refined position (same code as in triangle_point_from_uv). */
-    sd_vtx->P = isect->u * verts[0] + isect->v * verts[1] + (1.f - isect->u - isect->v) * verts[2];
+    sd_vtx->P = (1.f - isect->u - isect->v) * verts[0] + isect->u * verts[1] + isect->v * verts[2];
     if (!(sd_vtx->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
       const Transform tfm = object_get_transform(kg, sd_vtx);
       sd_vtx->P = transform_point(&tfm, sd_vtx->P);
@@ -213,8 +213,8 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
   }
 
   /* Tangent space (position derivatives) WRT barycentric (u, v). */
-  float3 dp_du = verts[0] - verts[2];
-  float3 dp_dv = verts[1] - verts[2];
+  float3 dp_du = verts[1] - verts[0];
+  float3 dp_dv = verts[2] - verts[0];
 
   /* Geometric normal. */
   vtx->ng = normalize(cross(dp_du, dp_dv));
@@ -223,16 +223,16 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
 
   /* Shading normals: Interpolate normals between vertices. */
   float n_len;
-  vtx->n = normalize_len(normals[0] * sd_vtx->u + normals[1] * sd_vtx->v +
-                             normals[2] * (1.0f - sd_vtx->u - sd_vtx->v),
+  vtx->n = normalize_len(normals[0] * (1.0f - sd_vtx->u - sd_vtx->v) + normals[1] * sd_vtx->u +
+                             normals[2] * sd_vtx->v,
                          &n_len);
 
   /* Shading normal derivatives WRT barycentric (u, v)
-   * we calculate the derivative of n = |u*n0 + v*n1 + (1-u-v)*n2| using:
+   * we calculate the derivative of n = |(1-u-v)*n0 + u*n1 + v*n2| using:
    * d/du [f(u)/|f(u)|] = [d/du f(u)]/|f(u)| - f(u)/|f(u)|^3 <f(u), d/du f(u)>. */
   const float inv_n_len = 1.f / n_len;
-  float3 dn_du = inv_n_len * (normals[0] - normals[2]);
-  float3 dn_dv = inv_n_len * (normals[1] - normals[2]);
+  float3 dn_du = inv_n_len * (normals[1] - normals[0]);
+  float3 dn_dv = inv_n_len * (normals[2] - normals[0]);
   dn_du -= vtx->n * dot(vtx->n, dn_du);
   dn_dv -= vtx->n * dot(vtx->n, dn_dv);
 
@@ -279,7 +279,15 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
 }
 
 /* Compute constraint derivatives. */
-ccl_device_forceinline bool mnee_compute_constraint_derivatives(
+
+#  if defined(__KERNEL_METAL__)
+/* Temporary workaround for front-end compilation bug (incorrect MNEE rendering when this is
+ * inlined). */
+__attribute__((noinline))
+#  else
+ccl_device_forceinline
+#  endif
+bool mnee_compute_constraint_derivatives(
     int vertex_count,
     ccl_private ManifoldVertex *vertices,
     ccl_private const float3 &surface_sample_pos,
@@ -392,7 +400,7 @@ ccl_device_forceinline bool mnee_compute_constraint_derivatives(
 /* Invert (block) constraint derivative matrix and solve linear system so we can map dh back to dx:
  *  dh / dx = A
  *  dx = inverse(A) x dh
- *  to use for specular specular manifold walk
+ *  to use for specular manifold walk
  * (See for example http://faculty.washington.edu/finlayso/ebook/algebraic/advanced/LUtri.htm
  *  for block tridiagonal matrix based linear system solve) */
 ccl_device_forceinline bool mnee_solve_matrix_h_to_x(int vertex_count,
@@ -634,9 +642,9 @@ mnee_sample_bsdf_dh(ClosureType type, float alpha_x, float alpha_y, float sample
  * We assume here that the pdf (in half-vector measure) is the same as
  * the one calculation when sampling the microfacet normals from the
  * specular chain above: this allows us to simplify the bsdf weight */
-ccl_device_forceinline float3 mnee_eval_bsdf_contribution(ccl_private ShaderClosure *closure,
-                                                          float3 wi,
-                                                          float3 wo)
+ccl_device_forceinline Spectrum mnee_eval_bsdf_contribution(ccl_private ShaderClosure *closure,
+                                                            float3 wi,
+                                                            float3 wo)
 {
   ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)closure;
 
@@ -808,7 +816,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
   float3 wo = normalize_len(vertices[0].p - sd->P, &wo_len);
 
   /* Initialize throughput and evaluate receiver bsdf * |n.wo|. */
-  shader_bsdf_eval(kg, sd, wo, false, throughput, ls->shader);
+  surface_shader_bsdf_eval(kg, state, sd, wo, throughput, ls->shader);
 
-  /* Update light sample with new position / direct.ion
+  /* Update light sample with new position / direction
    * and keep pdf in vertex area measure */
@@ -836,7 +844,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
                                                              1;
   INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + vertex_count;
 
-  float3 light_eval = light_sample_shader_eval(kg, state, sd_mnee, ls, sd->time);
+  Spectrum light_eval = light_sample_shader_eval(kg, state, sd_mnee, ls, sd->time);
   bsdf_eval_mul(throughput, light_eval / ls->pdf);
 
   /* Generalized geometry term. */
@@ -914,7 +922,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
     INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + 1 + vi;
 
     /* Evaluate shader nodes at solution vi. */
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+    surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
         kg, state, sd_mnee, NULL, PATH_RAY_DIFFUSE, true);
 
     /* Set light looking dir. */
@@ -925,7 +933,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
     /* Evaluate product term inside eq.6 at solution interface. vi
      * divided by corresponding sampled pdf:
      * fr(vi)_do / pdf_dh(vi) x |do/dh| x |n.wo / n.h| */
-    float3 bsdf_contribution = mnee_eval_bsdf_contribution(v.bsdf, wi, wo);
+    Spectrum bsdf_contribution = mnee_eval_bsdf_contribution(v.bsdf, wi, wo);
     bsdf_eval_mul(throughput, bsdf_contribution);
   }
 
@@ -1007,7 +1015,7 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
         return 0;
 
       /* Last bool argument is the MNEE flag (for TINY_MAX_CLOSURE cap in kernel_shader.h). */
-      shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+      surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
           kg, state, sd_mnee, NULL, PATH_RAY_DIFFUSE, true);
 
       /* Get and sample refraction bsdf */
@@ -1034,10 +1042,12 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
           float2 h = zero_float2();
           if (microfacet_bsdf->alpha_x > 0.f && microfacet_bsdf->alpha_y > 0.f) {
             /* Sample transmissive microfacet bsdf. */
-            float bsdf_u, bsdf_v;
-            path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-            h = mnee_sample_bsdf_dh(
-                bsdf->type, microfacet_bsdf->alpha_x, microfacet_bsdf->alpha_y, bsdf_u, bsdf_v);
+            const float2 bsdf_uv = path_state_rng_2D(kg, rng_state, PRNG_SURFACE_BSDF);
+            h = mnee_sample_bsdf_dh(bsdf->type,
+                                    microfacet_bsdf->alpha_x,
+                                    microfacet_bsdf->alpha_y,
+                                    bsdf_uv.x,
+                                    bsdf_uv.y);
           }
 
           /* Setup differential geometry on vertex. */
diff --git a/intern/cycles/kernel/integrator/path_state.h b/intern/cycles/kernel/integrator/path_state.h
index 912c380cdb6..7197f0f2f3a 100644
--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -13,7 +13,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init_queues(IntegratorState state)
 {
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
   INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
   INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
 #endif
@@ -48,13 +48,25 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, path, volume_bounce) = 0;
   INTEGRATOR_STATE_WRITE(state, path, volume_bounds_bounce) = 0;
   INTEGRATOR_STATE_WRITE(state, path, rng_hash) = rng_hash;
-  INTEGRATOR_STATE_WRITE(state, path, rng_offset) = PRNG_BASE_NUM;
+  INTEGRATOR_STATE_WRITE(state, path, rng_offset) = PRNG_BOUNCE_NUM;
   INTEGRATOR_STATE_WRITE(state, path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
                                               PATH_RAY_TRANSPARENT_BACKGROUND;
   INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = 0.0f;
   INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = FLT_MAX;
   INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f;
-  INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
+  INTEGRATOR_STATE_WRITE(state, path, throughput) = one_spectrum();
+
+#ifdef __PATH_GUIDING__
+  INTEGRATOR_STATE_WRITE(state, path, unguided_throughput) = 1.0f;
+  INTEGRATOR_STATE_WRITE(state, guiding, path_segment) = nullptr;
+  INTEGRATOR_STATE_WRITE(state, guiding, use_surface_guiding) = false;
+  INTEGRATOR_STATE_WRITE(state, guiding, sample_surface_guiding_rand) = 0.5f;
+  INTEGRATOR_STATE_WRITE(state, guiding, surface_guiding_sampling_prob) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, guiding, bssrdf_sampling_prob) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, guiding, use_volume_guiding) = false;
+  INTEGRATOR_STATE_WRITE(state, guiding, sample_volume_guiding_rand) = 0.5f;
+  INTEGRATOR_STATE_WRITE(state, guiding, volume_guiding_sampling_prob) = 0.0f;
+#endif
 
 #ifdef __MNEE__
   INTEGRATOR_STATE_WRITE(state, path, mnee) = 0;
@@ -74,7 +86,7 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
 #ifdef __DENOISING_FEATURES__
   if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) {
     INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_DENOISING_FEATURES;
-    INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) = one_float3();
+    INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) = one_spectrum();
   }
 #endif
 }
@@ -249,7 +261,11 @@ ccl_device_inline float path_state_continuation_probability(KernelGlobals kg,
 
   /* Probabilistic termination: use sqrt() to roughly match typical view
    * transform and do path termination a bit later on average. */
-  return min(sqrtf(reduce_max(fabs(INTEGRATOR_STATE(state, path, throughput)))), 1.0f);
+  Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  throughput *= INTEGRATOR_STATE(state, path, unguided_throughput);
+#endif
+  return min(sqrtf(reduce_max(fabs(throughput))), 1.0f);
 }
 
 ccl_device_inline bool path_state_ao_bounce(KernelGlobals kg, ConstIntegratorState state)
@@ -298,38 +314,25 @@ ccl_device_inline void shadow_path_state_rng_load(ConstIntegratorShadowState sta
 
 ccl_device_inline float path_state_rng_1D(KernelGlobals kg,
                                           ccl_private const RNGState *rng_state,
-                                          int dimension)
+                                          const int dimension)
 {
   return path_rng_1D(
       kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals kg,
-                                         ccl_private const RNGState *rng_state,
-                                         int dimension,
-                                         ccl_private float *fx,
-                                         ccl_private float *fy)
-{
-  path_rng_2D(
-      kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy);
-}
-
-ccl_device_inline float path_state_rng_1D_hash(KernelGlobals kg,
-                                               ccl_private const RNGState *rng_state,
-                                               uint hash)
+ccl_device_inline float2 path_state_rng_2D(KernelGlobals kg,
+                                           ccl_private const RNGState *rng_state,
+                                           const int dimension)
 {
-  /* Use a hash instead of dimension, this is not great but avoids adding
-   * more dimensions to each bounce which reduces quality of dimensions we
-   * are already using. */
-  return path_rng_1D(
-      kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset);
+  return path_rng_2D(
+      kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
 }
 
 ccl_device_inline float path_branched_rng_1D(KernelGlobals kg,
                                              ccl_private const RNGState *rng_state,
-                                             int branch,
-                                             int num_branches,
-                                             int dimension)
+                                             const int branch,
+                                             const int num_branches,
+                                             const int dimension)
 {
   return path_rng_1D(kg,
                      rng_state->rng_hash,
@@ -337,20 +340,16 @@ ccl_device_inline float path_branched_rng_1D(KernelGlobals kg,
                      rng_state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals kg,
-                                            ccl_private const RNGState *rng_state,
-                                            int branch,
-                                            int num_branches,
-                                            int dimension,
-                                            ccl_private float *fx,
-                                            ccl_private float *fy)
+ccl_device_inline float2 path_branched_rng_2D(KernelGlobals kg,
+                                              ccl_private const RNGState *rng_state,
+                                              const int branch,
+                                              const int num_branches,
+                                              const int dimension)
 {
-  path_rng_2D(kg,
-              rng_state->rng_hash,
-              rng_state->sample * num_branches + branch,
-              rng_state->rng_offset + dimension,
-              fx,
-              fy);
+  return path_rng_2D(kg,
+                     rng_state->rng_hash,
+                     rng_state->sample * num_branches + branch,
+                     rng_state->rng_offset + dimension);
 }
 
 /* Utility functions to get light termination value,
diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h
index a7edfffd175..8fc5689683a 100644
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -3,18 +3,20 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/film/light_passes.h"
+
+#include "kernel/integrator/guiding.h"
+#include "kernel/integrator/surface_shader.h"
+
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
-                                                    IntegratorState state,
-                                                    ccl_global float *ccl_restrict render_buffer)
+ccl_device Spectrum integrator_eval_background_shader(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      ccl_global float *ccl_restrict render_buffer)
 {
-#ifdef __BACKGROUND__
   const int shader = kernel_data.background.surface_shader;
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
 
@@ -26,55 +28,35 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
         ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
         ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
         ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
-      return zero_float3();
+      return zero_spectrum();
   }
 
   /* Use fast constant background color if available. */
-  float3 L = zero_float3();
-  if (!shader_constant_emission_eval(kg, shader, &L)) {
-    /* Evaluate background shader. */
-
-    /* TODO: does aliasing like this break automatic SoA in CUDA?
-     * Should we instead store closures separate from ShaderData? */
-    ShaderDataTinyStorage emission_sd_storage;
-    ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-
-    PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
-    shader_setup_from_background(kg,
-                                 emission_sd,
-                                 INTEGRATOR_STATE(state, ray, P),
-                                 INTEGRATOR_STATE(state, ray, D),
-                                 INTEGRATOR_STATE(state, ray, time));
-
-    PROFILING_SHADER(emission_sd->object, emission_sd->shader);
-    PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND>(
-        kg, state, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
-
-    L = shader_background_eval(emission_sd);
+  Spectrum L = zero_spectrum();
+  if (surface_shader_constant_emission(kg, shader, &L)) {
+    return L;
   }
 
-  /* Background MIS weights. */
-#  ifdef __BACKGROUND_MIS__
-  /* Check if background light exists or if we should skip pdf. */
-  if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_MIS_SKIP) &&
-      kernel_data.background.use_mis) {
-    const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
-    const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
-    const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-
-    /* multiple importance sampling, get background light pdf for ray
-     * direction, and compute weight with respect to BSDF pdf */
-    const float pdf = background_light_pdf(kg, ray_P, ray_D);
-    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
-    L *= mis_weight;
-  }
-#  endif
+  /* Evaluate background shader. */
 
-  return L;
-#else
-  return make_float3(0.8f, 0.8f, 0.8f);
-#endif
+  /* TODO: does aliasing like this break automatic SoA in CUDA?
+   * Should we instead store closures separate from ShaderData? */
+  ShaderDataTinyStorage emission_sd_storage;
+  ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+
+  PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
+  shader_setup_from_background(kg,
+                               emission_sd,
+                               INTEGRATOR_STATE(state, ray, P),
+                               INTEGRATOR_STATE(state, ray, D),
+                               INTEGRATOR_STATE(state, ray, time));
+
+  PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+  PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+  surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND>(
+      kg, state, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
+
+  return surface_shader_background(emission_sd);
 }
 
 ccl_device_inline void integrate_background(KernelGlobals kg,
@@ -117,17 +99,38 @@ ccl_device_inline void integrate_background(KernelGlobals kg,
 #endif /* __MNEE__ */
 
   /* Evaluate background shader. */
-  float3 L = (eval_background) ? integrator_eval_background_shader(kg, state, render_buffer) :
-                                 zero_float3();
+  Spectrum L = zero_spectrum();
+
+  if (eval_background) {
+    L = integrator_eval_background_shader(kg, state, render_buffer);
+
+    /* When using the ao bounces approximation, adjust background
+     * shader intensity with ao factor. */
+    if (path_state_ao_bounce(kg, state)) {
+      L *= kernel_data.integrator.ao_bounces_factor;
+    }
+
+    /* Background MIS weights. */
+    float mis_weight = 1.0f;
+    /* Check if background light exists or if we should skip pdf. */
+    if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_MIS_SKIP) &&
+        kernel_data.background.use_mis) {
+      const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
+      const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
+      const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
+
+      /* multiple importance sampling, get background light pdf for ray
+       * direction, and compute weight with respect to BSDF pdf */
+      const float pdf = background_light_pdf(kg, ray_P, ray_D);
+      mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
+    }
 
-  /* When using the ao bounces approximation, adjust background
-   * shader intensity with ao factor. */
-  if (path_state_ao_bounce(kg, state)) {
-    L *= kernel_data.integrator.ao_bounces_factor;
+    guiding_record_background(kg, state, L, mis_weight);
+    L *= mis_weight;
   }
 
   /* Write to render buffer. */
-  kernel_accum_background(kg, state, L, transparent, is_transparent_background_ray, render_buffer);
+  film_write_background(kg, state, L, transparent, is_transparent_background_ray, render_buffer);
 }
 
 ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
@@ -169,24 +172,24 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
       /* TODO: does aliasing like this break automatic SoA in CUDA? */
       ShaderDataTinyStorage emission_sd_storage;
       ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-      float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
+      Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
       if (is_zero(light_eval)) {
         return;
       }
 
       /* MIS weighting. */
+      float mis_weight = 1.0f;
       if (!(path_flag & PATH_RAY_MIS_SKIP)) {
         /* multiple importance sampling, get regular light pdf,
          * and compute weight with respect to BSDF pdf */
         const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-        const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
-        light_eval *= mis_weight;
+        mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
       }
 
       /* Write to render buffer. */
-      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-      kernel_accum_emission(
-          kg, state, throughput * light_eval, render_buffer, kernel_data.background.lightgroup);
+      guiding_record_background(kg, state, light_eval, mis_weight);
+      film_write_surface_emission(
+          kg, state, light_eval, mis_weight, render_buffer, kernel_data.background.lightgroup);
     }
   }
 }
diff --git a/intern/cycles/kernel/integrator/shade_light.h b/intern/cycles/kernel/integrator/shade_light.h
index 910e3383f51..e0b0500dc78 100644
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/film/light_passes.h"
+#include "kernel/integrator/surface_shader.h"
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
 
@@ -18,6 +18,8 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
   Intersection isect ccl_optional_struct_init;
   integrator_state_read_isect(kg, state, &isect);
 
+  guiding_record_light_surface_segment(kg, state, &isect);
+
   float3 ray_P = INTEGRATOR_STATE(state, ray, P);
   const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
   const float ray_time = INTEGRATOR_STATE(state, ray, time);
@@ -51,23 +53,23 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
   /* TODO: does aliasing like this break automatic SoA in CUDA? */
   ShaderDataTinyStorage emission_sd_storage;
   ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-  float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
+  Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
   if (is_zero(light_eval)) {
     return;
   }
 
   /* MIS weighting. */
+  float mis_weight = 1.0f;
   if (!(path_flag & PATH_RAY_MIS_SKIP)) {
     /* multiple importance sampling, get regular light pdf,
      * and compute weight with respect to BSDF pdf */
     const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
-    light_eval *= mis_weight;
+    mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
   }
 
   /* Write to render buffer. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_accum_emission(kg, state, throughput * light_eval, render_buffer, ls.group);
+  guiding_record_surface_emission(kg, state, light_eval, mis_weight);
+  film_write_surface_emission(kg, state, light_eval, mis_weight, render_buffer, ls.group);
 }
 
 ccl_device void integrator_shade_light(KernelGlobals kg,
diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h
index 4b002a47bee..bedb15ddf89 100644
--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -3,8 +3,9 @@
 
 #pragma once
 
+#include "kernel/integrator/guiding.h"
 #include "kernel/integrator/shade_volume.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/surface_shader.h"
 #include "kernel/integrator/volume_stack.h"
 
 CCL_NAMESPACE_BEGIN
@@ -15,9 +16,9 @@ ccl_device_inline bool shadow_intersections_has_remaining(const uint num_hits)
 }
 
 #ifdef __TRANSPARENT_SHADOWS__
-ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
-                                                              IntegratorShadowState state,
-                                                              const int hit)
+ccl_device_inline Spectrum integrate_transparent_surface_shadow(KernelGlobals kg,
+                                                                IntegratorShadowState state,
+                                                                const int hit)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);
 
@@ -40,7 +41,7 @@ ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
 
   /* Evaluate shader. */
   if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+    surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
         kg, state, shadow_sd, NULL, PATH_RAY_SHADOW);
   }
 
@@ -50,7 +51,7 @@ ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
 #  endif
 
   /* Compute transparency from closures. */
-  return shader_bsdf_transparency(kg, shadow_sd);
+  return surface_shader_transparency(kg, shadow_sd);
 }
 
 #  ifdef __VOLUME__
@@ -58,7 +59,7 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
                                                            IntegratorShadowState state,
                                                            const int hit,
                                                            const int num_recorded_hits,
-                                                           ccl_private float3 *ccl_restrict
+                                                           ccl_private Spectrum *ccl_restrict
                                                                throughput)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME);
@@ -100,7 +101,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
     if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) {
 #  ifdef __VOLUME__
       if (!integrator_state_shadow_volume_stack_is_empty(kg, state)) {
-        float3 throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
+        Spectrum throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
         integrate_transparent_volume_shadow(kg, state, hit, num_recorded_hits, &throughput);
         if (is_zero(throughput)) {
           return true;
@@ -113,8 +114,8 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
 
     /* Surface shaders. */
     if (hit < num_recorded_hits) {
-      const float3 shadow = integrate_transparent_surface_shadow(kg, state, hit);
-      const float3 throughput = INTEGRATOR_STATE(state, shadow_path, throughput) * shadow;
+      const Spectrum shadow = integrate_transparent_surface_shadow(kg, state, hit);
+      const Spectrum throughput = INTEGRATOR_STATE(state, shadow_path, throughput) * shadow;
       if (is_zero(throughput)) {
         return true;
       }
@@ -165,7 +166,8 @@ ccl_device void integrator_shade_shadow(KernelGlobals kg,
     return;
   }
   else {
-    kernel_accum_light(kg, state, render_buffer);
+    guiding_record_direct_light(kg, state);
+    film_write_direct_light(kg, state, render_buffer);
     integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
     return;
   }
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index 1514b3956ad..067d35ef9e3 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -3,14 +3,16 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/film/passes.h"
+#include "kernel/film/data_passes.h"
+#include "kernel/film/denoising_passes.h"
+#include "kernel/film/light_passes.h"
 
 #include "kernel/integrator/mnee.h"
 
+#include "kernel/integrator/guiding.h"
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
 #include "kernel/integrator/subsurface.h"
+#include "kernel/integrator/surface_shader.h"
 #include "kernel/integrator/volume_stack.h"
 
 #include "kernel/light/light.h"
@@ -31,7 +33,52 @@ ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
   shader_setup_from_ray(kg, sd, &ray, &isect);
 }
 
-#ifdef __HOLDOUT__
+ccl_device_forceinline float3 integrate_surface_ray_offset(KernelGlobals kg,
+                                                           const ccl_private ShaderData *sd,
+                                                           const float3 ray_P,
+                                                           const float3 ray_D)
+{
+  /* Only triangles are tested; for other primitive types ray_P is returned unchanged. */
+  if (!(sd->type & PRIMITIVE_TRIANGLE)) {
+    return ray_P;
+  }
+
+  /* Self intersection tests already account for the case where a ray hits the
+   * same primitive. However precision issues can still cause neighboring
+   * triangles to be hit. Here we test if the ray-triangle intersection with
+   * the same primitive would miss, implying that a neighboring triangle would
+   * be hit instead. Only in that case is ray_offset() applied, using sd->Ng.
+   *
+   * This relies on triangle intersection to be watertight, and the inverse
+   * object transform to match the one used by ray intersection exactly.
+   *
+   * Potential improvements:
+   * - It appears this happens when either barycentric coordinates are small,
+   *   or dot(sd->Ng, ray_D) is small. Detect such cases and skip test?
+   * - Instead of ray offset, can we tweak P to lie within the triangle?
+   */
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, sd->prim).w;
+  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
+                      tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
+                      tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+
+  float3 local_ray_P = ray_P;
+  float3 local_ray_D = ray_D;
+
+  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+    const Transform itfm = object_get_inverse_transform(kg, sd);
+    local_ray_P = transform_point(&itfm, local_ray_P);
+    local_ray_D = transform_direction(&itfm, local_ray_D);
+  }
+
+  if (ray_triangle_intersect_self(local_ray_P, local_ray_D, tri_a, tri_b, tri_c)) {
+    return ray_P;
+  }
+  else {
+    return ray_offset(ray_P, sd->Ng);
+  }
+}
+
 ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg,
                                                       ConstIntegratorState state,
                                                       ccl_private ShaderData *sd,
@@ -42,24 +89,20 @@ ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg,
 
   if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
       (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
-    const float3 holdout_weight = shader_holdout_apply(kg, sd);
-    if (kernel_data.background.transparent) {
-      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-      const float transparent = average(holdout_weight * throughput);
-      kernel_accum_holdout(kg, state, path_flag, transparent, render_buffer);
-    }
-    if (isequal(holdout_weight, one_float3())) {
+    const Spectrum holdout_weight = surface_shader_apply_holdout(kg, sd);
+    const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+    const float transparent = average(holdout_weight * throughput);
+    film_write_holdout(kg, state, path_flag, transparent, render_buffer);
+    if (isequal(holdout_weight, one_spectrum())) {
       return false;
     }
   }
 
   return true;
 }
-#endif /* __HOLDOUT__ */
 
-#ifdef __EMISSION__
 ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
-                                                       ConstIntegratorState state,
+                                                       IntegratorState state,
                                                        ccl_private const ShaderData *sd,
                                                        ccl_global float *ccl_restrict
                                                            render_buffer)
@@ -67,14 +110,15 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
 
   /* Evaluate emissive closure. */
-  float3 L = shader_emissive_eval(sd);
+  Spectrum L = surface_shader_emission(sd);
+  float mis_weight = 1.0f;
 
-#  ifdef __HAIR__
+#ifdef __HAIR__
   if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
       (sd->type & PRIMITIVE_TRIANGLE))
-#  else
+#else
   if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
-#  endif
+#endif
   {
     const float bsdf_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
     const float t = sd->ray_length;
@@ -82,17 +126,14 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
     /* Multiple importance sampling, get triangle light pdf,
      * and compute weight with respect to BSDF pdf. */
     float pdf = triangle_light_pdf(kg, sd, t);
-    float mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
-    L *= mis_weight;
+    mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
   }
 
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_accum_emission(
-      kg, state, throughput * L, render_buffer, object_lightgroup(kg, sd->object));
+  guiding_record_surface_emission(kg, state, L, mis_weight);
+  film_write_surface_emission(
+      kg, state, L, mis_weight, render_buffer, object_lightgroup(kg, sd->object));
 }
-#endif /* __EMISSION__ */
 
-#ifdef __EMISSION__
 /* Path tracing: sample point on light and evaluate light shader, then
  * queue shadow ray to be traced. */
 template<uint node_feature_mask>
@@ -111,11 +152,10 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   {
     const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
     const uint bounce = INTEGRATOR_STATE(state, path, bounce);
-    float light_u, light_v;
-    path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+    const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);
 
     if (!light_distribution_sample_from_position(
-            kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) {
+            kg, rand_light.x, rand_light.y, sd->time, sd->P, bounce, path_flag, &ls)) {
       return;
     }
   }
@@ -133,9 +173,10 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
 
   Ray ray ccl_optional_struct_init;
   BsdfEval bsdf_eval ccl_optional_struct_init;
-  const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D);
 
-#  ifdef __MNEE__
+  const bool is_transmission = dot(ls.D, sd->N) < 0.0f;
+
+#ifdef __MNEE__
   int mnee_vertex_count = 0;
   IF_KERNEL_FEATURE(MNEE)
   {
@@ -144,13 +185,15 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
       const bool use_caustics = kernel_data_fetch(lights, ls.lamp).use_caustics;
       if (use_caustics) {
         /* Are we on a caustic caster? */
-        if (is_transmission && (sd->object_flag & SD_OBJECT_CAUSTICS_CASTER))
+        if (is_transmission && (sd->object_flag & SD_OBJECT_CAUSTICS_CASTER)) {
           return;
+        }
 
         /* Are we on a caustic receiver? */
-        if (!is_transmission && (sd->object_flag & SD_OBJECT_CAUSTICS_RECEIVER))
+        if (!is_transmission && (sd->object_flag & SD_OBJECT_CAUSTICS_RECEIVER)) {
           mnee_vertex_count = kernel_path_mnee_sample(
               kg, state, sd, emission_sd, rng_state, &ls, &bsdf_eval);
+        }
       }
     }
   }
@@ -161,15 +204,15 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
     light_sample_to_surface_shadow_ray(kg, emission_sd, &ls, &ray);
   }
   else
-#  endif /* __MNEE__ */
+#endif /* __MNEE__ */
   {
-    const float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, sd->time);
+    const Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, sd->time);
     if (is_zero(light_eval)) {
       return;
     }
 
     /* Evaluate BSDF. */
-    const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader);
+    const float bsdf_pdf = surface_shader_bsdf_eval(kg, state, sd, ls.D, &bsdf_eval, ls.shader);
     bsdf_eval_mul(&bsdf_eval, light_eval / ls.pdf);
 
     if (ls.shader & SHADER_USE_MIS) {
@@ -197,9 +240,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
 
   if (is_transmission) {
-#  ifdef __VOLUME__
+#ifdef __VOLUME__
     shadow_volume_stack_enter_exit(kg, shadow_state, sd);
-#  endif
+#endif
+  }
+
+  if (ray.self.object != OBJECT_NONE) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
   }
 
   /* Write shadow ray and associated state to global memory. */
@@ -213,11 +260,12 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   /* Copy state from main path to shadow path. */
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);
+  const Spectrum unlit_throughput = INTEGRATOR_STATE(state, path, throughput);
+  const Spectrum throughput = unlit_throughput * bsdf_eval_sum(&bsdf_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    packed_float3 pass_diffuse_weight;
-    packed_float3 pass_glossy_weight;
+    PackedSpectrum pass_diffuse_weight;
+    PackedSpectrum pass_glossy_weight;
 
     if (shadow_flag & PATH_RAY_ANY_PASS) {
       /* Indirect bounce, use weights from earlier surface or volume bounce. */
@@ -227,8 +275,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
     else {
       /* Direct light, use BSDFs at this bounce. */
       shadow_flag |= PATH_RAY_SURFACE_PASS;
-      pass_diffuse_weight = packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval));
-      pass_glossy_weight = packed_float3(bsdf_eval_pass_glossy_weight(&bsdf_eval));
+      pass_diffuse_weight = PackedSpectrum(bsdf_eval_pass_diffuse_weight(&bsdf_eval));
+      pass_glossy_weight = PackedSpectrum(bsdf_eval_pass_glossy_weight(&bsdf_eval));
     }
 
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
@@ -250,7 +298,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, glossy_bounce) = INTEGRATOR_STATE(
       state, path, glossy_bounce);
 
-#  ifdef __MNEE__
+#ifdef __MNEE__
   if (mnee_vertex_count > 0) {
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) =
         INTEGRATOR_STATE(state, path, transmission_bounce) + mnee_vertex_count - 1;
@@ -262,7 +310,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
                            bounce) = INTEGRATOR_STATE(state, path, bounce) + mnee_vertex_count;
   }
   else
-#  endif
+#endif
   {
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) = INTEGRATOR_STATE(
         state, path, transmission_bounce);
@@ -283,8 +331,12 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
       shadow_state, shadow_path, lightgroup) = (ls.type != LIGHT_BACKGROUND) ?
                                                    ls.group + 1 :
                                                    kernel_data.background.lightgroup + 1;
-}
+#ifdef __PATH_GUIDING__
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unlit_throughput) = unlit_throughput;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = INTEGRATOR_STATE(
+      state, guiding, path_segment);
 #endif
+}
 
 /* Path tracing: bounce off or through surface with new direction. */
 ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
@@ -298,9 +350,8 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
     return LABEL_NONE;
   }
 
-  float bsdf_u, bsdf_v;
-  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-  ccl_private const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u);
+  float2 rand_bsdf = path_state_rng_2D(kg, rng_state, PRNG_SURFACE_BSDF);
+  ccl_private const ShaderClosure *sc = surface_shader_bsdf_bssrdf_pick(sd, &rand_bsdf);
 
 #ifdef __SUBSURFACE__
   /* BSSRDF closure, we schedule subsurface intersection kernel. */
@@ -310,17 +361,52 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
 #endif
 
   /* BSDF closure, sample direction. */
-  float bsdf_pdf;
+  float bsdf_pdf = 0.0f, unguided_bsdf_pdf = 0.0f;
   BsdfEval bsdf_eval ccl_optional_struct_init;
   float3 bsdf_omega_in ccl_optional_struct_init;
-  differential3 bsdf_domega_in ccl_optional_struct_init;
   int label;
 
-  label = shader_bsdf_sample_closure(
-      kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+  float2 bsdf_sampled_roughness = make_float2(1.0f, 1.0f);
+  float bsdf_eta = 1.0f;
+
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  if (kernel_data.integrator.use_surface_guiding) {
+    label = surface_shader_bsdf_guided_sample_closure(kg,
+                                                      state,
+                                                      sd,
+                                                      sc,
+                                                      rand_bsdf,
+                                                      &bsdf_eval,
+                                                      &bsdf_omega_in,
+                                                      &bsdf_pdf,
+                                                      &unguided_bsdf_pdf,
+                                                      &bsdf_sampled_roughness,
+                                                      &bsdf_eta);
+
+    if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
+      return LABEL_NONE;
+    }
 
-  if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
-    return LABEL_NONE;
+    INTEGRATOR_STATE_WRITE(state, path, unguided_throughput) *= bsdf_pdf / unguided_bsdf_pdf;
+  }
+  else
+#endif
+  {
+    label = surface_shader_bsdf_sample_closure(kg,
+                                               sd,
+                                               sc,
+                                               rand_bsdf,
+                                               &bsdf_eval,
+                                               &bsdf_omega_in,
+                                               &bsdf_pdf,
+                                               &bsdf_sampled_roughness,
+                                               &bsdf_eta);
+
+    if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
+      return LABEL_NONE;
+    }
+
+    unguided_bsdf_pdf = bsdf_pdf;
   }
 
   if (label & LABEL_TRANSPARENT) {
@@ -329,20 +415,19 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
   }
   else {
     /* Setup ray with changed origin and direction. */
-    INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
-    INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
+    const float3 D = normalize(bsdf_omega_in);
+    INTEGRATOR_STATE_WRITE(state, ray, P) = integrate_surface_ray_offset(kg, sd, sd->P, D);
+    INTEGRATOR_STATE_WRITE(state, ray, D) = D;
     INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
     INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
     INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
-    INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(bsdf_domega_in);
 #endif
   }
 
   /* Update throughput. */
-  float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
-  INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput;
+  const Spectrum bsdf_weight = bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
+  INTEGRATOR_STATE_WRITE(state, path, throughput) *= bsdf_weight;
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
     if (INTEGRATOR_STATE(state, path, bounce) == 0) {
@@ -357,10 +442,21 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
   if (!(label & LABEL_TRANSPARENT)) {
     INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = bsdf_pdf;
     INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
-        bsdf_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
+        unguided_bsdf_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
   }
 
   path_state_next(kg, state, label);
+
+  guiding_record_surface_bounce(kg,
+                                state,
+                                sd,
+                                bsdf_weight,
+                                bsdf_pdf,
+                                sd->N,
+                                normalize(bsdf_omega_in),
+                                bsdf_sampled_roughness,
+                                bsdf_eta);
+
   return label;
 }
 
@@ -382,14 +478,15 @@ ccl_device_forceinline int integrate_surface_volume_only_bounce(IntegratorState
 ccl_device_forceinline bool integrate_surface_terminate(IntegratorState state,
                                                         const uint32_t path_flag)
 {
-  const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
-                                0.0f :
-                                INTEGRATOR_STATE(state, path, continuation_probability);
-  if (probability == 0.0f) {
+  const float continuation_probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
+                                             0.0f :
+                                             INTEGRATOR_STATE(
+                                                 state, path, continuation_probability);
+  if (continuation_probability == 0.0f) {
     return true;
   }
-  else if (probability != 1.0f) {
-    INTEGRATOR_STATE_WRITE(state, path, throughput) /= probability;
+  else if (continuation_probability != 1.0f) {
+    INTEGRATOR_STATE_WRITE(state, path, throughput) /= continuation_probability;
   }
 
   return false;
@@ -408,22 +505,24 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
     return;
   }
 
-  float bsdf_u, bsdf_v;
-  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+  const float2 rand_bsdf = path_state_rng_2D(kg, rng_state, PRNG_SURFACE_BSDF);
 
   float3 ao_N;
-  const float3 ao_weight = shader_bsdf_ao(
+  const Spectrum ao_weight = surface_shader_ao(
       kg, sd, kernel_data.integrator.ao_additive_factor, &ao_N);
 
   float3 ao_D;
   float ao_pdf;
-  sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+  sample_cos_hemisphere(ao_N, rand_bsdf.x, rand_bsdf.y, &ao_D, &ao_pdf);
 
   bool skip_self = true;
 
   Ray ray ccl_optional_struct_init;
   ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self);
   ray.D = ao_D;
+  if (skip_self) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
+  }
   ray.tmin = 0.0f;
   ray.tmax = kernel_data.integrator.ao_bounces_distance;
   ray.time = sd->time;
@@ -452,7 +551,8 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
   const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
   const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag) | PATH_RAY_SHADOW_FOR_AO;
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * shader_bsdf_alpha(kg, sd);
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput) *
+                              surface_shader_alpha(kg, sd);
 
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
       state, path, render_pixel_index);
@@ -494,6 +594,8 @@ ccl_device bool integrate_surface(KernelGlobals kg,
 #ifdef __VOLUME__
   if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
 #endif
+    guiding_record_surface_segment(kg, state, &sd);
+
 #ifdef __SUBSURFACE__
     /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
     if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
@@ -501,7 +603,7 @@ ccl_device bool integrate_surface(KernelGlobals kg,
     {
       /* Evaluate shader. */
       PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
-      shader_eval_surface<node_feature_mask>(kg, state, &sd, render_buffer, path_flag);
+      surface_shader_eval<node_feature_mask>(kg, state, &sd, render_buffer, path_flag);
 
       /* Initialize additional RNG for BSDFs. */
       if (sd.flag & SD_BSDF_NEEDS_LCG) {
@@ -523,21 +625,17 @@ ccl_device bool integrate_surface(KernelGlobals kg,
 #endif
     {
       /* Filter closures. */
-      shader_prepare_surface_closures(kg, state, &sd, path_flag);
+      surface_shader_prepare_closures(kg, state, &sd, path_flag);
 
-#ifdef __HOLDOUT__
       /* Evaluate holdout. */
       if (!integrate_surface_holdout(kg, state, &sd, render_buffer)) {
         return false;
       }
-#endif
 
-#ifdef __EMISSION__
       /* Write emission. */
       if (sd.flag & SD_EMISSION) {
         integrate_surface_emission(kg, state, &sd, render_buffer);
       }
-#endif
 
       /* Perform path termination. Most paths have already been terminated in
        * the intersect_closest kernel, this is just for emission and for dividing
@@ -551,11 +649,11 @@ ccl_device bool integrate_surface(KernelGlobals kg,
       /* Write render passes. */
 #ifdef __PASSES__
       PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES);
-      kernel_write_data_passes(kg, state, &sd, render_buffer);
+      film_write_data_passes(kg, state, &sd, render_buffer);
 #endif
 
 #ifdef __DENOISING_FEATURES__
-      kernel_write_denoising_features_surface(kg, state, &sd, render_buffer);
+      film_write_denoising_features_surface(kg, state, &sd, render_buffer);
 #endif
     }
 
@@ -563,6 +661,10 @@ ccl_device bool integrate_surface(KernelGlobals kg,
     RNGState rng_state;
     path_state_rng_load(state, &rng_state);
 
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+    surface_shader_prepare_guiding(kg, state, &sd, &rng_state);
+    guiding_write_debug_passes(kg, state, &sd, render_buffer);
+#endif
     /* Direct light. */
     PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
     integrate_surface_direct_light<node_feature_mask>(kg, state, &sd, &rng_state);
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index 4aab097a7d8..a8324cda2dc 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -3,12 +3,14 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/film/passes.h"
+#include "kernel/film/data_passes.h"
+#include "kernel/film/denoising_passes.h"
+#include "kernel/film/light_passes.h"
 
+#include "kernel/integrator/guiding.h"
 #include "kernel/integrator/intersect_closest.h"
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/volume_shader.h"
 #include "kernel/integrator/volume_stack.h"
 
 #include "kernel/light/light.h"
@@ -29,13 +31,13 @@ typedef enum VolumeIntegrateEvent {
 typedef struct VolumeIntegrateResult {
   /* Throughput and offset for direct light scattering. */
   bool direct_scatter;
-  float3 direct_throughput;
+  Spectrum direct_throughput;
   float direct_t;
   ShaderVolumePhases direct_phases;
 
   /* Throughput and offset for indirect light scattering. */
   bool indirect_scatter;
-  float3 indirect_throughput;
+  Spectrum indirect_throughput;
   float indirect_t;
   ShaderVolumePhases indirect_phases;
 } VolumeIntegrateResult;
@@ -52,19 +54,19 @@ typedef struct VolumeIntegrateResult {
  * sigma_t = sigma_a + sigma_s */
 
 typedef struct VolumeShaderCoefficients {
-  float3 sigma_t;
-  float3 sigma_s;
-  float3 emission;
+  Spectrum sigma_t;
+  Spectrum sigma_s;
+  Spectrum emission;
 } VolumeShaderCoefficients;
 
 /* Evaluate shader to get extinction coefficient at P. */
 ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
                                                    IntegratorShadowState state,
                                                    ccl_private ShaderData *ccl_restrict sd,
-                                                   ccl_private float3 *ccl_restrict extinction)
+                                                   ccl_private Spectrum *ccl_restrict extinction)
 {
   VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
-  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);
+  volume_shader_eval<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);
 
   if (!(sd->flag & SD_EXTINCTION)) {
     return false;
@@ -83,15 +85,16 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
 {
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
   VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
-  shader_eval_volume<false>(kg, state, sd, path_flag, volume_read_lambda_pass);
+  volume_shader_eval<false>(kg, state, sd, path_flag, volume_read_lambda_pass);
 
   if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
     return false;
   }
 
-  coeff->sigma_s = zero_float3();
-  coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
-  coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
+  coeff->sigma_s = zero_spectrum();
+  coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction :
+                                                zero_spectrum();
+  coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_spectrum();
 
   if (sd->flag & SD_SCATTER) {
     for (int i = 0; i < sd->num_closure; i++) {
@@ -143,11 +146,11 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
 
     /* Perform shading at this offset within a step, to integrate over
      * over the entire step segment. */
-    *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4);
+    *step_shade_offset = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_SHADE_OFFSET);
 
     /* Shift starting point of all segment by this random amount to avoid
      * banding artifacts from the volume bounding shape. */
-    *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3);
+    *steps_offset = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_OFFSET);
   }
 }
 
@@ -162,9 +165,9 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
 ccl_device void volume_shadow_homogeneous(KernelGlobals kg, IntegratorState state,
                                           ccl_private Ray *ccl_restrict ray,
                                           ccl_private ShaderData *ccl_restrict sd,
-                                          ccl_global float3 *ccl_restrict throughput)
+                                          ccl_global Spectrum *ccl_restrict throughput)
 {
-  float3 sigma_t = zero_float3();
+  Spectrum sigma_t = zero_spectrum();
 
   if (shadow_volume_shader_sample(kg, state, sd, &sigma_t)) {
     *throughput *= volume_color_transmittance(sigma_t, ray->tmax - ray->tmin);
@@ -178,14 +181,14 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
                                             IntegratorShadowState state,
                                             ccl_private Ray *ccl_restrict ray,
                                             ccl_private ShaderData *ccl_restrict sd,
-                                            ccl_private float3 *ccl_restrict throughput,
+                                            ccl_private Spectrum *ccl_restrict throughput,
                                             const float object_step_size)
 {
   /* Load random number state. */
   RNGState rng_state;
   shadow_path_state_rng_load(state, &rng_state);
 
-  float3 tp = *throughput;
+  Spectrum tp = *throughput;
 
   /* Prepare for stepping.
    * For shadows we do not offset all segments, since the starting point is
@@ -207,7 +210,7 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
   /* compute extinction at the start */
   float t = ray->tmin;
 
-  float3 sum = zero_float3();
+  Spectrum sum = zero_spectrum();
 
   for (int i = 0; i < max_steps; i++) {
     /* advance to new position */
@@ -215,7 +218,7 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
     float dt = new_t - t;
 
     float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
-    float3 sigma_t = zero_float3();
+    Spectrum sigma_t = zero_spectrum();
 
     /* compute attenuation over segment */
     sd->P = new_P;
@@ -228,8 +231,7 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
         tp = *throughput * exp(sum);
 
         /* stop if nearly all light is blocked */
-        if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
-            tp.z < VOLUME_THROUGHPUT_EPSILON)
+        if (reduce_max(tp) < VOLUME_THROUGHPUT_EPSILON)
           break;
       }
     }
@@ -334,22 +336,22 @@ ccl_device float volume_equiangular_cdf(ccl_private const Ray *ccl_restrict ray,
 /* Distance sampling */
 
 ccl_device float volume_distance_sample(float max_t,
-                                        float3 sigma_t,
+                                        Spectrum sigma_t,
                                         int channel,
                                         float xi,
-                                        ccl_private float3 *transmittance,
-                                        ccl_private float3 *pdf)
+                                        ccl_private Spectrum *transmittance,
+                                        ccl_private Spectrum *pdf)
 {
   /* xi is [0, 1[ so log(0) should never happen, division by zero is
    * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
   float sample_sigma_t = volume_channel_get(sigma_t, channel);
-  float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+  Spectrum full_transmittance = volume_color_transmittance(sigma_t, max_t);
   float sample_transmittance = volume_channel_get(full_transmittance, channel);
 
   float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
 
   *transmittance = volume_color_transmittance(sigma_t, sample_t);
-  *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
+  *pdf = safe_divide_color(sigma_t * *transmittance, one_spectrum() - full_transmittance);
 
   /* todo: optimization: when taken together with hit/miss decision,
    * the full_transmittance cancels out drops out and xi does not
@@ -358,33 +360,36 @@ ccl_device float volume_distance_sample(float max_t,
   return sample_t;
 }
 
-ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
+ccl_device Spectrum volume_distance_pdf(float max_t, Spectrum sigma_t, float sample_t)
 {
-  float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
-  float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
+  Spectrum full_transmittance = volume_color_transmittance(sigma_t, max_t);
+  Spectrum transmittance = volume_color_transmittance(sigma_t, sample_t);
 
-  return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
+  return safe_divide_color(sigma_t * transmittance, one_spectrum() - full_transmittance);
 }
 
 /* Emission */
 
-ccl_device float3 volume_emission_integrate(ccl_private VolumeShaderCoefficients *coeff,
-                                            int closure_flag,
-                                            float3 transmittance,
-                                            float t)
+ccl_device Spectrum volume_emission_integrate(ccl_private VolumeShaderCoefficients *coeff,
+                                              int closure_flag,
+                                              Spectrum transmittance,
+                                              float t)
 {
   /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
    * this goes to E * t as sigma_t goes to zero
    *
    * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
-  float3 emission = coeff->emission;
+  Spectrum emission = coeff->emission;
 
   if (closure_flag & SD_EXTINCTION) {
-    float3 sigma_t = coeff->sigma_t;
+    Spectrum sigma_t = coeff->sigma_t;
 
-    emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
-    emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
-    emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
+    FOREACH_SPECTRUM_CHANNEL (i) {
+      GET_SPECTRUM_CHANNEL(emission, i) *= (GET_SPECTRUM_CHANNEL(sigma_t, i) > 0.0f) ?
+                                               (1.0f - GET_SPECTRUM_CHANNEL(transmittance, i)) /
+                                                   GET_SPECTRUM_CHANNEL(sigma_t, i) :
+                                               t;
+    }
   }
   else
     emission *= t;
@@ -419,14 +424,14 @@ ccl_device_forceinline void volume_integrate_step_scattering(
     ccl_private const Ray *ray,
     const float3 equiangular_light_P,
     ccl_private const VolumeShaderCoefficients &ccl_restrict coeff,
-    const float3 transmittance,
+    const Spectrum transmittance,
     ccl_private VolumeIntegrateState &ccl_restrict vstate,
     ccl_private VolumeIntegrateResult &ccl_restrict result)
 {
   /* Pick random color channel, we use the Veach one-sample
    * model with balance heuristic for the channels. */
-  const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
-  float3 channel_pdf;
+  const Spectrum albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+  Spectrum channel_pdf;
   const int channel = volume_sample_channel(
       albedo, result.indirect_throughput, vstate.rphase, &channel_pdf);
 
@@ -435,11 +440,11 @@ ccl_device_forceinline void volume_integrate_step_scattering(
     if (result.direct_t >= vstate.tmin && result.direct_t <= vstate.tmax &&
         vstate.equiangular_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
       const float new_dt = result.direct_t - vstate.tmin;
-      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+      const Spectrum new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
 
       result.direct_scatter = true;
       result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf;
-      shader_copy_volume_phases(&result.direct_phases, sd);
+      volume_shader_copy_phases(&result.direct_phases, sd);
 
       /* Multiple importance sampling. */
       if (vstate.use_mis) {
@@ -467,7 +472,7 @@ ccl_device_forceinline void volume_integrate_step_scattering(
       const float new_t = vstate.tmin + new_dt;
 
       /* transmittance and pdf */
-      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+      const Spectrum new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
       const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);
 
       if (vstate.distance_pdf * distance_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
@@ -475,7 +480,7 @@ ccl_device_forceinline void volume_integrate_step_scattering(
         result.indirect_scatter = true;
         result.indirect_t = new_t;
         result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
-        shader_copy_volume_phases(&result.indirect_phases, sd);
+        volume_shader_copy_phases(&result.indirect_phases, sd);
 
         if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
           /* If using distance sampling for direct light, just copy parameters
@@ -483,7 +488,7 @@ ccl_device_forceinline void volume_integrate_step_scattering(
           result.direct_scatter = true;
           result.direct_t = result.indirect_t;
           result.direct_throughput = result.indirect_throughput;
-          shader_copy_volume_phases(&result.direct_phases, sd);
+          volume_shader_copy_phases(&result.direct_phases, sd);
 
           /* Multiple importance sampling. */
           if (vstate.use_mis) {
@@ -546,8 +551,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
   vstate.tmin = ray->tmin;
   vstate.tmax = ray->tmin;
   vstate.absorption_only = true;
-  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE);
-  vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL);
+  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_SCATTER_DISTANCE);
+  vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_PHASE_CHANNEL);
 
   /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */
   vstate.direct_sample_method = direct_sample_method;
@@ -566,7 +571,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
   vstate.distance_pdf = 1.0f;
 
   /* Initialize volume integration result. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
   result.direct_throughput = throughput;
   result.indirect_throughput = throughput;
 
@@ -579,9 +584,9 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 #  ifdef __DENOISING_FEATURES__
   const bool write_denoising_features = (INTEGRATOR_STATE(state, path, flag) &
                                          PATH_RAY_DENOISING_FEATURES);
-  float3 accum_albedo = zero_float3();
+  Spectrum accum_albedo = zero_spectrum();
 #  endif
-  float3 accum_emission = zero_float3();
+  Spectrum accum_emission = zero_spectrum();
 
   for (int i = 0; i < max_steps; i++) {
     /* Advance to new position */
@@ -596,18 +601,19 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 
       /* Evaluate transmittance over segment. */
       const float dt = (vstate.tmax - vstate.tmin);
-      const float3 transmittance = (closure_flag & SD_EXTINCTION) ?
-                                       volume_color_transmittance(coeff.sigma_t, dt) :
-                                       one_float3();
+      const Spectrum transmittance = (closure_flag & SD_EXTINCTION) ?
+                                         volume_color_transmittance(coeff.sigma_t, dt) :
+                                         one_spectrum();
 
       /* Emission. */
       if (closure_flag & SD_EMISSION) {
         /* Only write emission before indirect light scatter position, since we terminate
          * stepping at that point if we have already found a direct light scatter position. */
         if (!result.indirect_scatter) {
-          const float3 emission = volume_emission_integrate(
+          const Spectrum emission = volume_emission_integrate(
               &coeff, closure_flag, transmittance, dt);
           accum_emission += result.indirect_throughput * emission;
+          guiding_record_volume_emission(kg, state, emission);
         }
       }
 
@@ -616,8 +622,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 #  ifdef __DENOISING_FEATURES__
           /* Accumulate albedo for denoising features. */
           if (write_denoising_features && (closure_flag & SD_SCATTER)) {
-            const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
-            accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance);
+            const Spectrum albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+            accum_albedo += result.indirect_throughput * albedo * (one_spectrum() - transmittance);
           }
 #  endif
 
@@ -634,7 +640,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
         /* Stop if nearly all light blocked. */
         if (!result.indirect_scatter) {
           if (reduce_max(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) {
-            result.indirect_throughput = zero_float3();
+            result.indirect_throughput = zero_spectrum();
             break;
           }
         }
@@ -660,20 +666,19 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 
   /* Write accumulated emission. */
   if (!is_zero(accum_emission)) {
-    kernel_accum_emission(
+    film_write_volume_emission(
         kg, state, accum_emission, render_buffer, object_lightgroup(kg, sd->object));
   }
 
 #  ifdef __DENOISING_FEATURES__
   /* Write denoising features. */
   if (write_denoising_features) {
-    kernel_write_denoising_features_volume(
+    film_write_denoising_features_volume(
         kg, state, accum_albedo, result.indirect_scatter, render_buffer);
   }
 #  endif /* __DENOISING_FEATURES__ */
 }
 
-#  ifdef __EMISSION__
 /* Path tracing: sample point on light and evaluate light shader, then
  * queue shadow ray to be traced. */
 ccl_device_forceinline bool integrate_volume_sample_light(
@@ -691,11 +696,10 @@ ccl_device_forceinline bool integrate_volume_sample_light(
   /* Sample position on a light. */
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
   const uint bounce = INTEGRATOR_STATE(state, path, bounce);
-  float light_u, light_v;
-  path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+  const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);
 
   if (!light_distribution_sample_from_volume_segment(
-          kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls)) {
+          kg, rand_light.x, rand_light.y, sd->time, sd->P, bounce, path_flag, ls)) {
     return false;
   }
 
@@ -715,7 +719,7 @@ ccl_device_forceinline void integrate_volume_direct_light(
     ccl_private const RNGState *ccl_restrict rng_state,
     const float3 P,
     ccl_private const ShaderVolumePhases *ccl_restrict phases,
-    ccl_private const float3 throughput,
+    ccl_private const Spectrum throughput,
     ccl_private LightSample *ccl_restrict ls)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT);
@@ -732,11 +736,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
   {
     const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
     const uint bounce = INTEGRATOR_STATE(state, path, bounce);
-    float light_u, light_v;
-    path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+    const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);
 
     if (!light_distribution_sample_from_position(
-            kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) {
+            kg, rand_light.x, rand_light.y, sd->time, P, bounce, path_flag, ls)) {
       return;
     }
   }
@@ -753,14 +756,14 @@ ccl_device_forceinline void integrate_volume_direct_light(
    * non-constant light sources. */
   ShaderDataTinyStorage emission_sd_storage;
   ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-  const float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, ls, sd->time);
+  const Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, ls, sd->time);
   if (is_zero(light_eval)) {
     return;
   }
 
   /* Evaluate BSDF. */
   BsdfEval phase_eval ccl_optional_struct_init;
-  const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
+  float phase_pdf = volume_shader_phase_eval(kg, state, sd, phases, ls->D, &phase_eval);
 
   if (ls->shader & SHADER_USE_MIS) {
     float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf);
@@ -796,11 +799,11 @@ ccl_device_forceinline void integrate_volume_direct_light(
   const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
-  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
+  const Spectrum throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    packed_float3 pass_diffuse_weight;
-    packed_float3 pass_glossy_weight;
+    PackedSpectrum pass_diffuse_weight;
+    PackedSpectrum pass_glossy_weight;
 
     if (shadow_flag & PATH_RAY_ANY_PASS) {
       /* Indirect bounce, use weights from earlier surface or volume bounce. */
@@ -810,8 +813,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
     else {
       /* Direct light, no diffuse/glossy distinction needed for volumes. */
       shadow_flag |= PATH_RAY_VOLUME_PASS;
-      pass_diffuse_weight = packed_float3(one_float3());
-      pass_glossy_weight = packed_float3(zero_float3());
+      pass_diffuse_weight = one_spectrum();
+      pass_glossy_weight = zero_spectrum();
     }
 
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
@@ -847,9 +850,14 @@ ccl_device_forceinline void integrate_volume_direct_light(
                                                    ls->group + 1 :
                                                    kernel_data.background.lightgroup + 1;
 
+#  ifdef __PATH_GUIDING__
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unlit_throughput) = throughput;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = INTEGRATOR_STATE(
+      state, guiding, path_segment);
+#  endif
+
   integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
 }
-#  endif
 
 /* Path tracing: scatter in new direction using phase function */
 ccl_device_forceinline bool integrate_volume_phase_scatter(
@@ -861,27 +869,54 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
 {
   PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT);
 
-  float phase_u, phase_v;
-  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v);
+  float2 rand_phase = path_state_rng_2D(kg, rng_state, PRNG_VOLUME_PHASE);
+
+  ccl_private const ShaderVolumeClosure *svc = volume_shader_phase_pick(phases, &rand_phase);
 
   /* Phase closure, sample direction. */
-  float phase_pdf;
+  float phase_pdf = 0.0f, unguided_phase_pdf = 0.0f;
   BsdfEval phase_eval ccl_optional_struct_init;
   float3 phase_omega_in ccl_optional_struct_init;
-  differential3 phase_domega_in ccl_optional_struct_init;
-
-  const int label = shader_volume_phase_sample(kg,
-                                               sd,
-                                               phases,
-                                               phase_u,
-                                               phase_v,
-                                               &phase_eval,
-                                               &phase_omega_in,
-                                               &phase_domega_in,
-                                               &phase_pdf);
-
-  if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
-    return false;
+  float sampled_roughness = 1.0f;
+  int label;
+
+#  if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  if (kernel_data.integrator.use_guiding) {
+    label = volume_shader_phase_guided_sample(kg,
+                                              state,
+                                              sd,
+                                              svc,
+                                              rand_phase,
+                                              &phase_eval,
+                                              &phase_omega_in,
+                                              &phase_pdf,
+                                              &unguided_phase_pdf,
+                                              &sampled_roughness);
+
+    if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
+      return false;
+    }
+
+    INTEGRATOR_STATE_WRITE(state, path, unguided_throughput) *= phase_pdf / unguided_phase_pdf;
+  }
+  else
+#  endif
+  {
+    label = volume_shader_phase_sample(kg,
+                                       sd,
+                                       phases,
+                                       svc,
+                                       rand_phase,
+                                       &phase_eval,
+                                       &phase_omega_in,
+                                       &phase_pdf,
+                                       &sampled_roughness);
+
+    if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
+      return false;
+    }
+
+    unguided_phase_pdf = phase_pdf;
   }
 
   /* Setup ray. */
@@ -891,26 +926,31 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
   INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #  ifdef __RAY_DIFFERENTIALS__
   INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
-  INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(phase_domega_in);
 #  endif
   // Save memory by storing last hit prim and object in isect
   INTEGRATOR_STATE_WRITE(state, isect, prim) = sd->prim;
   INTEGRATOR_STATE_WRITE(state, isect, object) = sd->object;
 
+  const Spectrum phase_weight = bsdf_eval_sum(&phase_eval) / phase_pdf;
+
+  /* Add phase function sampling data to the path segment. */
+  guiding_record_volume_bounce(
+      kg, state, sd, phase_weight, phase_pdf, normalize(phase_omega_in), sampled_roughness);
+
   /* Update throughput. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf;
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+  const Spectrum throughput_phase = throughput * phase_weight;
   INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput_phase;
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
-    INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
+    INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_spectrum();
+    INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_spectrum();
   }
 
   /* Update path state */
   INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = phase_pdf;
   INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
-      phase_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
+      unguided_phase_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
 
   path_state_next(kg, state, label);
   return true;
@@ -949,6 +989,10 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
   VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
   const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);
 
+#  if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  const float3 initial_throughput = INTEGRATOR_STATE(state, path, throughput);
+#  endif
+
   /* TODO: expensive to zero closures? */
   VolumeIntegrateResult result = {};
   volume_integrate_heterogeneous(kg,
@@ -966,17 +1010,50 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
    * to be terminated. That will shading evaluating to leave out any scattering closures,
    * but emission and absorption are still handled for multiple importance sampling. */
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-  const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ?
-                                0.0f :
-                                INTEGRATOR_STATE(state, path, continuation_probability);
-  if (probability == 0.0f) {
+  const float continuation_probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ?
+                                             0.0f :
+                                             INTEGRATOR_STATE(
+                                                 state, path, continuation_probability);
+  if (continuation_probability == 0.0f) {
     return VOLUME_PATH_MISSED;
   }
 
+#  if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+  bool guiding_generated_new_segment = false;
+  if (kernel_data.integrator.use_guiding) {
+    /* Record transmittance using change in throughput. */
+    float3 transmittance_weight = spectrum_to_rgb(
+        safe_divide_color(result.indirect_throughput, initial_throughput));
+    guiding_record_volume_transmission(kg, state, transmittance_weight);
+
+    if (result.indirect_scatter) {
+      const float3 P = ray->P + result.indirect_t * ray->D;
+
+      /* Record volume segment up to direct scatter position.
+       * TODO: volume segment is wrong when direct_t and indirect_t differ. */
+      if (result.direct_scatter && (result.direct_t == result.indirect_t)) {
+        guiding_record_volume_segment(kg, state, P, sd.I);
+        guiding_generated_new_segment = true;
+      }
+
+#    if PATH_GUIDING_LEVEL >= 4
+      /* TODO: this position will be wrong for direct light pdf computation,
+       * since the direct light position may be different? */
+      volume_shader_prepare_guiding(
+          kg, state, &sd, &rng_state, P, ray->D, &result.direct_phases, direct_sample_method);
+#    endif
+    }
+    else {
+      /* No guiding if we don't scatter. */
+      state->guiding.use_volume_guiding = false;
+    }
+  }
+#  endif
+
   /* Direct light. */
   if (result.direct_scatter) {
     const float3 direct_P = ray->P + result.direct_t * ray->D;
-    result.direct_throughput /= probability;
+    result.direct_throughput /= continuation_probability;
     integrate_volume_direct_light(kg,
                                   state,
                                   &sd,
@@ -989,16 +1066,22 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
 
   /* Indirect light.
    *
-   * Only divide throughput by probability if we scatter. For the attenuation
+   * Only divide throughput by continuation_probability if we scatter. For the attenuation
    * case the next surface will already do this division. */
   if (result.indirect_scatter) {
-    result.indirect_throughput /= probability;
+    result.indirect_throughput /= continuation_probability;
   }
   INTEGRATOR_STATE_WRITE(state, path, throughput) = result.indirect_throughput;
 
   if (result.indirect_scatter) {
     sd.P = ray->P + result.indirect_t * ray->D;
 
+#  if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 1
+    if (!guiding_generated_new_segment) {
+      guiding_record_volume_segment(kg, state, sd.P, sd.I);
+    }
+#  endif
+
     if (integrate_volume_phase_scatter(kg, state, &sd, &rng_state, &result.indirect_phases)) {
       return VOLUME_PATH_SCATTERED;
     }
diff --git a/intern/cycles/kernel/integrator/shader_eval.h b/intern/cycles/kernel/integrator/shader_eval.h
deleted file mode 100644
index ed4d973e864..00000000000
--- a/intern/cycles/kernel/integrator/shader_eval.h
+++ /dev/null
@@ -1,952 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-/* Functions to evaluate shaders and use the resulting shader closures. */
-
-#pragma once
-
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/bsdf.h"
-#include "kernel/closure/bsdf_util.h"
-#include "kernel/closure/emissive.h"
-
-#include "kernel/film/accumulate.h"
-
-#include "kernel/svm/svm.h"
-
-#ifdef __OSL__
-#  include "kernel/osl/shader.h"
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* Merging */
-
-#if defined(__VOLUME__)
-ccl_device_inline void shader_merge_volume_closures(ccl_private ShaderData *sd)
-{
-  /* Merge identical closures to save closure space with stacked volumes. */
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private ShaderClosure *sci = &sd->closure[i];
-
-    if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
-      continue;
-    }
-
-    for (int j = i + 1; j < sd->num_closure; j++) {
-      ccl_private ShaderClosure *scj = &sd->closure[j];
-      if (sci->type != scj->type) {
-        continue;
-      }
-
-      ccl_private const HenyeyGreensteinVolume *hgi = (ccl_private const HenyeyGreensteinVolume *)
-          sci;
-      ccl_private const HenyeyGreensteinVolume *hgj = (ccl_private const HenyeyGreensteinVolume *)
-          scj;
-      if (!(hgi->g == hgj->g)) {
-        continue;
-      }
-
-      sci->weight += scj->weight;
-      sci->sample_weight += scj->sample_weight;
-
-      int size = sd->num_closure - (j + 1);
-      if (size > 0) {
-        for (int k = 0; k < size; k++) {
-          scj[k] = scj[k + 1];
-        }
-      }
-
-      sd->num_closure--;
-      kernel_assert(sd->num_closure >= 0);
-      j--;
-    }
-  }
-}
-
-ccl_device_inline void shader_copy_volume_phases(ccl_private ShaderVolumePhases *ccl_restrict
-                                                     phases,
-                                                 ccl_private const ShaderData *ccl_restrict sd)
-{
-  phases->num_closure = 0;
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *from_sc = &sd->closure[i];
-    ccl_private const HenyeyGreensteinVolume *from_hg =
-        (ccl_private const HenyeyGreensteinVolume *)from_sc;
-
-    if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
-      ccl_private ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
-
-      to_sc->weight = from_sc->weight;
-      to_sc->sample_weight = from_sc->sample_weight;
-      to_sc->g = from_hg->g;
-      phases->num_closure++;
-      if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
-        break;
-      }
-    }
-  }
-}
-#endif /* __VOLUME__ */
-
-ccl_device_inline void shader_prepare_surface_closures(KernelGlobals kg,
-                                                       ConstIntegratorState state,
-                                                       ccl_private ShaderData *sd,
-                                                       const uint32_t path_flag)
-{
-  /* Filter out closures. */
-  if (kernel_data.integrator.filter_closures) {
-    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_EMISSION) {
-      sd->closure_emission_background = zero_float3();
-    }
-
-    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIRECT_LIGHT) {
-      sd->flag &= ~SD_BSDF_HAS_EVAL;
-    }
-
-    if (path_flag & PATH_RAY_CAMERA) {
-      for (int i = 0; i < sd->num_closure; i++) {
-        ccl_private ShaderClosure *sc = &sd->closure[i];
-
-        if ((CLOSURE_IS_BSDF_DIFFUSE(sc->type) &&
-             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE)) ||
-            (CLOSURE_IS_BSDF_GLOSSY(sc->type) &&
-             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY)) ||
-            (CLOSURE_IS_BSDF_TRANSMISSION(sc->type) &&
-             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION))) {
-          sc->type = CLOSURE_NONE_ID;
-          sc->sample_weight = 0.0f;
-        }
-        else if ((CLOSURE_IS_BSDF_TRANSPARENT(sc->type) &&
-                  (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSPARENT))) {
-          sc->type = CLOSURE_HOLDOUT_ID;
-          sc->sample_weight = 0.0f;
-          sd->flag |= SD_HOLDOUT;
-        }
-      }
-    }
-  }
-
-  /* Defensive sampling.
-   *
-   * We can likely also do defensive sampling at deeper bounces, particularly
-   * for cases like a perfect mirror but possibly also others. This will need
-   * a good heuristic. */
-  if (INTEGRATOR_STATE(state, path, bounce) + INTEGRATOR_STATE(state, path, transparent_bounce) ==
-          0 &&
-      sd->num_closure > 1) {
-    float sum = 0.0f;
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private ShaderClosure *sc = &sd->closure[i];
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sum += sc->sample_weight;
-      }
-    }
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private ShaderClosure *sc = &sd->closure[i];
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
-      }
-    }
-  }
-
-  /* Filter glossy.
-   *
-   * Blurring of bsdf after bounces, for rays that have a small likelihood
-   * of following this particular path (diffuse, rough glossy) */
-  if (kernel_data.integrator.filter_glossy != FLT_MAX
-#ifdef __MNEE__
-      && !(INTEGRATOR_STATE(state, path, mnee) & PATH_MNEE_VALID)
-#endif
-  ) {
-    float blur_pdf = kernel_data.integrator.filter_glossy *
-                     INTEGRATOR_STATE(state, path, min_ray_pdf);
-
-    if (blur_pdf < 1.0f) {
-      float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
-
-      for (int i = 0; i < sd->num_closure; i++) {
-        ccl_private ShaderClosure *sc = &sd->closure[i];
-        if (CLOSURE_IS_BSDF(sc->type)) {
-          bsdf_blur(kg, sc, blur_roughness);
-        }
-      }
-    }
-  }
-}
-
-/* BSDF */
-
-ccl_device_inline bool shader_bsdf_is_transmission(ccl_private const ShaderData *sd,
-                                                   const float3 omega_in)
-{
-  return dot(sd->N, omega_in) < 0.0f;
-}
-
-ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags)
-{
-  if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
-    return false;
-  }
-  if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
-    if (CLOSURE_IS_BSDF_DIFFUSE(type)) {
-      return true;
-    }
-  }
-  if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
-    if (CLOSURE_IS_BSDF_GLOSSY(type)) {
-      return true;
-    }
-  }
-  if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
-    if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-ccl_device_inline float _shader_bsdf_multi_eval(KernelGlobals kg,
-                                                ccl_private ShaderData *sd,
-                                                const float3 omega_in,
-                                                const bool is_transmission,
-                                                ccl_private const ShaderClosure *skip_sc,
-                                                ccl_private BsdfEval *result_eval,
-                                                float sum_pdf,
-                                                float sum_sample_weight,
-                                                const uint light_shader_flags)
-{
-  /* This is the veach one-sample model with balance heuristic,
-   * some PDF factors drop out when using balance heuristic weighting. */
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (sc == skip_sc) {
-      continue;
-    }
-
-    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-      if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) {
-        float bsdf_pdf = 0.0f;
-        float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
-
-        if (bsdf_pdf != 0.0f) {
-          bsdf_eval_accum(result_eval, sc->type, eval * sc->weight);
-          sum_pdf += bsdf_pdf * sc->sample_weight;
-        }
-      }
-
-      sum_sample_weight += sc->sample_weight;
-    }
-  }
-
-  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_inline
-#endif
-    float
-    shader_bsdf_eval(KernelGlobals kg,
-                     ccl_private ShaderData *sd,
-                     const float3 omega_in,
-                     const bool is_transmission,
-                     ccl_private BsdfEval *bsdf_eval,
-                     const uint light_shader_flags)
-{
-  bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, zero_float3());
-
-  return _shader_bsdf_multi_eval(
-      kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
-}
-
-/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
-ccl_device_inline ccl_private const ShaderClosure *shader_bsdf_bssrdf_pick(
-    ccl_private const ShaderData *ccl_restrict sd, ccl_private float *randu)
-{
-  int sampled = 0;
-
-  if (sd->num_closure > 1) {
-    /* Pick a BSDF or based on sample weights. */
-    float sum = 0.0f;
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sum += sc->sample_weight;
-      }
-    }
-
-    float r = (*randu) * sum;
-    float partial_sum = 0.0f;
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        float next_sum = partial_sum + sc->sample_weight;
-
-        if (r < next_sum) {
-          sampled = i;
-
-          /* Rescale to reuse for direction sample, to better preserve stratification. */
-          *randu = (r - partial_sum) / sc->sample_weight;
-          break;
-        }
-
-        partial_sum = next_sum;
-      }
-    }
-  }
-
-  return &sd->closure[sampled];
-}
-
-/* Return weight for picked BSSRDF. */
-ccl_device_inline float3
-shader_bssrdf_sample_weight(ccl_private const ShaderData *ccl_restrict sd,
-                            ccl_private const ShaderClosure *ccl_restrict bssrdf_sc)
-{
-  float3 weight = bssrdf_sc->weight;
-
-  if (sd->num_closure > 1) {
-    float sum = 0.0f;
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sum += sc->sample_weight;
-      }
-    }
-    weight *= sum / bssrdf_sc->sample_weight;
-  }
-
-  return weight;
-}
-
-/* Sample direction for picked BSDF, and return evaluation and pdf for all
- * BSDFs combined using MIS. */
-ccl_device int shader_bsdf_sample_closure(KernelGlobals kg,
-                                          ccl_private ShaderData *sd,
-                                          ccl_private const ShaderClosure *sc,
-                                          float randu,
-                                          float randv,
-                                          ccl_private BsdfEval *bsdf_eval,
-                                          ccl_private float3 *omega_in,
-                                          ccl_private differential3 *domega_in,
-                                          ccl_private float *pdf)
-{
-  /* BSSRDF should already have been handled elsewhere. */
-  kernel_assert(CLOSURE_IS_BSDF(sc->type));
-
-  int label;
-  float3 eval = zero_float3();
-
-  *pdf = 0.0f;
-  label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
-  if (*pdf != 0.0f) {
-    bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);
-
-    if (sd->num_closure > 1) {
-      const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
-      float sweight = sc->sample_weight;
-      *pdf = _shader_bsdf_multi_eval(
-          kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
-    }
-  }
-
-  return label;
-}
-
-ccl_device float shader_bsdf_average_roughness(ccl_private const ShaderData *sd)
-{
-  float roughness = 0.0f;
-  float sum_weight = 0.0f;
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF(sc->type)) {
-      /* sqrt once to undo the squaring from multiplying roughness on the
-       * two axes, and once for the squared roughness convention. */
-      float weight = fabsf(average(sc->weight));
-      roughness += weight * sqrtf(safe_sqrtf(bsdf_get_roughness_squared(sc)));
-      sum_weight += weight;
-    }
-  }
-
-  return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
-}
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  if (sd->flag & SD_HAS_ONLY_VOLUME) {
-    return one_float3();
-  }
-  else if (sd->flag & SD_TRANSPARENT) {
-    return sd->closure_transparent_extinction;
-  }
-  else {
-    return zero_float3();
-  }
-}
-
-ccl_device void shader_bsdf_disable_transparency(KernelGlobals kg, ccl_private ShaderData *sd)
-{
-  if (sd->flag & SD_TRANSPARENT) {
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private ShaderClosure *sc = &sd->closure[i];
-
-      if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
-        sc->sample_weight = 0.0f;
-        sc->weight = zero_float3();
-      }
-    }
-
-    sd->flag &= ~SD_TRANSPARENT;
-  }
-}
-
-ccl_device float3 shader_bsdf_alpha(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd);
-
-  alpha = max(alpha, zero_float3());
-  alpha = min(alpha, one_float3());
-
-  return alpha;
-}
-
-ccl_device float3 shader_bsdf_diffuse(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 eval = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type))
-      eval += sc->weight;
-  }
-
-  return eval;
-}
-
-ccl_device float3 shader_bsdf_glossy(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 eval = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
-      eval += sc->weight;
-  }
-
-  return eval;
-}
-
-ccl_device float3 shader_bsdf_transmission(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 eval = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
-      eval += sc->weight;
-  }
-
-  return eval;
-}
-
-ccl_device float3 shader_bsdf_average_normal(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 N = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
-      N += sc->N * fabsf(average(sc->weight));
-  }
-
-  return (is_zero(N)) ? sd->N : normalize(N);
-}
-
-ccl_device float3 shader_bsdf_ao(KernelGlobals kg,
-                                 ccl_private const ShaderData *sd,
-                                 const float ao_factor,
-                                 ccl_private float3 *N_)
-{
-  float3 eval = zero_float3();
-  float3 N = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
-      ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
-      eval += sc->weight * ao_factor;
-      N += bsdf->N * fabsf(average(sc->weight));
-    }
-  }
-
-  *N_ = (is_zero(N)) ? sd->N : normalize(N);
-  return eval;
-}
-
-#ifdef __SUBSURFACE__
-ccl_device float3 shader_bssrdf_normal(ccl_private const ShaderData *sd)
-{
-  float3 N = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSSRDF(sc->type)) {
-      ccl_private const Bssrdf *bssrdf = (ccl_private const Bssrdf *)sc;
-      float avg_weight = fabsf(average(sc->weight));
-
-      N += bssrdf->N * avg_weight;
-    }
-  }
-
-  return (is_zero(N)) ? sd->N : normalize(N);
-}
-#endif /* __SUBSURFACE__ */
-
-/* Constant emission optimization */
-
-ccl_device bool shader_constant_emission_eval(KernelGlobals kg,
-                                              int shader,
-                                              ccl_private float3 *eval)
-{
-  int shader_index = shader & SHADER_MASK;
-  int shader_flag = kernel_data_fetch(shaders, shader_index).flags;
-
-  if (shader_flag & SD_HAS_CONSTANT_EMISSION) {
-    *eval = make_float3(kernel_data_fetch(shaders, shader_index).constant_emission[0],
-                        kernel_data_fetch(shaders, shader_index).constant_emission[1],
-                        kernel_data_fetch(shaders, shader_index).constant_emission[2]);
-
-    return true;
-  }
-
-  return false;
-}
-
-/* Background */
-
-ccl_device float3 shader_background_eval(ccl_private const ShaderData *sd)
-{
-  if (sd->flag & SD_EMISSION) {
-    return sd->closure_emission_background;
-  }
-  else {
-    return zero_float3();
-  }
-}
-
-/* Emission */
-
-ccl_device float3 shader_emissive_eval(ccl_private const ShaderData *sd)
-{
-  if (sd->flag & SD_EMISSION) {
-    return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
-  }
-  else {
-    return zero_float3();
-  }
-}
-
-/* Holdout */
-
-ccl_device float3 shader_holdout_apply(KernelGlobals kg, ccl_private ShaderData *sd)
-{
-  float3 weight = zero_float3();
-
-  /* For objects marked as holdout, preserve transparency and remove all other
-   * closures, replacing them with a holdout weight. */
-  if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-    if ((sd->flag & SD_TRANSPARENT) && !(sd->flag & SD_HAS_ONLY_VOLUME)) {
-      weight = one_float3() - sd->closure_transparent_extinction;
-
-      for (int i = 0; i < sd->num_closure; i++) {
-        ccl_private ShaderClosure *sc = &sd->closure[i];
-        if (!CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
-          sc->type = NBUILTIN_CLOSURES;
-        }
-      }
-
-      sd->flag &= ~(SD_CLOSURE_FLAGS - (SD_TRANSPARENT | SD_BSDF));
-    }
-    else {
-      weight = one_float3();
-    }
-  }
-  else {
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-      if (CLOSURE_IS_HOLDOUT(sc->type)) {
-        weight += sc->weight;
-      }
-    }
-  }
-
-  return weight;
-}
-
-/* Surface Evaluation */
-
-template<uint node_feature_mask, typename ConstIntegratorGenericState>
-ccl_device void shader_eval_surface(KernelGlobals kg,
-                                    ConstIntegratorGenericState state,
-                                    ccl_private ShaderData *ccl_restrict sd,
-                                    ccl_global float *ccl_restrict buffer,
-                                    uint32_t path_flag,
-                                    bool use_caustics_storage = false)
-{
-  /* If path is being terminated, we are tracing a shadow ray or evaluating
-   * emission, then we don't need to store closures. The emission and shadow
-   * shader data also do not have a closure array to save GPU memory. */
-  int max_closures;
-  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
-    max_closures = 0;
-  }
-  else {
-    max_closures = use_caustics_storage ? CAUSTICS_MAX_CLOSURE : kernel_data.max_closures;
-  }
-
-  sd->num_closure = 0;
-  sd->num_closure_left = max_closures;
-
-#ifdef __OSL__
-  if (kg->osl) {
-    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
-      OSLShader::eval_background(kg, state, sd, path_flag);
-    }
-    else {
-      OSLShader::eval_surface(kg, state, sd, path_flag);
-    }
-  }
-  else
-#endif
-  {
-#ifdef __SVM__
-    svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(kg, state, sd, buffer, path_flag);
-#else
-    if (sd->object == OBJECT_NONE) {
-      sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
-      sd->flag |= SD_EMISSION;
-    }
-    else {
-      ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
-          sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f));
-      if (bsdf != NULL) {
-        bsdf->N = sd->N;
-        sd->flag |= bsdf_diffuse_setup(bsdf);
-      }
-    }
-#endif
-  }
-}
-
-/* Volume */
-
-#ifdef __VOLUME__
-
-ccl_device_inline float _shader_volume_phase_multi_eval(
-    ccl_private const ShaderData *sd,
-    ccl_private const ShaderVolumePhases *phases,
-    const float3 omega_in,
-    int skip_phase,
-    ccl_private BsdfEval *result_eval,
-    float sum_pdf,
-    float sum_sample_weight)
-{
-  for (int i = 0; i < phases->num_closure; i++) {
-    if (i == skip_phase)
-      continue;
-
-    ccl_private const ShaderVolumeClosure *svc = &phases->closure[i];
-    float phase_pdf = 0.0f;
-    float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
-
-    if (phase_pdf != 0.0f) {
-      bsdf_eval_accum(result_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
-      sum_pdf += phase_pdf * svc->sample_weight;
-    }
-
-    sum_sample_weight += svc->sample_weight;
-  }
-
-  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-ccl_device float shader_volume_phase_eval(KernelGlobals kg,
-                                          ccl_private const ShaderData *sd,
-                                          ccl_private const ShaderVolumePhases *phases,
-                                          const float3 omega_in,
-                                          ccl_private BsdfEval *phase_eval)
-{
-  bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_float3());
-
-  return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
-}
-
-ccl_device int shader_volume_phase_sample(KernelGlobals kg,
-                                          ccl_private const ShaderData *sd,
-                                          ccl_private const ShaderVolumePhases *phases,
-                                          float randu,
-                                          float randv,
-                                          ccl_private BsdfEval *phase_eval,
-                                          ccl_private float3 *omega_in,
-                                          ccl_private differential3 *domega_in,
-                                          ccl_private float *pdf)
-{
-  int sampled = 0;
-
-  if (phases->num_closure > 1) {
-    /* pick a phase closure based on sample weights */
-    float sum = 0.0f;
-
-    for (sampled = 0; sampled < phases->num_closure; sampled++) {
-      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
-      sum += svc->sample_weight;
-    }
-
-    float r = randu * sum;
-    float partial_sum = 0.0f;
-
-    for (sampled = 0; sampled < phases->num_closure; sampled++) {
-      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
-      float next_sum = partial_sum + svc->sample_weight;
-
-      if (r <= next_sum) {
-        /* Rescale to reuse for BSDF direction sample. */
-        randu = (r - partial_sum) / svc->sample_weight;
-        break;
-      }
-
-      partial_sum = next_sum;
-    }
-
-    if (sampled == phases->num_closure) {
-      *pdf = 0.0f;
-      return LABEL_NONE;
-    }
-  }
-
-  /* todo: this isn't quite correct, we don't weight anisotropy properly
-   * depending on color channels, even if this is perhaps not a common case */
-  ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
-  int label;
-  float3 eval = zero_float3();
-
-  *pdf = 0.0f;
-  label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
-
-  if (*pdf != 0.0f) {
-    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
-  }
-
-  return label;
-}
-
-ccl_device int shader_phase_sample_closure(KernelGlobals kg,
-                                           ccl_private const ShaderData *sd,
-                                           ccl_private const ShaderVolumeClosure *sc,
-                                           float randu,
-                                           float randv,
-                                           ccl_private BsdfEval *phase_eval,
-                                           ccl_private float3 *omega_in,
-                                           ccl_private differential3 *domega_in,
-                                           ccl_private float *pdf)
-{
-  int label;
-  float3 eval = zero_float3();
-
-  *pdf = 0.0f;
-  label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
-  if (*pdf != 0.0f)
-    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
-
-  return label;
-}
-
-/* Volume Evaluation */
-
-template<const bool shadow, typename StackReadOp, typename ConstIntegratorGenericState>
-ccl_device_inline void shader_eval_volume(KernelGlobals kg,
-                                          ConstIntegratorGenericState state,
-                                          ccl_private ShaderData *ccl_restrict sd,
-                                          const uint32_t path_flag,
-                                          StackReadOp stack_read)
-{
-  /* If path is being terminated, we are tracing a shadow ray or evaluating
-   * emission, then we don't need to store closures. The emission and shadow
-   * shader data also do not have a closure array to save GPU memory. */
-  int max_closures;
-  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
-    max_closures = 0;
-  }
-  else {
-    max_closures = kernel_data.max_closures;
-  }
-
-  /* reset closures once at the start, we will be accumulating the closures
-   * for all volumes in the stack into a single array of closures */
-  sd->num_closure = 0;
-  sd->num_closure_left = max_closures;
-  sd->flag = 0;
-  sd->object_flag = 0;
-
-  for (int i = 0;; i++) {
-    const VolumeStack entry = stack_read(i);
-    if (entry.shader == SHADER_NONE) {
-      break;
-    }
-
-    /* Setup shader-data from stack. it's mostly setup already in
-     * shader_setup_from_volume, this switching should be quick. */
-    sd->object = entry.object;
-    sd->lamp = LAMP_NONE;
-    sd->shader = entry.shader;
-
-    sd->flag &= ~SD_SHADER_FLAGS;
-    sd->flag |= kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).flags;
-    sd->object_flag &= ~SD_OBJECT_FLAGS;
-
-    if (sd->object != OBJECT_NONE) {
-      sd->object_flag |= kernel_data_fetch(object_flag, sd->object);
-
-#  ifdef __OBJECT_MOTION__
-      /* todo: this is inefficient for motion blur, we should be
-       * caching matrices instead of recomputing them each step */
-      shader_setup_object_transforms(kg, sd, sd->time);
-
-      if ((sd->object_flag & SD_OBJECT_HAS_VOLUME_MOTION) != 0) {
-        AttributeDescriptor v_desc = find_attribute(kg, sd, ATTR_STD_VOLUME_VELOCITY);
-        kernel_assert(v_desc.offset != ATTR_STD_NOT_FOUND);
-
-        const float3 P = sd->P;
-        const float velocity_scale = kernel_data_fetch(objects, sd->object).velocity_scale;
-        const float time_offset = kernel_data.cam.motion_position == MOTION_POSITION_CENTER ?
-                                      0.5f :
-                                      0.0f;
-        const float time = kernel_data.cam.motion_position == MOTION_POSITION_END ?
-                               (1.0f - kernel_data.cam.shuttertime) + sd->time :
-                               sd->time;
-
-        /* Use a 1st order semi-lagrangian advection scheme to estimate what volume quantity
-         * existed, or will exist, at the given time:
-         *
-         * `phi(x, T) = phi(x - (T - t) * u(x, T), t)`
-         *
-         * where
-         *
-         * x : position
-         * T : super-sampled time (or ray time)
-         * t : current time of the simulation (in rendering we assume this is center frame with
-         * relative time = 0)
-         * phi : the volume quantity
-         * u : the velocity field
-         *
-         * But first we need to determine the velocity field `u(x, T)`, which we can estimate also
-         * using semi-lagrangian advection.
-         *
-         * `u(x, T) = u(x - (T - t) * u(x, T), t)`
-         *
-         * This is the typical way to model self-advection in fluid dynamics, however, we do not
-         * account for other forces affecting the velocity during simulation (pressure, buoyancy,
-         * etc.): this gives a linear interpolation when fluid are mostly "curvy". For better
-         * results, a higher order interpolation scheme can be used (at the cost of more lookups),
-         * or an interpolation of the velocity fields for the previous and next frames could also
-         * be used to estimate `u(x, T)` (which will cost more memory and lookups).
-         *
-         * References:
-         * "Eulerian Motion Blur", Kim and Ko, 2007
-         * "Production Volume Rendering", Wreninge et al., 2012
-         */
-
-        /* Find velocity. */
-        float3 velocity = primitive_volume_attribute_float3(kg, sd, v_desc);
-        object_dir_transform(kg, sd, &velocity);
-
-        /* Find advected P. */
-        sd->P = P - (time - time_offset) * velocity_scale * velocity;
-
-        /* Find advected velocity. */
-        velocity = primitive_volume_attribute_float3(kg, sd, v_desc);
-        object_dir_transform(kg, sd, &velocity);
-
-        /* Find advected P. */
-        sd->P = P - (time - time_offset) * velocity_scale * velocity;
-      }
-#  endif
-    }
-
-    /* evaluate shader */
-#  ifdef __SVM__
-#    ifdef __OSL__
-    if (kg->osl) {
-      OSLShader::eval_volume(kg, state, sd, path_flag);
-    }
-    else
-#    endif
-    {
-      svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
-          kg, state, sd, NULL, path_flag);
-    }
-#  endif
-
-    /* Merge closures to avoid exceeding number of closures limit. */
-    if (!shadow) {
-      if (i > 0) {
-        shader_merge_volume_closures(sd);
-      }
-    }
-  }
-}
-
-#endif /* __VOLUME__ */
-
-/* Displacement Evaluation */
-
-template<typename ConstIntegratorGenericState>
-ccl_device void shader_eval_displacement(KernelGlobals kg,
-                                         ConstIntegratorGenericState state,
-                                         ccl_private ShaderData *sd)
-{
-  sd->num_closure = 0;
-  sd->num_closure_left = 0;
-
-  /* this will modify sd->P */
-#ifdef __SVM__
-#  ifdef __OSL__
-  if (kg->osl)
-    OSLShader::eval_displacement(kg, state, sd);
-  else
-#  endif
-  {
-    svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
-        kg, state, sd, NULL, 0);
-  }
-#endif
-}
-
-/* Cryptomatte */
-
-ccl_device float shader_cryptomatte_id(KernelGlobals kg, int shader)
-{
-  return kernel_data_fetch(shaders, (shader & SHADER_MASK)).cryptomatte_id;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shadow_catcher.h b/intern/cycles/kernel/integrator/shadow_catcher.h
index ff63625aceb..a620853faea 100644
--- a/intern/cycles/kernel/integrator/shadow_catcher.h
+++ b/intern/cycles/kernel/integrator/shadow_catcher.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include "kernel/film/write_passes.h"
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/state_util.h"
 
@@ -76,28 +75,6 @@ ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(const uint32_t
   return path_flag & PATH_RAY_SHADOW_CATCHER_PASS;
 }
 
-/* Write shadow catcher passes on a bounce from the shadow catcher object. */
-ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
-    KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
-{
-  kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
-  kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-
-  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
-  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
-                                        kernel_data.film.pass_stride;
-  ccl_global float *buffer = render_buffer + render_buffer_offset;
-
-  /* Count sample for the shadow catcher object. */
-  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
-
-  /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
-   * transparency to the matte. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
-                          average(throughput));
-}
-
 #endif /* __SHADOW_CATCHER__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h
index c340467606d..d731d1df339 100644
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -27,19 +27,29 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T
 /* enum PathRayFlag */
 KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput for shadow pass. */
 KERNEL_STRUCT_MEMBER(shadow_path,
-                     packed_float3,
+                     PackedSpectrum,
                      unshadowed_throughput,
                      KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Number of intersections found by ray-tracing. */
 KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
 /* Light group. */
 KERNEL_STRUCT_MEMBER(shadow_path, uint8_t, lightgroup, KERNEL_FEATURE_PATH_TRACING)
+/* Path guiding. */
+KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, unlit_throughput, KERNEL_FEATURE_PATH_GUIDING)
+#ifdef __PATH_GUIDING__
+KERNEL_STRUCT_MEMBER(shadow_path,
+                     openpgl::cpp::PathSegment *,
+                     path_segment,
+                     KERNEL_FEATURE_PATH_GUIDING)
+#else
+KERNEL_STRUCT_MEMBER(shadow_path, uint64_t, path_segment, KERNEL_FEATURE_PATH_GUIDING)
+#endif
 KERNEL_STRUCT_END(shadow_path)
 
 /********************************** Shadow Ray *******************************/
diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h
index d10d31e930e..f0fdc6f0d54 100644
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -31,6 +31,10 @@
 
 #include "util/types.h"
 
+#ifdef __PATH_GUIDING__
+#  include "util/guiding.h"
+#endif
+
 #pragma once
 
 CCL_NAMESPACE_BEGIN
@@ -140,7 +144,7 @@ typedef struct IntegratorStateGPU {
  * happen from a kernel which operates on a "main" path. Attempt to use shadow catcher accessors
  * from a kernel which operates on a shadow catcher state will cause bad memory access. */
 
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 
 /* Scalar access on CPU. */
 
@@ -159,7 +163,7 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
 #  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
     ((state)->nested_struct[array_index].member)
 
-#else /* __KERNEL_CPU__ */
+#else /* !__KERNEL_GPU__ */
 
 /* Array access on GPU with Structure-of-Arrays. */
 
@@ -180,6 +184,6 @@ typedef int ConstIntegratorShadowState;
 #  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
     INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member)
 
-#endif /* __KERNEL_CPU__ */
+#endif /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index 4b03c665e17..40961b1c5fb 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -76,6 +76,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
       &kernel_integrator_state.next_shadow_path_index[0], 1);
   atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+#  ifdef __PATH_GUIDING__
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr;
+#  endif
   return shadow_state;
 }
 
@@ -181,6 +184,9 @@ ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
 {
   IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+#  ifdef __PATH_GUIDING__
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, path_segment) = nullptr;
+#  endif
   return shadow_state;
 }
 
diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h
index 5c2af131945..610621f0abe 100644
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -46,12 +46,15 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 /* Continuation probability for path termination. */
 KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Factor to multiply with throughput to remove any guiding PDFs.
+ * Such throughput without guiding PDFs is used for Russian roulette termination. */
+KERNEL_STRUCT_MEMBER(path, float, unguided_throughput, KERNEL_FEATURE_PATH_GUIDING)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Denoising. */
-KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
 /* Shader sorting. */
 /* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
 KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
@@ -84,8 +87,8 @@ KERNEL_STRUCT_END(isect)
 /*************** Subsurface closure state for subsurface kernel ***************/
 
 KERNEL_STRUCT_BEGIN(subsurface)
-KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, radius, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_END(subsurface)
@@ -98,3 +101,33 @@ KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_END_ARRAY(volume_stack,
                         KERNEL_STRUCT_VOLUME_STACK_SIZE,
                         KERNEL_STRUCT_VOLUME_STACK_SIZE)
+
+/************************************ Path Guiding *****************************/
+KERNEL_STRUCT_BEGIN(guiding)
+#ifdef __PATH_GUIDING__
+/* Current path segment of the random walk/path. */
+KERNEL_STRUCT_MEMBER(guiding,
+                     openpgl::cpp::PathSegment *,
+                     path_segment,
+                     KERNEL_FEATURE_PATH_GUIDING)
+#else
+/* Current path segment of the random walk/path. */
+KERNEL_STRUCT_MEMBER(guiding, uint64_t, path_segment, KERNEL_FEATURE_PATH_GUIDING)
+#endif
+/* Whether surface guiding is enabled. */
+KERNEL_STRUCT_MEMBER(guiding, bool, use_surface_guiding, KERNEL_FEATURE_PATH_GUIDING)
+/* Random number used for additional guiding decisions (e.g., cache query, selection to use guiding
+ * or BSDF sampling) */
+KERNEL_STRUCT_MEMBER(guiding, float, sample_surface_guiding_rand, KERNEL_FEATURE_PATH_GUIDING)
+/* The probability to use surface guiding (i.e., diffuse sampling prob * guiding prob). */
+KERNEL_STRUCT_MEMBER(guiding, float, surface_guiding_sampling_prob, KERNEL_FEATURE_PATH_GUIDING)
+/* Probability of sampling a BSSRDF closure instead of a BSDF closure. */
+KERNEL_STRUCT_MEMBER(guiding, float, bssrdf_sampling_prob, KERNEL_FEATURE_PATH_GUIDING)
+/* Whether volume guiding is enabled. */
+KERNEL_STRUCT_MEMBER(guiding, bool, use_volume_guiding, KERNEL_FEATURE_PATH_GUIDING)
+/* Random number used for additional guiding decisions (e.g., cache query, selection to use guiding
+ * or BSDF sampling) */
+KERNEL_STRUCT_MEMBER(guiding, float, sample_volume_guiding_rand, KERNEL_FEATURE_PATH_GUIDING)
+/* The probability to use volume guiding (counterpart of surface_guiding_sampling_prob). */
+KERNEL_STRUCT_MEMBER(guiding, float, volume_guiding_sampling_prob, KERNEL_FEATURE_PATH_GUIDING)
+KERNEL_STRUCT_END(guiding)
diff --git a/intern/cycles/kernel/integrator/state_util.h b/intern/cycles/kernel/integrator/state_util.h
index 8dd58ad6bcd..168122d3a78 100644
--- a/intern/cycles/kernel/integrator/state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -338,7 +338,7 @@ ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGl
   return to_state;
 }
 
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 ccl_device_inline int integrator_state_bounce(ConstIntegratorState state, const int)
 {
   return INTEGRATOR_STATE(state, path, bounce);
diff --git a/intern/cycles/kernel/integrator/subsurface.h b/intern/cycles/kernel/integrator/subsurface.h
index 2f96f215d8a..efd293e4141 100644
--- a/intern/cycles/kernel/integrator/subsurface.h
+++ b/intern/cycles/kernel/integrator/subsurface.h
@@ -15,9 +15,9 @@
 
 #include "kernel/integrator/intersect_volume_stack.h"
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
 #include "kernel/integrator/subsurface_disk.h"
 #include "kernel/integrator/subsurface_random_walk.h"
+#include "kernel/integrator/surface_shader.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -51,12 +51,10 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
                                                                  PATH_RAY_SUBSURFACE_RANDOM_WALK);
 
   /* Compute weight, optionally including Fresnel from entry point. */
-  float3 weight = shader_bssrdf_sample_weight(sd, sc);
-#  ifdef __PRINCIPLED__
+  Spectrum weight = surface_shader_bssrdf_sample_weight(sd, sc);
   if (bssrdf->roughness != FLT_MAX) {
     path_flag |= PATH_RAY_SUBSURFACE_USE_FRESNEL;
   }
-#  endif
 
   if (sd->flag & SD_BACKFACING) {
     path_flag |= PATH_RAY_SUBSURFACE_BACKFACING;
@@ -70,8 +68,8 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
     if (INTEGRATOR_STATE(state, path, bounce) == 0) {
-      INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
-      INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
+      INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_spectrum();
+      INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_spectrum();
     }
   }
 
@@ -80,6 +78,9 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, subsurface, radius) = bssrdf->radius;
   INTEGRATOR_STATE_WRITE(state, subsurface, anisotropy) = bssrdf->anisotropy;
 
+  /* Path guiding. */
+  guiding_record_bssrdf_weight(kg, state, weight, bssrdf->albedo);
+
   return LABEL_SUBSURFACE_SCATTER;
 }
 
@@ -91,7 +92,7 @@ ccl_device void subsurface_shader_data_setup(KernelGlobals kg,
   /* Get bump mapped normal from shader evaluation at exit point. */
   float3 N = sd->N;
   if (sd->flag & SD_HAS_BSSRDF_BUMP) {
-    N = shader_bssrdf_normal(sd);
+    N = surface_shader_bssrdf_normal(sd);
   }
 
   /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. */
@@ -99,9 +100,8 @@ ccl_device void subsurface_shader_data_setup(KernelGlobals kg,
   sd->num_closure = 0;
   sd->num_closure_left = kernel_data.max_closures;
 
-  const float3 weight = one_float3();
+  const Spectrum weight = one_spectrum();
 
-#  ifdef __PRINCIPLED__
   if (path_flag & PATH_RAY_SUBSURFACE_USE_FRESNEL) {
     ccl_private PrincipledDiffuseBsdf *bsdf = (ccl_private PrincipledDiffuseBsdf *)bsdf_alloc(
         sd, sizeof(PrincipledDiffuseBsdf), weight);
@@ -112,9 +112,7 @@ ccl_device void subsurface_shader_data_setup(KernelGlobals kg,
       sd->flag |= bsdf_principled_diffuse_setup(bsdf, PRINCIPLED_DIFFUSE_LAMBERT_EXIT);
     }
   }
-  else
-#  endif /* __PRINCIPLED__ */
-  {
+  else {
     ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
         sd, sizeof(DiffuseBsdf), weight);
 
diff --git a/intern/cycles/kernel/integrator/subsurface_disk.h b/intern/cycles/kernel/integrator/subsurface_disk.h
index 2836934f6dd..16fb45392f4 100644
--- a/intern/cycles/kernel/integrator/subsurface_disk.h
+++ b/intern/cycles/kernel/integrator/subsurface_disk.h
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
+#include "kernel/integrator/guiding.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* BSSRDF using disk based importance sampling.
@@ -9,11 +11,11 @@ CCL_NAMESPACE_BEGIN
  * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
  */
 
-ccl_device_inline float3 subsurface_disk_eval(const float3 radius, float disk_r, float r)
+ccl_device_inline Spectrum subsurface_disk_eval(const Spectrum radius, float disk_r, float r)
 {
-  const float3 eval = bssrdf_eval(radius, r);
+  const Spectrum eval = bssrdf_eval(radius, r);
   const float pdf = bssrdf_pdf(radius, disk_r);
-  return (pdf > 0.0f) ? eval / pdf : zero_float3();
+  return (pdf > 0.0f) ? eval / pdf : zero_spectrum();
 }
 
 /* Subsurface scattering step, from a point on the surface to other
@@ -25,8 +27,7 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
                                        ccl_private LocalIntersection &ss_isect)
 
 {
-  float disk_u, disk_v;
-  path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &disk_u, &disk_v);
+  float2 rand_disk = path_state_rng_2D(kg, &rng_state, PRNG_SUBSURFACE_DISK);
 
   /* Read shading point info from integrator state. */
   const float3 P = INTEGRATOR_STATE(state, ray, P);
@@ -37,7 +38,7 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
 
   /* Read subsurface scattering parameters. */
-  const float3 radius = INTEGRATOR_STATE(state, subsurface, radius);
+  const Spectrum radius = INTEGRATOR_STATE(state, subsurface, radius);
 
   /* Pick random axis in local frame and point on disk. */
   float3 disk_N, disk_T, disk_B;
@@ -46,20 +47,20 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   disk_N = Ng;
   make_orthonormals(disk_N, &disk_T, &disk_B);
 
-  if (disk_v < 0.5f) {
+  if (rand_disk.y < 0.5f) {
     pick_pdf_N = 0.5f;
     pick_pdf_T = 0.25f;
     pick_pdf_B = 0.25f;
-    disk_v *= 2.0f;
+    rand_disk.y *= 2.0f;
   }
-  else if (disk_v < 0.75f) {
+  else if (rand_disk.y < 0.75f) {
     float3 tmp = disk_N;
     disk_N = disk_T;
     disk_T = tmp;
     pick_pdf_N = 0.25f;
     pick_pdf_T = 0.5f;
     pick_pdf_B = 0.25f;
-    disk_v = (disk_v - 0.5f) * 4.0f;
+    rand_disk.y = (rand_disk.y - 0.5f) * 4.0f;
   }
   else {
     float3 tmp = disk_N;
@@ -68,14 +69,14 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
     pick_pdf_N = 0.25f;
     pick_pdf_T = 0.25f;
     pick_pdf_B = 0.5f;
-    disk_v = (disk_v - 0.75f) * 4.0f;
+    rand_disk.y = (rand_disk.y - 0.75f) * 4.0f;
   }
 
   /* Sample point on disk. */
-  float phi = M_2PI_F * disk_v;
+  float phi = M_2PI_F * rand_disk.y;
   float disk_height, disk_r;
 
-  bssrdf_sample(radius, disk_u, &disk_r, &disk_height);
+  bssrdf_sample(radius, rand_disk.x, &disk_r, &disk_height);
 
   float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
 
@@ -108,7 +109,7 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
    * traversal algorithm. */
   sort_intersections_and_normals(ss_isect.hits, ss_isect.Ng, num_eval_hits);
 
-  float3 weights[BSSRDF_MAX_HITS]; /* TODO: zero? */
+  Spectrum weights[BSSRDF_MAX_HITS]; /* TODO: zero? */
   float sum_weights = 0.0f;
 
   for (int hit = 0; hit < num_eval_hits; hit++) {
@@ -126,17 +127,8 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
     if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
       /* Transform normal to world space. */
       Transform itfm;
-      Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
+      object_fetch_transform_motion_test(kg, object, time, &itfm);
       hit_Ng = normalize(transform_direction_transposed(&itfm, hit_Ng));
-
-      /* Transform t to world space, except for OptiX and MetalRT where it already is. */
-#ifdef __KERNEL_GPU_RAYTRACING__
-      (void)tfm;
-#else
-      float3 D = transform_direction(&itfm, ray.D);
-      D = normalize(D) * ss_isect.hits[hit].t;
-      ss_isect.hits[hit].t = len(transform_direction(&tfm, D));
-#endif
     }
 
     /* Quickly retrieve P and Ng without setting up ShaderData. */
@@ -159,7 +151,7 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
     const float r = len(hit_P - P);
 
     /* Evaluate profiles. */
-    const float3 weight = subsurface_disk_eval(radius, disk_r, r) * w;
+    const Spectrum weight = subsurface_disk_eval(radius, disk_r, r) * w;
 
     /* Store result. */
     ss_isect.Ng[hit] = hit_Ng;
@@ -172,18 +164,19 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   }
 
   /* Use importance resampling, sampling one of the hits proportional to weight. */
-  const float r = lcg_step_float(&lcg_state) * sum_weights;
+  const float rand_resample = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_DISK_RESAMPLE);
+  const float r = rand_resample * sum_weights;
   float partial_sum = 0.0f;
 
   for (int hit = 0; hit < num_eval_hits; hit++) {
-    const float3 weight = weights[hit];
+    const Spectrum weight = weights[hit];
     const float sample_weight = average(fabs(weight));
     float next_sum = partial_sum + sample_weight;
 
     if (r < next_sum) {
       /* Return exit point. */
-      INTEGRATOR_STATE_WRITE(state, path, throughput) *= weight * sum_weights / sample_weight;
-
+      const Spectrum resampled_weight = weight * sum_weights / sample_weight;
+      INTEGRATOR_STATE_WRITE(state, path, throughput) *= resampled_weight;
       ss_isect.hits[0] = ss_isect.hits[hit];
       ss_isect.Ng[0] = ss_isect.Ng[hit];
 
@@ -191,6 +184,9 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
       ray.D = ss_isect.Ng[hit];
       ray.tmin = 0.0f;
       ray.tmax = 1.0f;
+
+      guiding_record_bssrdf_bounce(
+          kg, state, 1.0f, Ng, -Ng, resampled_weight, INTEGRATOR_STATE(state, subsurface, albedo));
       return true;
     }
 
diff --git a/intern/cycles/kernel/integrator/subsurface_random_walk.h b/intern/cycles/kernel/integrator/subsurface_random_walk.h
index c1691030817..fdcb66c32f5 100644
--- a/intern/cycles/kernel/integrator/subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -5,6 +5,8 @@
 
 #include "kernel/bvh/bvh.h"
 
+#include "kernel/integrator/guiding.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Random walk subsurface scattering.
@@ -65,19 +67,20 @@ ccl_device void subsurface_random_walk_remap(const float albedo,
   *sigma_t = sigma_t_prime / (1.0f - g);
 }
 
-ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
-                                                    const float3 radius,
+ccl_device void subsurface_random_walk_coefficients(const Spectrum albedo,
+                                                    const Spectrum radius,
                                                     const float anisotropy,
-                                                    ccl_private float3 *sigma_t,
-                                                    ccl_private float3 *alpha,
-                                                    ccl_private float3 *throughput)
+                                                    ccl_private Spectrum *sigma_t,
+                                                    ccl_private Spectrum *alpha,
+                                                    ccl_private Spectrum *throughput)
 {
-  float sigma_t_x, sigma_t_y, sigma_t_z;
-  float alpha_x, alpha_y, alpha_z;
-
-  subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x);
-  subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y);
-  subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z);
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    subsurface_random_walk_remap(GET_SPECTRUM_CHANNEL(albedo, i),
+                                 GET_SPECTRUM_CHANNEL(radius, i),
+                                 anisotropy,
+                                 &GET_SPECTRUM_CHANNEL(*sigma_t, i),
+                                 &GET_SPECTRUM_CHANNEL(*alpha, i));
+  }
 
   /* Throughput already contains closure weight at this point, which includes the
    * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo
@@ -88,21 +91,12 @@ ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
    * infinite phase functions. To avoid a sharp discontinuity as we go from
    * such values to 0.0, increase alpha and reduce the throughput to compensate. */
   const float min_alpha = 0.2f;
-  if (alpha_x < min_alpha) {
-    (*throughput).x *= alpha_x / min_alpha;
-    alpha_x = min_alpha;
-  }
-  if (alpha_y < min_alpha) {
-    (*throughput).y *= alpha_y / min_alpha;
-    alpha_y = min_alpha;
-  }
-  if (alpha_z < min_alpha) {
-    (*throughput).z *= alpha_z / min_alpha;
-    alpha_z = min_alpha;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    if (GET_SPECTRUM_CHANNEL(*alpha, i) < min_alpha) {
+      GET_SPECTRUM_CHANNEL(*throughput, i) *= GET_SPECTRUM_CHANNEL(*alpha, i) / min_alpha;
+      GET_SPECTRUM_CHANNEL(*alpha, i) = min_alpha;
+    }
   }
-
-  *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
-  *alpha = make_float3(alpha_x, alpha_y, alpha_z);
 }
 
 /* References for Dwivedi sampling:
@@ -151,12 +145,12 @@ ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, f
   return dir.x * T + dir.y * B + dir.z * D;
 }
 
-ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
-                                                         float t,
-                                                         bool hit,
-                                                         ccl_private float3 *transmittance)
+ccl_device_forceinline Spectrum subsurface_random_walk_pdf(Spectrum sigma_t,
+                                                           float t,
+                                                           bool hit,
+                                                           ccl_private Spectrum *transmittance)
 {
-  float3 T = volume_color_transmittance(sigma_t, t);
+  Spectrum T = volume_color_transmittance(sigma_t, t);
   if (transmittance) {
     *transmittance = T;
   }
@@ -173,8 +167,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
                                               ccl_private Ray &ray,
                                               ccl_private LocalIntersection &ss_isect)
 {
-  float bssrdf_u, bssrdf_v;
-  path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+  const float2 rand_bsdf = path_state_rng_2D(kg, &rng_state, PRNG_SUBSURFACE_BSDF);
 
   const float3 P = INTEGRATOR_STATE(state, ray, P);
   const float3 N = INTEGRATOR_STATE(state, ray, D);
@@ -187,7 +180,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   /* Sample diffuse surface scatter into the object. */
   float3 D;
   float pdf;
-  sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf);
+  sample_cos_hemisphere(-N, rand_bsdf.x, rand_bsdf.y, &D, &pdf);
   if (dot(-Ng, D) <= 0.0f) {
     return false;
   }
@@ -205,22 +198,16 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   ray.self.light_object = OBJECT_NONE;
   ray.self.light_prim = PRIM_NONE;
 
-#ifndef __KERNEL_GPU_RAYTRACING__
-  /* Compute or fetch object transforms. */
-  Transform ob_itfm ccl_optional_struct_init;
-  Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm);
-#endif
-
   /* Convert subsurface to volume coefficients.
    * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
-  const float3 albedo = INTEGRATOR_STATE(state, subsurface, albedo);
-  const float3 radius = INTEGRATOR_STATE(state, subsurface, radius);
+  const Spectrum albedo = INTEGRATOR_STATE(state, subsurface, albedo);
+  const Spectrum radius = INTEGRATOR_STATE(state, subsurface, radius);
   const float anisotropy = INTEGRATOR_STATE(state, subsurface, anisotropy);
 
-  float3 sigma_t, alpha;
-  float3 throughput = INTEGRATOR_STATE_WRITE(state, path, throughput);
+  Spectrum sigma_t, alpha;
+  Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
   subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput);
-  float3 sigma_s = sigma_t * alpha;
+  Spectrum sigma_s = sigma_t * alpha;
 
   /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
    * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
@@ -243,7 +230,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f));
 
   /* Modify state for RNGs, decorrelated from other paths. */
-  rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
+  rng_state.rng_hash = hash_hp_seeded_uint(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
 
   /* Random walk until we hit the surface again. */
   bool hit = false;
@@ -255,10 +242,10 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f));
 
 #ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
-  float3 sigma_s_star = sigma_s * (1.0f - anisotropy);
-  float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star;
-  float3 sigma_t_org = sigma_t;
-  float3 sigma_s_org = sigma_s;
+  Spectrum sigma_s_star = sigma_s * (1.0f - anisotropy);
+  Spectrum sigma_t_star = sigma_t - sigma_s + sigma_s_star;
+  Spectrum sigma_t_org = sigma_t;
+  Spectrum sigma_s_org = sigma_s;
   const float anisotropy_org = anisotropy;
   const float guided_fraction_org = guided_fraction;
 #endif
@@ -270,7 +257,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
 #ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
     // shadow with local variables according to depth
     float anisotropy, guided_fraction;
-    float3 sigma_s, sigma_t;
+    Spectrum sigma_s, sigma_t;
     if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) {
       anisotropy = anisotropy_org;
       guided_fraction = guided_fraction_org;
@@ -286,11 +273,11 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
 #endif
 
     /* Sample color channel, use MIS with balance heuristic. */
-    float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL);
-    float3 channel_pdf;
+    float rphase = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_PHASE_CHANNEL);
+    Spectrum channel_pdf;
     int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
     float sample_sigma_t = volume_channel_get(sigma_t, channel);
-    float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE);
+    float randt = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_SCATTER_DISTANCE);
 
     /* We need the result of the ray-cast to compute the full guided PDF, so just remember the
      * relevant terms to avoid recomputing them later. */
@@ -303,7 +290,8 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
     /* For the initial ray, we already know the direction, so just do classic distance sampling. */
     if (bounce > 0) {
       /* Decide whether we should use guided or classic sampling. */
-      bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction);
+      bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_GUIDE_STRATEGY) <
+                     guided_fraction);
 
       /* Determine if we want to sample away from the incoming interface.
        * This only happens if we found a nearby opposite interface, and the probability for it
@@ -317,27 +305,28 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
         float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance);
         backward_fraction = 1.0f /
                             (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length));
-        guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction;
+        guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_GUIDE_DIRECTION) <
+                         backward_fraction;
       }
 
       /* Sample scattering direction. */
-      float scatter_u, scatter_v;
-      path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v);
+      const float2 rand_scatter = path_state_rng_2D(kg, &rng_state, PRNG_SUBSURFACE_BSDF);
       float cos_theta;
       float hg_pdf;
       if (guided) {
-        cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
+        cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, rand_scatter.x);
         /* The backwards guiding distribution is just mirrored along `sd->N`, so swapping the
          * sign here is enough to sample from that instead. */
         if (guide_backward) {
           cos_theta = -cos_theta;
         }
-        float3 newD = direction_from_cosine(N, cos_theta, scatter_v);
+        float3 newD = direction_from_cosine(N, cos_theta, rand_scatter.y);
         hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy);
         ray.D = newD;
       }
       else {
-        float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf);
+        float3 newD = henyey_greenstrein_sample(
+            ray.D, anisotropy, rand_scatter.x, rand_scatter.y, &hg_pdf);
         cos_theta = dot(newD, N);
         ray.D = newD;
       }
@@ -363,7 +352,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
       }
     }
 
-    /* Sample direction along ray. */
+    /* Sample distance along ray. */
     float t = -logf(1.0f - randt) / sample_sigma_t;
 
     /* On the first bounce, we use the ray-cast to check if the opposite side is nearby.
@@ -383,15 +372,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
     hit = (ss_isect.num_hits > 0);
 
     if (hit) {
-#ifdef __KERNEL_GPU_RAYTRACING__
-      /* t is always in world space with OptiX and MetalRT. */
       ray.tmax = ss_isect.hits[0].t;
-#else
-      /* Compute world space distance to surface hit. */
-      float3 D = transform_direction(&ob_itfm, ray.D);
-      D = normalize(D) * ss_isect.hits[0].t;
-      ray.tmax = len(transform_direction(&ob_tfm, D));
-#endif
     }
 
     if (bounce == 0) {
@@ -413,16 +394,17 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
     /* Advance to new scatter location. */
     ray.P += t * ray.D;
 
-    float3 transmittance;
-    float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
+    Spectrum transmittance;
+    Spectrum pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
     if (bounce > 0) {
       /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
-      float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
+      Spectrum guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
 
       if (have_opposite_interface) {
         /* First step of MIS: Depending on geometry we might have two methods for guided
          * sampling, so perform MIS between them. */
-        float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
+        Spectrum back_pdf = subsurface_random_walk_pdf(
+            backward_stretching * sigma_t, t, hit, NULL);
         guided_pdf = mix(
             guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
       }
@@ -444,9 +426,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
       /* If we hit the surface, we are done. */
       break;
     }
-    else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
-             throughput.y < VOLUME_THROUGHPUT_EPSILON &&
-             throughput.z < VOLUME_THROUGHPUT_EPSILON) {
+    else if (reduce_max(throughput) < VOLUME_THROUGHPUT_EPSILON) {
       /* Avoid unnecessary work and precision issue when throughput gets really small. */
       break;
     }
@@ -454,6 +434,16 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
 
   if (hit) {
     kernel_assert(isfinite_safe(throughput));
+
+    guiding_record_bssrdf_bounce(
+        kg,
+        state,
+        pdf,
+        N,
+        D,
+        safe_divide_color(throughput, INTEGRATOR_STATE(state, path, throughput)),
+        albedo);
+
     INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput;
   }
 
diff --git a/intern/cycles/kernel/integrator/surface_shader.h b/intern/cycles/kernel/integrator/surface_shader.h
new file mode 100644
index 00000000000..6c0097b11bd
--- /dev/null
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -0,0 +1,860 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+/* Functions to evaluate shaders. */
+
+#pragma once
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/emissive.h"
+
+#include "kernel/integrator/guiding.h"
+
+#ifdef __SVM__
+#  include "kernel/svm/svm.h"
+#endif
+#ifdef __OSL__
+#  include "kernel/osl/osl.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/* Guiding */
+
+#ifdef __PATH_GUIDING__
+/* Decide whether to guide BSDF sampling at this shading point, and store the
+ * sampling probabilities and random number used for it in the guiding state. */
+ccl_device_inline void surface_shader_prepare_guiding(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      ccl_private ShaderData *sd,
+                                                      ccl_private const RNGState *rng_state)
+{
+  /* Have any BSDF to guide? */
+  if (!(kernel_data.integrator.use_surface_guiding && (sd->flag & SD_BSDF_HAS_EVAL))) {
+    state->guiding.use_surface_guiding = false;
+    return;
+  }
+
+  const float surface_guiding_probability = kernel_data.integrator.surface_guiding_probability;
+  float rand_bsdf_guiding = path_state_rng_1D(kg, rng_state, PRNG_SURFACE_BSDF_GUIDING);
+
+  /* Compute proportion of diffuse BSDF and BSSRDFs. */
+  float diffuse_sampling_fraction = 0.0f;
+  float bssrdf_sampling_fraction = 0.0f;
+  float bsdf_bssrdf_sampling_sum = 0.0f;
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+      const float sweight = sc->sample_weight;
+      kernel_assert(sweight >= 0.0f);
+      bsdf_bssrdf_sampling_sum += sweight;
+      if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) && sc->type < CLOSURE_BSDF_TRANSLUCENT_ID) {
+        diffuse_sampling_fraction += sweight;
+      }
+      if (CLOSURE_IS_BSSRDF(sc->type)) {
+        bssrdf_sampling_fraction += sweight;
+      }
+    }
+  }
+
+  if (bsdf_bssrdf_sampling_sum > 0.0f) {
+    diffuse_sampling_fraction /= bsdf_bssrdf_sampling_sum;
+    bssrdf_sampling_fraction /= bsdf_bssrdf_sampling_sum;
+  }
+
+  /* Init guiding (diffuse BSDFs only for now). */
+  if (!(diffuse_sampling_fraction > 0.0f &&
+        guiding_bsdf_init(kg, state, sd->P, sd->N, rand_bsdf_guiding))) {
+    state->guiding.use_surface_guiding = false;
+    return;
+  }
+
+  state->guiding.use_surface_guiding = true;
+  state->guiding.surface_guiding_sampling_prob = surface_guiding_probability *
+                                                 diffuse_sampling_fraction;
+  state->guiding.bssrdf_sampling_prob = bssrdf_sampling_fraction;
+  state->guiding.sample_surface_guiding_rand = rand_bsdf_guiding;
+  kernel_assert(state->guiding.surface_guiding_sampling_prob > 0.0f &&
+                state->guiding.surface_guiding_sampling_prob <= 1.0f);
+}
+#endif
+
+/* Post-process the closures of a shaded surface point before sampling: apply the
+ * integrator's closure filters, defensive sampling, and glossy filtering. */
+ccl_device_inline void surface_shader_prepare_closures(KernelGlobals kg,
+                                                       ConstIntegratorState state,
+                                                       ccl_private ShaderData *sd,
+                                                       const uint32_t path_flag)
+{
+  /* Filter out closures according to the integrator's filter_closures bits. */
+  if (kernel_data.integrator.filter_closures) {
+    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_EMISSION) {
+      sd->closure_emission_background = zero_spectrum();
+    }
+
+    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIRECT_LIGHT) {
+      sd->flag &= ~SD_BSDF_HAS_EVAL;
+    }
+
+    if (path_flag & PATH_RAY_CAMERA) {
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+
+        if ((CLOSURE_IS_BSDF_DIFFUSE(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE)) ||
+            (CLOSURE_IS_BSDF_GLOSSY(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY)) ||
+            (CLOSURE_IS_BSDF_TRANSMISSION(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION))) {
+          sc->type = CLOSURE_NONE_ID;
+          sc->sample_weight = 0.0f;
+        }
+        else if ((CLOSURE_IS_BSDF_TRANSPARENT(sc->type) &&
+                  (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSPARENT))) {
+          sc->type = CLOSURE_HOLDOUT_ID;
+          sc->sample_weight = 0.0f;
+          sd->flag |= SD_HOLDOUT;
+        }
+      }
+    }
+  }
+
+  /* Defensive sampling: before any (transparent) bounce, raise the sample weight of each
+   * BSDF/BSSRDF to at least 1/8 of the total. We can likely also do defensive sampling at
+   * deeper bounces, particularly for cases like a perfect mirror but possibly also
+   * others. This will need a good heuristic. */
+  if (INTEGRATOR_STATE(state, path, bounce) + INTEGRATOR_STATE(state, path, transparent_bounce) ==
+          0 &&
+      sd->num_closure > 1) {
+    float sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
+      }
+    }
+  }
+
+  /* Filter glossy: blurring of BSDF after bounces, for rays that have a small likelihood
+   * of following this particular path (diffuse, rough glossy). Skipped when filter_glossy
+   * is FLT_MAX, and for valid MNEE paths when __MNEE__ is defined. */
+  if (kernel_data.integrator.filter_glossy != FLT_MAX
+#ifdef __MNEE__
+      && !(INTEGRATOR_STATE(state, path, mnee) & PATH_MNEE_VALID)
+#endif
+  ) {
+    float blur_pdf = kernel_data.integrator.filter_glossy *
+                     INTEGRATOR_STATE(state, path, min_ray_pdf);
+
+    if (blur_pdf < 1.0f) {
+      float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
+
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+        if (CLOSURE_IS_BSDF(sc->type)) {
+          bsdf_blur(kg, sc, blur_roughness);
+        }
+      }
+    }
+  }
+}
+
+/* BSDF */
+#if 0
+ccl_device_inline void surface_shader_validate_bsdf_sample(const KernelGlobals kg,
+                                                           const ShaderClosure *sc,
+                                                           const float3 omega_in,
+                                                           const int org_label,
+                                                           const float2 org_roughness,
+                                                           const float org_eta)
+{
+  /* Validate the bsdf_label and bsdf_roughness_eta functions by recomputing
+   * their values after a BSDF sample and comparing with the sampled ones. */
+  const int comp_label = bsdf_label(kg, sc, omega_in);
+  kernel_assert(org_label == comp_label);
+
+  float2 comp_roughness;
+  float comp_eta;
+  bsdf_roughness_eta(kg, sc, &comp_roughness, &comp_eta);
+  kernel_assert(org_eta == comp_eta);
+  kernel_assert(org_roughness.x == comp_roughness.x);
+  kernel_assert(org_roughness.y == comp_roughness.y);
+}
+#endif
+
+/* Check whether a closure of the given type is excluded by the SHADER_EXCLUDE_*
+ * bits set in `light_shader_flags`. */
+ccl_device_forceinline bool _surface_shader_exclude(ClosureType type, uint light_shader_flags)
+{
+  /* Early out when no exclusion bit is set at all. */
+  if ((light_shader_flags & SHADER_EXCLUDE_ANY) == 0) {
+    return false;
+  }
+
+  /* Otherwise the closure is excluded when the bit matching its category is set. */
+  if ((light_shader_flags & SHADER_EXCLUDE_DIFFUSE) && CLOSURE_IS_BSDF_DIFFUSE(type)) {
+    return true;
+  }
+  if ((light_shader_flags & SHADER_EXCLUDE_GLOSSY) && CLOSURE_IS_BSDF_GLOSSY(type)) {
+    return true;
+  }
+  if ((light_shader_flags & SHADER_EXCLUDE_TRANSMIT) && CLOSURE_IS_BSDF_TRANSMISSION(type)) {
+    return true;
+  }
+
+  return false;
+}
+
+/* Accumulate evaluation and sample-weighted pdf of all BSDF closures except
+ * `skip_sc` for direction `omega_in`, on top of the sums passed in. */
+ccl_device_inline float _surface_shader_bsdf_eval_mis(KernelGlobals kg,
+                                                      ccl_private ShaderData *sd,
+                                                      const float3 omega_in,
+                                                      ccl_private const ShaderClosure *skip_sc,
+                                                      ccl_private BsdfEval *result_eval,
+                                                      float sum_pdf,
+                                                      float sum_sample_weight,
+                                                      const uint light_shader_flags)
+{
+  /* This is the veach one-sample model with balance heuristic,
+   * some PDF factors drop out when using balance heuristic weighting. */
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (closure == skip_sc || !CLOSURE_IS_BSDF_OR_BSSRDF(closure->type)) {
+      continue;
+    }
+
+    /* BSSRDFs and excluded BSDFs are not evaluated, but count in the weight sum. */
+    sum_sample_weight += closure->sample_weight;
+    if (!CLOSURE_IS_BSDF(closure->type) ||
+        _surface_shader_exclude(closure->type, light_shader_flags)) {
+      continue;
+    }
+    float closure_pdf = 0.0f;
+    const Spectrum eval = bsdf_eval(kg, sd, closure, omega_in, &closure_pdf);
+    if (closure_pdf != 0.0f) {
+      bsdf_eval_accum(result_eval, closure->type, eval * closure->weight);
+      sum_pdf += closure_pdf * closure->sample_weight;
+    }
+  }
+
+  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+}
+
+/* Evaluate all BSDF closures for direction `omega_in`. Returns the combined pdf and
+ * writes per-closure pdfs, divided by their sum when it is positive, to `pdfs`. */
+ccl_device_inline float surface_shader_bsdf_eval_pdfs(const KernelGlobals kg,
+                                                      ccl_private ShaderData *sd,
+                                                      const float3 omega_in,
+                                                      ccl_private BsdfEval *result_eval,
+                                                      ccl_private float *pdfs,
+                                                      const uint light_shader_flags)
+{
+  /* This is the veach one-sample model with balance heuristic, some pdf
+   * factors drop out when using balance heuristic weighting. */
+  float sum_pdf = 0.0f;
+  float sum_sample_weight = 0.0f;
+  bsdf_eval_init(result_eval, CLOSURE_NONE_ID, zero_spectrum());
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+
+    /* The entry stays zero unless the closure is a non-excluded BSDF with non-zero pdf. */
+    pdfs[i] = 0.0f;
+    if (!CLOSURE_IS_BSDF_OR_BSSRDF(closure->type)) {
+      continue;
+    }
+    sum_sample_weight += closure->sample_weight;
+
+    if (!CLOSURE_IS_BSDF(closure->type) ||
+        _surface_shader_exclude(closure->type, light_shader_flags)) {
+      continue;
+    }
+
+    float closure_pdf = 0.0f;
+    const Spectrum eval = bsdf_eval(kg, sd, closure, omega_in, &closure_pdf);
+    kernel_assert(closure_pdf >= 0.0f);
+    if (closure_pdf != 0.0f) {
+      bsdf_eval_accum(result_eval, closure->type, eval * closure->weight);
+      sum_pdf += closure_pdf * closure->sample_weight;
+      kernel_assert(closure_pdf * closure->sample_weight >= 0.0f);
+      pdfs[i] = closure_pdf * closure->sample_weight;
+    }
+  }
+
+  if (sum_pdf > 0.0f) {
+    for (int i = 0; i < sd->num_closure; i++) {
+      pdfs[i] /= sum_pdf;
+    }
+  }
+
+  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+}
+
+/* Evaluate all BSDFs for direction `omega_in` into `bsdf_eval` and return the combined
+ * pdf, mixed with the guiding pdf when surface guiding is compiled in and active. */
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_inline
+#endif
+    float
+    surface_shader_bsdf_eval(KernelGlobals kg,
+                             IntegratorState state,
+                             ccl_private ShaderData *sd,
+                             const float3 omega_in,
+                             ccl_private BsdfEval *bsdf_eval,
+                             const uint light_shader_flags)
+{
+  bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, zero_spectrum());
+  float pdf = _surface_shader_bsdf_eval_mis(
+      kg, sd, omega_in, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
+
+#if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  if (state->guiding.use_surface_guiding) {
+    const float guiding_sampling_prob = state->guiding.surface_guiding_sampling_prob;
+    const float bssrdf_sampling_prob = state->guiding.bssrdf_sampling_prob;
+    const float guide_pdf = guiding_bsdf_pdf(kg, state, omega_in);
+    pdf = (guiding_sampling_prob * guide_pdf * (1.0f - bssrdf_sampling_prob)) +
+          (1.0f - guiding_sampling_prob) * pdf;
+  }
+#endif
+  return pdf;
+}
+
+/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
+ccl_device_inline ccl_private const ShaderClosure *surface_shader_bsdf_bssrdf_pick(
+    ccl_private const ShaderData *ccl_restrict sd, ccl_private float2 *rand_bsdf)
+{
+  int sampled = 0;
+
+  if (sd->num_closure > 1) {
+    /* Pick a BSDF or BSSRDF based on sample weights. */
+    float sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+
+    float r = (*rand_bsdf).x * sum;
+    float partial_sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        float next_sum = partial_sum + sc->sample_weight;
+
+        if (r < next_sum) {
+          sampled = i;
+
+          /* Rescale to reuse for direction sample, to better preserve stratification. */
+          (*rand_bsdf).x = (r - partial_sum) / sc->sample_weight;
+          break;
+        }
+
+        partial_sum = next_sum;
+      }
+    }
+  }
+
+  return &sd->closure[sampled];
+}
+
+/* Return weight for picked BSSRDF. */
+ccl_device_inline Spectrum
+surface_shader_bssrdf_sample_weight(ccl_private const ShaderData *ccl_restrict sd,
+                                    ccl_private const ShaderClosure *ccl_restrict bssrdf_sc)
+{
+  /* With at most one closure the weight is returned unscaled. */
+  if (sd->num_closure <= 1) {
+    return bssrdf_sc->weight;
+  }
+
+  /* Scale by the total BSDF/BSSRDF sample weight over this closure's sample weight. */
+  float total_sample_weight = 0.0f;
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_OR_BSSRDF(closure->type)) {
+      total_sample_weight += closure->sample_weight;
+    }
+  }
+
+  return bssrdf_sc->weight * (total_sample_weight / bssrdf_sc->sample_weight);
+}
+
+#ifdef __PATH_GUIDING__
+/* Sample direction from either the guiding distribution or the picked BSDF, and
+ * return evaluation and pdf for all BSDFs combined using MIS. `unguided_bsdf_pdf`
+ * receives the pdf of the BSDFs alone, `bsdf_pdf` the pdf mixed with guiding. */
+ccl_device int surface_shader_bsdf_guided_sample_closure(KernelGlobals kg,
+                                                         IntegratorState state,
+                                                         ccl_private ShaderData *sd,
+                                                         ccl_private const ShaderClosure *sc,
+                                                         const float2 rand_bsdf,
+                                                         ccl_private BsdfEval *bsdf_eval,
+                                                         ccl_private float3 *omega_in,
+                                                         ccl_private float *bsdf_pdf,
+                                                         ccl_private float *unguided_bsdf_pdf,
+                                                         ccl_private float2 *sampled_roughness,
+                                                         ccl_private float *eta)
+{
+  /* BSSRDF should already have been handled elsewhere. */
+  kernel_assert(CLOSURE_IS_BSDF(sc->type));
+
+  const bool use_surface_guiding = state->guiding.use_surface_guiding;
+  const float guiding_sampling_prob = state->guiding.surface_guiding_sampling_prob;
+  const float bssrdf_sampling_prob = state->guiding.bssrdf_sampling_prob;
+
+  /* Decide between sampling guiding distribution and BSDF. */
+  bool sample_guiding = false;
+  float rand_bsdf_guiding = state->guiding.sample_surface_guiding_rand;
+
+  if (use_surface_guiding && rand_bsdf_guiding < guiding_sampling_prob) {
+    sample_guiding = true;
+    rand_bsdf_guiding /= guiding_sampling_prob;
+  }
+  else {
+    rand_bsdf_guiding -= guiding_sampling_prob;
+    rand_bsdf_guiding /= (1.0f - guiding_sampling_prob);
+  }
+
+  /* Initialize to zero. */
+  int label = LABEL_NONE;
+  Spectrum eval = zero_spectrum();
+  bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, eval);
+
+  *unguided_bsdf_pdf = 0.0f;
+  float guide_pdf = 0.0f;
+
+  if (sample_guiding) {
+    /* Sample guiding distribution. */
+    guide_pdf = guiding_bsdf_sample(kg, state, rand_bsdf, omega_in);
+    *bsdf_pdf = 0.0f;
+
+    if (guide_pdf != 0.0f) {
+      float unguided_bsdf_pdfs[MAX_CLOSURE];
+
+      *unguided_bsdf_pdf = surface_shader_bsdf_eval_pdfs(
+          kg, sd, *omega_in, bsdf_eval, unguided_bsdf_pdfs, 0);
+      *bsdf_pdf = (guiding_sampling_prob * guide_pdf * (1.0f - bssrdf_sampling_prob)) +
+                  ((1.0f - guiding_sampling_prob) * (*unguided_bsdf_pdf));
+      float sum_pdfs = 0.0f;
+
+      if (*unguided_bsdf_pdf > 0.0f) {
+        int idx = -1;
+        for (int i = 0; i < sd->num_closure; i++) {
+          sum_pdfs += unguided_bsdf_pdfs[i];
+          if (rand_bsdf_guiding <= sum_pdfs) {
+            idx = i;
+            break;
+          }
+        }
+
+        /* Set the default idx to the last in the list.
+         * in case of numerical problems and rand_bsdf_guiding is just >=1.0f and
+         * the sum of all unguided_bsdf_pdfs is just < 1.0f. */
+        idx = (rand_bsdf_guiding > sum_pdfs) ? sd->num_closure - 1 : idx;
+        kernel_assert(idx >= 0);
+
+        label = bsdf_label(kg, &sd->closure[idx], *omega_in);
+      }
+    }
+
+    kernel_assert(reduce_min(bsdf_eval_sum(bsdf_eval)) >= 0.0f);
+
+    *sampled_roughness = make_float2(1.0f, 1.0f);
+    *eta = 1.0f;
+  }
+  else {
+    /* Sample BSDF. */
+    *bsdf_pdf = 0.0f;
+    label = bsdf_sample(kg,
+                        sd,
+                        sc,
+                        rand_bsdf.x,
+                        rand_bsdf.y,
+                        &eval,
+                        omega_in,
+                        unguided_bsdf_pdf,
+                        sampled_roughness,
+                        eta);
+#  if 0
+    if (*unguided_bsdf_pdf > 0.0f) {
+      surface_shader_validate_bsdf_sample(kg, sc, *omega_in, label, *sampled_roughness, *eta);
+    }
+#  endif
+
+    if (*unguided_bsdf_pdf != 0.0f) {
+      bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);
+
+      kernel_assert(reduce_min(bsdf_eval_sum(bsdf_eval)) >= 0.0f);
+
+      if (sd->num_closure > 1) {
+        float sweight = sc->sample_weight;
+        *unguided_bsdf_pdf = _surface_shader_bsdf_eval_mis(
+            kg, sd, *omega_in, sc, bsdf_eval, (*unguided_bsdf_pdf) * sweight, sweight, 0);
+        kernel_assert(reduce_min(bsdf_eval_sum(bsdf_eval)) >= 0.0f);
+      }
+      *bsdf_pdf = *unguided_bsdf_pdf;
+
+      if (use_surface_guiding) {
+        guide_pdf = guiding_bsdf_pdf(kg, state, *omega_in);
+        *bsdf_pdf *= 1.0f - guiding_sampling_prob;
+        *bsdf_pdf += guiding_sampling_prob * guide_pdf * (1.0f - bssrdf_sampling_prob);
+      }
+    }
+
+    kernel_assert(reduce_min(bsdf_eval_sum(bsdf_eval)) >= 0.0f);
+  }
+
+  return label;
+}
+#endif
+
+/* Sample direction for picked BSDF, and return evaluation and pdf for all
+ * BSDFs combined using MIS. */
+ccl_device int surface_shader_bsdf_sample_closure(KernelGlobals kg,
+                                                  ccl_private ShaderData *sd,
+                                                  ccl_private const ShaderClosure *sc,
+                                                  const float2 rand_bsdf,
+                                                  ccl_private BsdfEval *bsdf_eval,
+                                                  ccl_private float3 *omega_in,
+                                                  ccl_private float *pdf,
+                                                  ccl_private float2 *sampled_roughness,
+                                                  ccl_private float *eta)
+{
+  /* BSSRDF should already have been handled elsewhere. */
+  kernel_assert(CLOSURE_IS_BSDF(sc->type));
+
+  Spectrum eval = zero_spectrum();
+  *pdf = 0.0f;
+  const int label = bsdf_sample(
+      kg, sd, sc, rand_bsdf.x, rand_bsdf.y, &eval, omega_in, pdf, sampled_roughness, eta);
+
+  if (*pdf == 0.0f) {
+    /* Zero pdf: store a zero evaluation for this closure type. */
+    bsdf_eval_init(bsdf_eval, sc->type, zero_spectrum());
+    return label;
+  }
+
+  bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);
+
+  /* With several closures, fold the remaining ones into the evaluation and pdf. */
+  if (sd->num_closure > 1) {
+    const float sweight = sc->sample_weight;
+    *pdf = _surface_shader_bsdf_eval_mis(
+        kg, sd, *omega_in, sc, bsdf_eval, *pdf * sweight, sweight, 0);
+  }
+
+  return label;
+}
+
+/* Average roughness of all BSDF closures, weighted by absolute average closure weight. */
+ccl_device float surface_shader_average_roughness(ccl_private const ShaderData *sd)
+{
+  float weighted_roughness = 0.0f;
+  float total_weight = 0.0f;
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (!CLOSURE_IS_BSDF(closure->type)) {
+      continue;
+    }
+    /* sqrt once to undo the squaring from multiplying roughness on the
+     * two axes, and once for the squared roughness convention. */
+    const float weight = fabsf(average(closure->weight));
+    weighted_roughness += weight * sqrtf(safe_sqrtf(bsdf_get_roughness_squared(closure)));
+    total_weight += weight;
+  }
+
+  return (total_weight > 0.0f) ? weighted_roughness / total_weight : 0.0f;
+}
+
+/* Transparency of the shader data: one for volume-only shaders, the transparent
+ * extinction when SD_TRANSPARENT is set, and zero otherwise. */
+ccl_device Spectrum surface_shader_transparency(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  if (sd->flag & SD_HAS_ONLY_VOLUME) {
+    return one_spectrum();
+  }
+  if (sd->flag & SD_TRANSPARENT) {
+    return sd->closure_transparent_extinction;
+  }
+  return zero_spectrum();
+}
+
+/* Zero the weights of all transparent BSDF closures and clear SD_TRANSPARENT. */
+ccl_device void surface_shader_disable_transparency(KernelGlobals kg, ccl_private ShaderData *sd)
+{
+  if (!(sd->flag & SD_TRANSPARENT)) {
+    return;
+  }
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private ShaderClosure *closure = &sd->closure[i];
+    if (closure->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+      closure->sample_weight = 0.0f;
+      closure->weight = zero_spectrum();
+    }
+  }
+  sd->flag &= ~SD_TRANSPARENT;
+}
+
+/* Alpha of the shader data: one minus its transparency, passed through
+ * saturate(). */
+ccl_device Spectrum surface_shader_alpha(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  const Spectrum transparency = surface_shader_transparency(kg, sd);
+
+  return saturate(one_spectrum() - transparency);
+}
+
+/* Sum of the weights of all diffuse BSDF and BSSRDF closures. */
+ccl_device Spectrum surface_shader_diffuse(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  Spectrum weight_sum = zero_spectrum();
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_DIFFUSE(closure->type) || CLOSURE_IS_BSSRDF(closure->type)) {
+      weight_sum += closure->weight;
+    }
+  }
+
+  return weight_sum;
+}
+
+/* Sum of the weights of all glossy BSDF closures. */
+ccl_device Spectrum surface_shader_glossy(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  Spectrum weight_sum = zero_spectrum();
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_GLOSSY(closure->type)) {
+      weight_sum += closure->weight;
+    }
+  }
+
+  return weight_sum;
+}
+
+/* Sum of the weights of all transmission BSDF closures. */
+ccl_device Spectrum surface_shader_transmission(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  Spectrum weight_sum = zero_spectrum();
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_TRANSMISSION(closure->type)) {
+      weight_sum += closure->weight;
+    }
+  }
+
+  return weight_sum;
+}
+
+/* Weighted average of the BSDF/BSSRDF closure normals, or sd->N when that is zero. */
+ccl_device float3 surface_shader_average_normal(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  float3 normal_sum = zero_float3();
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_OR_BSSRDF(closure->type)) {
+      normal_sum += closure->N * fabsf(average(closure->weight));
+    }
+  }
+  return (is_zero(normal_sum)) ? sd->N : normalize(normal_sum);
+}
+
+/* Sum the diffuse BSDF weights scaled by `ao_factor`, and write the weighted average
+ * normal of those closures (or sd->N when it is zero) to `N_`. */
+ccl_device Spectrum surface_shader_ao(KernelGlobals kg,
+                                      ccl_private const ShaderData *sd,
+                                      const float ao_factor,
+                                      ccl_private float3 *N_)
+{
+  Spectrum weight_sum = zero_spectrum();
+  float3 normal_sum = zero_float3();
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_DIFFUSE(closure->type)) {
+      ccl_private const DiffuseBsdf *diffuse = (ccl_private const DiffuseBsdf *)closure;
+      weight_sum += closure->weight * ao_factor;
+      normal_sum += diffuse->N * fabsf(average(closure->weight));
+    }
+  }
+
+  *N_ = (is_zero(normal_sum)) ? sd->N : normalize(normal_sum);
+  return weight_sum;
+}
+
+#ifdef __SUBSURFACE__
+/* Weighted average of the BSSRDF closure normals, or sd->N when that is zero. */
+ccl_device float3 surface_shader_bssrdf_normal(ccl_private const ShaderData *sd)
+{
+  float3 normal_sum = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *closure = &sd->closure[i];
+    if (!CLOSURE_IS_BSSRDF(closure->type)) {
+      continue;
+    }
+
+    ccl_private const Bssrdf *bssrdf = (ccl_private const Bssrdf *)closure;
+    normal_sum += bssrdf->N * fabsf(average(closure->weight));
+  }
+
+  return (is_zero(normal_sum)) ? sd->N : normalize(normal_sum);
+}
+#endif /* __SUBSURFACE__ */
+
+/* Constant emission optimization */
+
+/* Fetch the constant emission of a shader into `eval`. Returns false, leaving `eval`
+ * untouched, when the shader does not have SD_HAS_CONSTANT_EMISSION set. */
+ccl_device bool surface_shader_constant_emission(KernelGlobals kg,
+                                                 int shader,
+                                                 ccl_private Spectrum *eval)
+{
+  const int shader_index = shader & SHADER_MASK;
+  const int shader_flag = kernel_data_fetch(shaders, shader_index).flags;
+  if (!(shader_flag & SD_HAS_CONSTANT_EMISSION)) {
+    return false;
+  }
+
+  const float emission_r = kernel_data_fetch(shaders, shader_index).constant_emission[0];
+  const float emission_g = kernel_data_fetch(shaders, shader_index).constant_emission[1];
+  const float emission_b = kernel_data_fetch(shaders, shader_index).constant_emission[2];
+  *eval = rgb_to_spectrum(make_float3(emission_r, emission_g, emission_b));
+
+  return true;
+}
+
+/* Background */
+
+/* Return the emission/background closure value of the shader data, or zero
+ * when SD_EMISSION is not set. */
+ccl_device Spectrum surface_shader_background(ccl_private const ShaderData *sd)
+{
+  if (!(sd->flag & SD_EMISSION)) {
+    return zero_spectrum();
+  }
+  return sd->closure_emission_background;
+}
+
+/* Emission */
+
+/* Return the emission closure value scaled by emissive_simple_eval() for the
+ * geometric normal and incoming direction, or zero when SD_EMISSION is not set. */
+ccl_device Spectrum surface_shader_emission(ccl_private const ShaderData *sd)
+{
+  if (!(sd->flag & SD_EMISSION)) {
+    return zero_spectrum();
+  }
+  return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
+}
+
+/* Holdout */
+
+/* Compute the holdout weight of the shading point. For objects flagged as holdout
+ * this also retypes the non-transparent closures and clears their closure flags. */
+ccl_device Spectrum surface_shader_apply_holdout(KernelGlobals kg, ccl_private ShaderData *sd)
+{
+  Spectrum weight = zero_spectrum();
+  /* For objects marked as holdout, preserve transparency and remove all other
+   * closures, replacing them with a holdout weight. */
+  if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+    if ((sd->flag & SD_TRANSPARENT) && !(sd->flag & SD_HAS_ONLY_VOLUME)) {
+      weight = one_spectrum() - sd->closure_transparent_extinction;
+
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+        if (!CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+          sc->type = NBUILTIN_CLOSURES;
+        }
+      }
+      sd->flag &= ~(SD_CLOSURE_FLAGS - (SD_TRANSPARENT | SD_BSDF));
+    }
+    else {
+      weight = one_spectrum();
+    }
+  }
+  else {
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_HOLDOUT(sc->type)) {
+        weight += sc->weight;
+      }
+    }
+  }
+
+  return weight;
+}
+
+/* Surface Evaluation */
+
+/* Evaluate the surface or background shader for `sd` through OSL or SVM, resetting
+ * its closure count first. Without SVM a fixed 0.8 emission or diffuse BSDF is used. */
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
+ccl_device void surface_shader_eval(KernelGlobals kg,
+                                    ConstIntegratorGenericState state,
+                                    ccl_private ShaderData *ccl_restrict sd,
+                                    ccl_global float *ccl_restrict buffer,
+                                    uint32_t path_flag,
+                                    bool use_caustics_storage = false)
+{
+  /* If path is being terminated, we are tracing a shadow ray or evaluating
+   * emission, then we don't need to store closures. The emission and shadow
+   * shader data also do not have a closure array to save GPU memory. */
+  int max_closures;
+  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+    max_closures = 0;
+  }
+  else {
+    max_closures = use_caustics_storage ? CAUSTICS_MAX_CLOSURE : kernel_data.max_closures;
+  }
+  sd->num_closure = 0;
+  sd->num_closure_left = max_closures;
+#ifdef __OSL__
+  if (kg->osl) {
+    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
+      OSLShader::eval_background(kg, state, sd, path_flag);
+    }
+    else {
+      OSLShader::eval_surface(kg, state, sd, path_flag);
+    }
+  }
+  else
+#endif
+  {
+#ifdef __SVM__
+    svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(kg, state, sd, buffer, path_flag);
+#else
+    if (sd->object == OBJECT_NONE) {
+      sd->closure_emission_background = make_spectrum(0.8f);
+      sd->flag |= SD_EMISSION;
+    }
+    else {
+      ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
+          sd, sizeof(DiffuseBsdf), make_spectrum(0.8f));
+      if (bsdf != NULL) {
+        bsdf->N = sd->N;
+        sd->flag |= bsdf_diffuse_setup(bsdf);
+      }
+    }
+#endif
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/volume_shader.h b/intern/cycles/kernel/integrator/volume_shader.h
new file mode 100644
index 00000000000..0ff968723a1
--- /dev/null
+++ b/intern/cycles/kernel/integrator/volume_shader.h
@@ -0,0 +1,519 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+/* Volume shader evaluation and sampling. */
+
+#pragma once
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/emissive.h"
+
+#ifdef __SVM__
+#  include "kernel/svm/svm.h"
+#endif
+#ifdef __OSL__
+#  include "kernel/osl/osl.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME__
+
+/* Merging */
+
+ccl_device_inline void volume_shader_merge_closures(ccl_private ShaderData *sd)
+{
+  /* Fold Henyey-Greenstein closures that share the same anisotropy `g` into one closure
+   * by summing their weights, then compact the closure array. With stacked volumes this
+   * keeps fewer entries of the closure array in use. */
+  for (int first = 0; first < sd->num_closure; first++) {
+    ccl_private ShaderClosure *kept = &sd->closure[first];
+    if (kept->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+      continue;
+    }
+
+    /* Scan the closures after `kept`. The index only advances when nothing was removed,
+     * because a removal moves the next candidate into the current slot. */
+    int other = first + 1;
+    while (other < sd->num_closure) {
+      ccl_private ShaderClosure *candidate = &sd->closure[other];
+
+      bool mergeable = (candidate->type == kept->type);
+      if (mergeable) {
+        ccl_private const HenyeyGreensteinVolume *kept_hg =
+            (ccl_private const HenyeyGreensteinVolume *)kept;
+        ccl_private const HenyeyGreensteinVolume *candidate_hg =
+            (ccl_private const HenyeyGreensteinVolume *)candidate;
+        mergeable = (kept_hg->g == candidate_hg->g);
+      }
+
+      if (!mergeable) {
+        other++;
+        continue;
+      }
+
+      /* Accumulate the duplicate into the closure that is kept. */
+      kept->weight += candidate->weight;
+      kept->sample_weight += candidate->sample_weight;
+
+      /* Close the gap left by the duplicate by moving the tail of the array down. */
+      for (int k = other; k + 1 < sd->num_closure; k++) {
+        sd->closure[k] = sd->closure[k + 1];
+      }
+
+      sd->num_closure--;
+      kernel_assert(sd->num_closure >= 0);
+    }
+  }
+}
+
+ccl_device_inline void volume_shader_copy_phases(ccl_private ShaderVolumePhases *ccl_restrict
+                                                     phases,
+                                                 ccl_private const ShaderData *ccl_restrict sd)
+{
+  /* Copy weight, sample weight and anisotropy of every Henyey-Greenstein closure in `sd`
+   * into the `phases` array, stopping once MAX_VOLUME_CLOSURE entries have been written.
+   * Closures of any other type are ignored. */
+  phases->num_closure = 0;
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *src = &sd->closure[i];
+    if (src->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+      continue;
+    }
+
+    ccl_private const HenyeyGreensteinVolume *src_hg =
+        (ccl_private const HenyeyGreensteinVolume *)src;
+    ccl_private ShaderVolumeClosure *dst = &phases->closure[phases->num_closure];
+
+    dst->weight = src->weight;
+    dst->sample_weight = src->sample_weight;
+    dst->g = src_hg->g;
+    phases->num_closure++;
+
+    /* The destination array is full. */
+    if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
+      break;
+    }
+  }
+}
+
+/* Guiding */
+
+#  ifdef __PATH_GUIDING__
+ccl_device_inline void volume_shader_prepare_guiding(KernelGlobals kg,
+                                                     IntegratorState state,
+                                                     ccl_private ShaderData *sd,
+                                                     ccl_private const RNGState *rng_state,
+                                                     const float3 P,
+                                                     const float3 D,
+                                                     ccl_private ShaderVolumePhases *phases,
+                                                     const VolumeSampleMethod direct_sample_method)
+{
+  /* Decide whether volume guiding is used for this scatter event and record the result
+   * in `state->guiding`. When several phase functions are present, one is chosen with
+   * probability proportional to its sample weight, and its sample weight in `phases` is
+   * scaled by the guiding probability. */
+
+  /* Guiding needs the integrator option enabled and at least one phase function. */
+  const int num_phases = phases->num_closure;
+  if (num_phases == 0 || !kernel_data.integrator.use_volume_guiding) {
+    state->guiding.use_volume_guiding = false;
+    return;
+  }
+
+  const float guiding_probability = kernel_data.integrator.volume_guiding_probability;
+  float rand_guiding = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_PHASE_GUIDING);
+
+  /* Index of the phase function used for guiding and the probability of selecting it. */
+  int selected = 0;
+  float selection_prob = 1.0f;
+
+  if (num_phases > 1) {
+    /* Total sample weight over all phase functions. */
+    float total_weight = 0.0f;
+    for (int i = 0; i < num_phases; i++) {
+      total_weight += phases->closure[i].sample_weight;
+    }
+
+    /* Walk the running sum of weights until it reaches the random target. */
+    const float target = rand_guiding * total_weight;
+    float lower = 0.0f;
+
+    for (selected = 0; selected < num_phases; selected++) {
+      const float weight = phases->closure[selected].sample_weight;
+      const float upper = lower + weight;
+
+      if (target <= upper) {
+        /* Remap the random number into the selected interval so it can be reused. */
+        rand_guiding = (target - lower) / weight;
+        selection_prob = weight / total_weight;
+        break;
+      }
+
+      lower = upper;
+    }
+
+    /* Scale the sample weight of the phase function that will be guided. */
+    phases->closure[selected].sample_weight *= guiding_probability;
+  }
+
+  /* Initialize guiding with the anisotropy of the selected phase function. */
+  const float selected_g = phases->closure[selected].g;
+  if (!guiding_phase_init(kg, state, P, D, selected_g, rand_guiding)) {
+    state->guiding.use_volume_guiding = false;
+    return;
+  }
+
+  state->guiding.use_volume_guiding = true;
+  state->guiding.sample_volume_guiding_rand = rand_guiding;
+  state->guiding.volume_guiding_sampling_prob = guiding_probability * selection_prob;
+
+  kernel_assert(state->guiding.volume_guiding_sampling_prob > 0.0f &&
+                state->guiding.volume_guiding_sampling_prob <= 1.0f);
+}
+#  endif
+
+/* Phase Evaluation & Sampling */
+
+/* Randomly pick one volume phase closure with probability proportional to its
+ * `sample_weight`.
+ *
+ * `rand_phase->x` drives the selection. When there is more than one closure and an
+ * interval matches, it is rescaled in place to the position inside the selected closure's
+ * weight interval, so the caller can reuse it for sampling the phase direction.
+ *
+ * Returns a pointer to the selected entry of `phases->closure`. Entry 0 is returned when
+ * there is at most one closure, or when no weight interval matched. */
+ccl_device_inline ccl_private const ShaderVolumeClosure *volume_shader_phase_pick(
+    ccl_private const ShaderVolumePhases *phases, ccl_private float2 *rand_phase)
+{
+  int sampled = 0;
+
+  if (phases->num_closure > 1) {
+    /* Total sample weight over all closures. Each closure must be indexed by the loop
+     * counter: indexing by `sampled` (still 0 here) would add the first weight
+     * `num_closure` times and skew the selection probabilities. */
+    float sum = 0.0f;
+
+    for (int i = 0; i < phases->num_closure; i++) {
+      ccl_private const ShaderVolumeClosure *svc = &phases->closure[i];
+      sum += svc->sample_weight;
+    }
+
+    /* Walk the running sum of weights until it reaches the random target. */
+    float r = (*rand_phase).x * sum;
+    float partial_sum = 0.0f;
+
+    for (int i = 0; i < phases->num_closure; i++) {
+      ccl_private const ShaderVolumeClosure *svc = &phases->closure[i];
+      float next_sum = partial_sum + svc->sample_weight;
+
+      if (r <= next_sum) {
+        /* Rescale to reuse for volume phase direction sample. */
+        sampled = i;
+        (*rand_phase).x = (r - partial_sum) / svc->sample_weight;
+        break;
+      }
+
+      partial_sum = next_sum;
+    }
+  }
+
+  /* todo: this isn't quite correct, we don't weight anisotropy properly
+   * depending on color channels, even if this is perhaps not a common case */
+  return &phases->closure[sampled];
+}
+
+ccl_device_inline float _volume_shader_phase_eval_mis(ccl_private const ShaderData *sd,
+                                                      ccl_private const ShaderVolumePhases *phases,
+                                                      const float3 omega_in,
+                                                      int skip_phase,
+                                                      ccl_private BsdfEval *result_eval,
+                                                      float sum_pdf,
+                                                      float sum_sample_weight)
+{
+  /* Evaluate every phase closure except the one at index `skip_phase` for direction
+   * `omega_in`. Evaluations with a non-zero PDF are accumulated into `result_eval`.
+   * The return value is the sample-weight weighted average of the PDFs, continuing from
+   * the incoming `sum_pdf` and `sum_sample_weight`, or zero when the total weight is not
+   * positive. */
+  for (int i = 0; i < phases->num_closure; i++) {
+    if (i != skip_phase) {
+      ccl_private const ShaderVolumeClosure *closure = &phases->closure[i];
+
+      float closure_pdf = 0.0f;
+      const Spectrum closure_eval = volume_phase_eval(sd, closure, omega_in, &closure_pdf);
+
+      if (closure_pdf != 0.0f) {
+        bsdf_eval_accum(result_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, closure_eval);
+        sum_pdf += closure_pdf * closure->sample_weight;
+      }
+
+      /* The weight counts towards the normalization even when the PDF is zero. */
+      sum_sample_weight += closure->sample_weight;
+    }
+  }
+
+  if (sum_sample_weight > 0.0f) {
+    return sum_pdf / sum_sample_weight;
+  }
+  return 0.0f;
+}
+
+ccl_device float volume_shader_phase_eval(KernelGlobals kg,
+                                          ccl_private const ShaderData *sd,
+                                          ccl_private const ShaderVolumeClosure *svc,
+                                          const float3 omega_in,
+                                          ccl_private BsdfEval *phase_eval)
+{
+  /* Evaluate the single phase closure `svc` for direction `omega_in` and return its PDF.
+   * The evaluation is accumulated into `phase_eval` only when the PDF is non-zero. */
+  float pdf = 0.0f;
+  const Spectrum value = volume_phase_eval(sd, svc, omega_in, &pdf);
+
+  if (pdf == 0.0f) {
+    return pdf;
+  }
+
+  bsdf_eval_accum(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, value);
+  return pdf;
+}
+
+ccl_device float volume_shader_phase_eval(KernelGlobals kg,
+                                          IntegratorState state,
+                                          ccl_private const ShaderData *sd,
+                                          ccl_private const ShaderVolumePhases *phases,
+                                          const float3 omega_in,
+                                          ccl_private BsdfEval *phase_eval)
+{
+  /* Evaluate all phase closures for direction `omega_in`. `phase_eval` is reset and then
+   * receives the accumulated evaluation; the returned PDF is the weighted average over
+   * the closures, blended with the guiding PDF when volume guiding is active. */
+  bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_spectrum());
+
+  /* A skip index of -1 matches no closure, so every closure is evaluated. */
+  const float phase_pdf = _volume_shader_phase_eval_mis(
+      sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
+
+#  if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+  if (state->guiding.use_volume_guiding) {
+    const float guided_fraction = state->guiding.volume_guiding_sampling_prob;
+    const float guide_pdf = guiding_phase_pdf(kg, state, omega_in);
+    return (guided_fraction * guide_pdf) + (1.0f - guided_fraction) * phase_pdf;
+  }
+#  endif
+
+  return phase_pdf;
+}
+
+#  ifdef __PATH_GUIDING__
+/* Sample a scatter direction either from the guiding distribution or from the phase
+ * closure `svc`, chosen using the guiding state stored in `state->guiding`.
+ *
+ * Outputs:
+ * - `phase_eval`: evaluation of the phase function for the sampled direction.
+ * - `omega_in`: the sampled direction.
+ * - `phase_pdf`: PDF of the combined strategy (guiding and phase sampling blended by the
+ *   guiding sampling probability when guiding is enabled).
+ * - `unguided_phase_pdf`: PDF of the phase function alone.
+ * - `sampled_roughness`: `1 - |g|` of the closure.
+ *
+ * Returns the scatter label, or LABEL_NONE when the guiding sample has a zero PDF. */
+ccl_device int volume_shader_phase_guided_sample(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 ccl_private const ShaderData *sd,
+                                                 ccl_private const ShaderVolumeClosure *svc,
+                                                 const float2 rand_phase,
+                                                 ccl_private BsdfEval *phase_eval,
+                                                 ccl_private float3 *omega_in,
+                                                 ccl_private float *phase_pdf,
+                                                 ccl_private float *unguided_phase_pdf,
+                                                 ccl_private float *sampled_roughness)
+{
+  const bool use_volume_guiding = state->guiding.use_volume_guiding;
+  const float guiding_sampling_prob = state->guiding.volume_guiding_sampling_prob;
+
+  /* Decide between sampling guiding distribution and phase. */
+  /* NOTE(review): the rescaled `rand_phase_guiding` is not read again within this
+   * function after the decision below; confirm whether the rescale is still needed. */
+  float rand_phase_guiding = state->guiding.sample_volume_guiding_rand;
+  bool sample_guiding = false;
+  if (use_volume_guiding && rand_phase_guiding < guiding_sampling_prob) {
+    sample_guiding = true;
+    rand_phase_guiding /= guiding_sampling_prob;
+  }
+  else {
+    rand_phase_guiding -= guiding_sampling_prob;
+    rand_phase_guiding /= (1.0f - guiding_sampling_prob);
+  }
+
+  /* Initialize to zero. */
+  int label = LABEL_NONE;
+  Spectrum eval = zero_spectrum();
+
+  *unguided_phase_pdf = 0.0f;
+  float guide_pdf = 0.0f;
+  *sampled_roughness = 1.0f - fabsf(svc->g);
+
+  bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_spectrum());
+
+  if (sample_guiding) {
+    /* Sample guiding distribution. */
+    guide_pdf = guiding_phase_sample(kg, state, rand_phase, omega_in);
+    *phase_pdf = 0.0f;
+
+    if (guide_pdf != 0.0f) {
+      /* Evaluate the phase function for the guided direction, then blend both PDFs by
+       * the probability of having chosen each strategy. */
+      *unguided_phase_pdf = volume_shader_phase_eval(kg, sd, svc, *omega_in, phase_eval);
+      *phase_pdf = (guiding_sampling_prob * guide_pdf) +
+                   ((1.0f - guiding_sampling_prob) * (*unguided_phase_pdf));
+      label = LABEL_VOLUME_SCATTER;
+    }
+  }
+  else {
+    /* Sample phase. */
+    *phase_pdf = 0.0f;
+    label = volume_phase_sample(
+        sd, svc, rand_phase.x, rand_phase.y, &eval, omega_in, unguided_phase_pdf);
+
+    if (*unguided_phase_pdf != 0.0f) {
+      bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
+
+      /* Without guiding the combined PDF is just the phase PDF; with guiding it is the
+       * same blend as used in the guided branch. */
+      *phase_pdf = *unguided_phase_pdf;
+      if (use_volume_guiding) {
+        guide_pdf = guiding_phase_pdf(kg, state, *omega_in);
+        *phase_pdf *= 1.0f - guiding_sampling_prob;
+        *phase_pdf += guiding_sampling_prob * guide_pdf;
+      }
+
+      kernel_assert(reduce_min(bsdf_eval_sum(phase_eval)) >= 0.0f);
+    }
+    else {
+      /* Failed phase sample: leave a zero evaluation. */
+      bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_spectrum());
+    }
+
+    /* NOTE(review): this repeats the assert from the successful branch above and also
+     * covers the zero-evaluation case. */
+    kernel_assert(reduce_min(bsdf_eval_sum(phase_eval)) >= 0.0f);
+  }
+
+  return label;
+}
+#  endif
+
+ccl_device int volume_shader_phase_sample(KernelGlobals kg,
+                                          ccl_private const ShaderData *sd,
+                                          ccl_private const ShaderVolumePhases *phases,
+                                          ccl_private const ShaderVolumeClosure *svc,
+                                          float2 rand_phase,
+                                          ccl_private BsdfEval *phase_eval,
+                                          ccl_private float3 *omega_in,
+                                          ccl_private float *pdf,
+                                          ccl_private float *sampled_roughness)
+{
+  /* Sample a direction from the phase closure `svc` without guiding. `sampled_roughness`
+   * is set to `1 - |g|`, `pdf` and `omega_in` receive the sample, and `phase_eval` is
+   * initialized only when the sampled PDF is non-zero. Returns the sampling label. */
+  *sampled_roughness = 1.0f - fabsf(svc->g);
+  *pdf = 0.0f;
+
+  Spectrum sampled_eval = zero_spectrum();
+  const int label = volume_phase_sample(
+      sd, svc, rand_phase.x, rand_phase.y, &sampled_eval, omega_in, pdf);
+
+  const bool valid_sample = (*pdf != 0.0f);
+  if (valid_sample) {
+    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, sampled_eval);
+  }
+
+  return label;
+}
+
+/* Motion Blur */
+
+#  ifdef __OBJECT_MOTION__
+ccl_device_inline void volume_shader_motion_blur(KernelGlobals kg,
+                                                 ccl_private ShaderData *ccl_restrict sd)
+{
+  /* Objects without volume motion keep their shading position unchanged. */
+  if (!(sd->object_flag & SD_OBJECT_HAS_VOLUME_MOTION)) {
+    return;
+  }
+
+  AttributeDescriptor velocity_desc = find_attribute(kg, sd, ATTR_STD_VOLUME_VELOCITY);
+  kernel_assert(velocity_desc.offset != ATTR_STD_NOT_FOUND);
+
+  /* Position before advection; both advection steps below start from it. */
+  const float3 start_P = sd->P;
+  const float velocity_scale = kernel_data_fetch(objects, sd->object).velocity_scale;
+
+  /* Time offset and ray time depend on where the camera places the motion position. */
+  float time_offset = 0.0f;
+  if (kernel_data.cam.motion_position == MOTION_POSITION_CENTER) {
+    time_offset = 0.5f;
+  }
+
+  float time = sd->time;
+  if (kernel_data.cam.motion_position == MOTION_POSITION_END) {
+    time = (1.0f - kernel_data.cam.shuttertime) + sd->time;
+  }
+
+  /* Estimate the volume quantity that existed, or will exist, at the ray time with a
+   * first order semi-lagrangian advection scheme:
+   *
+   * `phi(x, T) = phi(x - (T - t) * u(x, T), t)`
+   *
+   * with
+   *
+   * x : position
+   * T : super-sampled time (or ray time)
+   * t : current time of the simulation (for rendering assumed to be the center frame,
+   * with relative time = 0)
+   * phi : the volume quantity
+   * u : the velocity field
+   *
+   * The velocity field `u(x, T)` itself is unknown and is estimated with the same
+   * scheme:
+   *
+   * `u(x, T) = u(x - (T - t) * u(x, T), t)`
+   *
+   * This is the usual model for self-advection in fluid dynamics. Other forces acting on
+   * the velocity during simulation (pressure, buoyancy, etc.) are not accounted for, so
+   * the result is a linear interpolation even though fluids mostly move along curves.
+   * Better results are possible with a higher order interpolation scheme (more lookups),
+   * or by interpolating the velocity fields of the previous and next frames (more memory
+   * and lookups).
+   *
+   * References:
+   * "Eulerian Motion Blur", Kim and Ko, 2007
+   * "Production Volume Rendering", Wrenninge et al., 2012
+   */
+  const float advection_scale = (time - time_offset) * velocity_scale;
+
+  /* Two passes: the first looks up the velocity at the original position and advects,
+   * the second looks up the velocity at the advected position and advects the original
+   * position again with that velocity. */
+  for (int pass = 0; pass < 2; pass++) {
+    float3 velocity = primitive_volume_attribute_float3(kg, sd, velocity_desc);
+    object_dir_transform(kg, sd, &velocity);
+
+    sd->P = start_P - advection_scale * velocity;
+  }
+}
+#  endif
+
+/* Volume Evaluation */
+
+template<const bool shadow, typename StackReadOp, typename ConstIntegratorGenericState>
+ccl_device_inline void volume_shader_eval(KernelGlobals kg,
+                                          ConstIntegratorGenericState state,
+                                          ccl_private ShaderData *ccl_restrict sd,
+                                          const uint32_t path_flag,
+                                          StackReadOp stack_read)
+{
+  /* Closures are not stored when the path is being terminated, for shadow rays and for
+   * emission evaluation. The emission and shadow shader data also have no closure array,
+   * to save GPU memory. */
+  const bool skip_closures = (path_flag &
+                              (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) != 0;
+  const int max_closures = skip_closures ? 0 : kernel_data.max_closures;
+
+  /* Reset once up front: the closures of all volumes in the stack are accumulated into
+   * one shared closure array. */
+  sd->num_closure = 0;
+  sd->num_closure_left = max_closures;
+  sd->flag = 0;
+  sd->object_flag = 0;
+
+  /* Walk the volume stack until the SHADER_NONE terminator entry. */
+  for (int stack_index = 0;; stack_index++) {
+    const VolumeStack volume = stack_read(stack_index);
+    if (volume.shader == SHADER_NONE) {
+      break;
+    }
+
+    /* Switch the shader data over to this stack entry. Most of it was already set up in
+     * shader_setup_from_volume, so this should be quick. */
+    sd->object = volume.object;
+    sd->lamp = LAMP_NONE;
+    sd->shader = volume.shader;
+
+    /* Replace the shader flags and clear the object flags of the previous entry. */
+    sd->flag &= ~SD_SHADER_FLAGS;
+    sd->flag |= kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).flags;
+    sd->object_flag &= ~SD_OBJECT_FLAGS;
+
+    if (sd->object != OBJECT_NONE) {
+      sd->object_flag |= kernel_data_fetch(object_flag, sd->object);
+
+#  ifdef __OBJECT_MOTION__
+      /* todo: inefficient for motion blur, the matrices should be cached instead of
+       * being recomputed at every step */
+      shader_setup_object_transforms(kg, sd, sd->time);
+
+      volume_shader_motion_blur(kg, sd);
+#  endif
+    }
+
+    /* Run the volume shader of this entry. */
+#  ifdef __OSL__
+    if (kg->osl) {
+      OSLShader::eval_volume(kg, state, sd, path_flag);
+    }
+    else
+#  endif
+    {
+#  ifdef __SVM__
+      svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
+          kg, state, sd, NULL, path_flag);
+#  endif
+    }
+
+    /* From the second entry on, merge closures so the closure limit is not exceeded.
+     * Skipped for shadow evaluation. */
+    if (!shadow && stack_index > 0) {
+      volume_shader_merge_closures(sd);
+    }
+  }
+}
+
+#endif /* __VOLUME__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/volume_stack.h b/intern/cycles/kernel/integrator/volume_stack.h
index 97a0f0f386c..675e1927fc0 100644
--- a/intern/cycles/kernel/integrator/volume_stack.h
+++ b/intern/cycles/kernel/integrator/volume_stack.h
@@ -39,7 +39,7 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
         break;
       }
 
-      if (entry.object == sd->object) {
+      if (entry.object == sd->object && entry.shader == sd->shader) {
         /* Shift back next stack entries. */
         do {
           entry = stack_read(i + 1);
@@ -61,7 +61,7 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
       }
 
       /* Already in the stack? then we have nothing to do. */
-      if (entry.object == sd->object) {
+      if (entry.object == sd->object && entry.shader == sd->shader) {
         return;
       }
     }