30 files changed, 550 insertions, 277 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index bc380f269ad..18aa5dda70d 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -52,7 +52,11 @@ static size_t estimate_single_state_size()
  * For until then use common value. Currently this size is only used for logging, but is weak to
  * rely on this. */
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
+
 #include "kernel/integrator/integrator_state_template.h"
+
+#include "kernel/integrator/integrator_shadow_state_template.h"
+
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
@@ -74,6 +78,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
       integrator_shader_raytrace_sort_counter_(
           device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+      integrator_next_shadow_path_index_(
+          device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
       integrator_next_shadow_catcher_path_index_(
           device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
       queued_paths_(device, "queued_paths", MEM_READ_WRITE),
@@ -138,7 +144,11 @@ void PathTraceWorkGPU::alloc_integrator_soa()
   } \
   }
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
+
 #include "kernel/integrator/integrator_state_template.h"
+
+#include "kernel/integrator/integrator_shadow_state_template.h"
+
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
@@ -199,16 +209,22 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
 
 void PathTraceWorkGPU::alloc_integrator_path_split()
 {
-  if (integrator_next_shadow_catcher_path_index_.size() != 0) {
-    return;
+  if (integrator_next_shadow_path_index_.size() == 0) {
+    integrator_next_shadow_path_index_.alloc(1);
+    integrator_next_shadow_path_index_.zero_to_device();
+
+    integrator_state_gpu_.next_shadow_path_index =
+        (int *)integrator_next_shadow_path_index_.device_pointer;
   }
 
-  integrator_next_shadow_catcher_path_index_.alloc(1);
-  /* TODO(sergey): Use queue? */
-  integrator_next_shadow_catcher_path_index_.zero_to_device();
+  if (integrator_next_shadow_catcher_path_index_.size() == 0) {
+    integrator_next_shadow_catcher_path_index_.alloc(1);
+    integrator_next_shadow_path_index_.data()[0] = 0;
+    integrator_next_shadow_catcher_path_index_.zero_to_device();
 
-  integrator_state_gpu_.next_shadow_catcher_path_index =
-      (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+    integrator_state_gpu_.next_shadow_catcher_path_index =
+        (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+  }
 }
 
 void PathTraceWorkGPU::alloc_work_memory()
@@ -341,27 +357,45 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
     return false;
   }
 
-  /* Finish shadows before potentially adding more shadow rays. We can only
-   * store one shadow ray in the integrator state.
+  /* If the number of shadow kernels dropped to zero, set the next shadow path
+   * index to zero as well.
    *
-   * When there is a shadow catcher in the scene finish shadow rays before invoking intersect
-   * closest kernel since so that the shadow paths are writing to the pre-split state. */
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
-      (has_shadow_catcher() && kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST)) {
-    if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
-      enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
-      return true;
+   * TODO: use shadow path compaction to lower it more often instead of letting
+   * it fill up entirely? */
+  const int num_queued_shadow =
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
+  if (num_queued_shadow == 0) {
+    if (integrator_next_shadow_path_index_.data()[0] != 0) {
+      integrator_next_shadow_path_index_.data()[0] = 0;
+      queue_->copy_to_device(integrator_next_shadow_path_index_);
     }
-    else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
-      enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
-      return true;
+  }
+
+  /* For kernels that add shadow paths, check if there is enough space available.
+   * If not, schedule shadow kernels first to clear out the shadow paths. */
+  if (kernel_creates_shadow_paths(kernel)) {
+    if (max_num_paths_ - integrator_next_shadow_path_index_.data()[0] <
+        queue_counter->num_queued[kernel]) {
+      if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+        enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+        return true;
+      }
+      else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+        enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+        return true;
+      }
     }
   }
 
   /* Schedule kernel with maximum number of queued items. */
   enqueue_path_iteration(kernel);
+
+  /* Update next shadow path index for kernels that can add shadow paths. */
+  if (kernel_creates_shadow_paths(kernel)) {
+    queue_->copy_from_device(integrator_next_shadow_path_index_);
+  }
+
   return true;
 }
 
@@ -370,13 +404,12 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
   void *d_path_index = (void *)NULL;
 
   /* Create array of path indices for which this kernel is queued to be executed. */
-  int work_size = max_active_path_index_;
+  int work_size = kernel_max_active_path_index(kernel);
 
   IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
   int num_queued = queue_counter->num_queued[kernel];
 
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+  if (kernel_uses_sorting(kernel)) {
     /* Compute array of active paths, sorted by shader. */
     work_size = num_queued;
     d_path_index = (void *)queued_paths_.device_pointer;
@@ -387,8 +420,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     work_size = num_queued;
     d_path_index = (void *)queued_paths_.device_pointer;
 
-    if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-        kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+    if (kernel_is_shadow_path(kernel)) {
       /* Compute array of active shadow paths for specific kernel. */
       compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
     }
@@ -452,7 +484,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
   {
     /* TODO: this could be smaller for terminated paths based on amount of work we want
      * to schedule. */
-    const int work_size = max_active_path_index_;
+    const int work_size = kernel_max_active_path_index(queued_kernel);
 
     void *d_queued_paths = (void *)queued_paths_.device_pointer;
     void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
@@ -481,7 +513,7 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
   int d_queued_kernel = queued_kernel;
 
   /* Launch kernel to fill the active paths arrays. */
-  const int work_size = max_active_path_index_;
+  const int work_size = kernel_max_active_path_index(queued_kernel);
   void *d_queued_paths = (void *)queued_paths_.device_pointer;
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
   void *args[] = {
@@ -981,4 +1013,29 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
   return num_queued_paths_.data()[0];
 }
 
+bool PathTraceWorkGPU::kernel_uses_sorting(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
+bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+}
+
+bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+}
+
+int PathTraceWorkGPU::kernel_max_active_path_index(DeviceKernel kernel)
+{
+  return (kernel_is_shadow_path(kernel)) ? integrator_next_shadow_path_index_.data()[0] :
+                                           max_active_path_index_;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index e66851cc8d8..dd2c1c197ae 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -113,6 +113,12 @@ class PathTraceWorkGPU : public PathTraceWork {
   /* Count how many currently scheduled paths can still split. */
   int shadow_catcher_count_possible_splits();
 
+  /* Kernel properties. */
+  bool kernel_uses_sorting(DeviceKernel kernel);
+  bool kernel_creates_shadow_paths(DeviceKernel kernel);
+  bool kernel_is_shadow_path(DeviceKernel kernel);
+  int kernel_max_active_path_index(DeviceKernel kernel);
+
   /* Integrator queue. */
   unique_ptr<DeviceQueue> queue_;
 
@@ -131,6 +137,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   device_vector<int> integrator_shader_sort_counter_;
   device_vector<int> integrator_shader_raytrace_sort_counter_;
   /* Path split. */
+  device_vector<int> integrator_next_shadow_path_index_;
   device_vector<int> integrator_next_shadow_catcher_path_index_;
 
   /* Temporary buffer to get an array of queued path for a particular kernel. */
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index e0d48361650..7357c5804ed 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -236,6 +236,7 @@ set(SRC_INTEGRATOR_HEADERS
   integrator/integrator_shade_shadow.h
   integrator/integrator_shade_surface.h
   integrator/integrator_shade_volume.h
+  integrator/integrator_shadow_state_template.h
   integrator/integrator_state.h
   integrator/integrator_state_flow.h
   integrator/integrator_state_template.h
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index bf8667ac045..2b0eea4fb61 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -69,6 +69,18 @@ CCL_NAMESPACE_BEGIN
 #  define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
 #endif
 
+/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
+ * that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+                                                    IntegratorStateCPU *state, \
+                                                    KernelWorkTile *tile, \
+                                                    ccl_global float *render_buffer) \
+  { \
+    return KERNEL_INVOKE( \
+        name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+  }
+
 #define DEFINE_INTEGRATOR_KERNEL(name) \
   void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
                                                     IntegratorStateCPU *state) \
@@ -83,30 +95,32 @@ CCL_NAMESPACE_BEGIN
     KERNEL_INVOKE(name, kg, state, render_buffer); \
   }
 
-/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
- * that it does not contain unused fields. */
-#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
-  bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
-                                                    IntegratorStateCPU *state, \
-                                                    KernelWorkTile *tile, \
-                                                    ccl_global float *render_buffer) \
+#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \
+                                                    IntegratorStateCPU *state) \
   { \
-    return KERNEL_INVOKE( \
-        name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+    KERNEL_INVOKE(name, kg, &state->shadow); \
+  }
+
+#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \
+  void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+      const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+  { \
+    KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \
   }
 
 DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
 DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
 DEFINE_INTEGRATOR_KERNEL(intersect_closest)
-DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
 DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
 DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
 DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
 DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
-DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
 DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
 DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
 DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow)
 
 /* --------------------------------------------------------------------
  * Shader evaluation.
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index b5ecab2a4db..6b4d79ed5b7 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -265,8 +265,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
 {
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) != 0) ||
-               (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
+        return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
       });
 }
 
@@ -278,8 +277,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
 {
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices + indices_offset, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) == 0) &&
-               (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+        return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
       });
 }
 
@@ -303,9 +301,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
 {
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
       num_states, indices, num_indices, [num_active_paths](const int state) {
-        return (state >= num_active_paths) &&
-               ((INTEGRATOR_STATE(state, path, queued_kernel) != 0) ||
-                (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0));
+        return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
       });
 }
 
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
index 317ea76553a..ef8dcb50115 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -136,13 +136,6 @@ ccl_device_forceinline void integrator_intersect_shader_next_kernel(
     else {
       INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
     }
-
-    /* If the split happened after bounce through a transparent object it's possible to have shadow
-     * patch. Make sure it is properly re-scheduled on the split path. */
-    const int shadow_kernel = INTEGRATOR_STATE(state, shadow_path, queued_kernel);
-    if (shadow_kernel != 0) {
-      INTEGRATOR_SHADOW_PATH_INIT(shadow_kernel);
-    }
   }
 #endif
 }
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
index 06f58f88bc8..9dc0eb02c9b 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN
 
 /* Visibility for the shadow ray. */
 ccl_device_forceinline uint integrate_intersect_shadow_visibility(KernelGlobals kg,
-                                                                  ConstIntegratorState state)
+                                                                  ConstIntegratorShadowState state)
 {
   uint visibility = PATH_RAY_SHADOW;
 
@@ -33,7 +33,7 @@ ccl_device_forceinline uint integrate_intersect_shadow_visibility(KernelGlobals
 }
 
 ccl_device bool integrate_intersect_shadow_opaque(KernelGlobals kg,
-                                                  IntegratorState state,
+                                                  IntegratorShadowState state,
                                                   ccl_private const Ray *ray,
                                                   const uint visibility)
 {
@@ -55,7 +55,7 @@ ccl_device bool integrate_intersect_shadow_opaque(KernelGlobals kg,
 }
 
 ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals kg,
-                                                                 ConstIntegratorState state)
+                                                                 ConstIntegratorShadowState state)
 {
   const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
   const int transparent_bounce = INTEGRATOR_STATE(state, shadow_path, transparent_bounce);
@@ -65,7 +65,7 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k
 
 #ifdef __TRANSPARENT_SHADOWS__
 ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
-                                                       IntegratorState state,
+                                                       IntegratorShadowState state,
                                                        ccl_private const Ray *ray,
                                                        const uint visibility)
 {
@@ -106,7 +106,7 @@ ccl_device bool integrate_intersect_shadow_transparent(KernelGlobals kg,
 }
 #endif
 
-ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorState state)
+ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowState state)
 {
   PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW);
 
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
index a3b2b1f9e90..6e3220aa3b7 100644
--- a/intern/cycles/kernel/integrator/integrator_megakernel.h
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -39,14 +39,17 @@ ccl_device void integrator_megakernel(KernelGlobals kg,
    * TODO: investigate if we can use device side enqueue for GPUs to avoid
    * having to compile this big kernel. */
   while (true) {
-    if (INTEGRATOR_STATE(state, shadow_path, queued_kernel)) {
+    const uint32_t shadow_queued_kernel = INTEGRATOR_STATE(
+        &state->shadow, shadow_path, queued_kernel);
+
+    if (shadow_queued_kernel) {
       /* First handle any shadow paths before we potentially create more shadow paths. */
-      switch (INTEGRATOR_STATE(state, shadow_path, queued_kernel)) {
+      switch (shadow_queued_kernel) {
         case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
-          integrator_intersect_shadow(kg, state);
+          integrator_intersect_shadow(kg, &state->shadow);
           break;
         case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
-          integrator_shade_shadow(kg, state, render_buffer);
+          integrator_shade_shadow(kg, &state->shadow, render_buffer);
           break;
         default:
           kernel_assert(0);
diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
index cdbe85f6b8c..94900754b76 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_shadow.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
@@ -30,7 +30,7 @@ ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits)
 
 #ifdef __TRANSPARENT_SHADOWS__
 ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
-                                                              IntegratorState state,
+                                                              IntegratorShadowState state,
                                                               const int hit)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);
@@ -69,7 +69,7 @@ ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
 
 #  ifdef __VOLUME__
 ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
-                                                           IntegratorState state,
+                                                           IntegratorShadowState state,
                                                            const int hit,
                                                            const int num_recorded_hits,
                                                            ccl_private float3 *ccl_restrict
@@ -97,14 +97,14 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
   shader_setup_from_volume(kg, shadow_sd, &ray);
 
   const float step_size = volume_stack_step_size(
-      kg, state, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
+      kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
 
   volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size);
 }
 #  endif
 
 ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
-                                                    IntegratorState state,
+                                                    IntegratorShadowState state,
                                                     const int num_hits)
 {
   /* Accumulate shadow for transparent surfaces. */
@@ -158,7 +158,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
 #endif /* __TRANSPARENT_SHADOWS__ */
 
 ccl_device void integrator_shade_shadow(KernelGlobals kg,
-                                        IntegratorState state,
+                                        IntegratorShadowState state,
                                         ccl_global float *ccl_restrict render_buffer)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP);
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index 08580645984..0108ba1373c 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -167,17 +167,20 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray);
   const bool is_light = light_sample_is_light(&ls);
 
+  /* Branch off shadow kernel. */
+  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+
   /* Copy volume stack and enter/exit volume. */
-  integrator_state_copy_volume_stack_to_shadow(kg, state);
+  integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
 
   if (is_transmission) {
 #  ifdef __VOLUME__
-    shadow_volume_stack_enter_exit(kg, state, sd);
+    shadow_volume_stack_enter_exit(kg, shadow_state, sd);
 #  endif
   }
 
   /* Write shadow ray and associated state to global memory. */
-  integrator_state_write_shadow_ray(kg, state, &ray);
+  integrator_state_write_shadow_ray(kg, shadow_state, &ray);
 
   /* Copy state from main path to shadow path. */
   const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
@@ -191,20 +194,32 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
     const float3 diffuse_glossy_ratio = (bounce == 0) ?
                                             bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
                                             INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
-    INTEGRATOR_STATE_WRITE(state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
   }
 
-  INTEGRATOR_STATE_WRITE(state, shadow_path, flag) = shadow_flag;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, bounce) = bounce;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, transparent_bounce) = transparent_bounce;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, throughput) = throughput;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
+      state, path, render_pixel_index);
+  INTEGRATOR_STATE_WRITE(
+      shadow_state, shadow_path, rng_offset) = INTEGRATOR_STATE(state, path, rng_offset) -
+                                               PRNG_BOUNCE_NUM * transparent_bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
+      state, path, rng_hash);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
+      state, path, sample);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_bounce) = INTEGRATOR_STATE(
+      state, path, diffuse_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, glossy_bounce) = INTEGRATOR_STATE(
+      state, path, glossy_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) = INTEGRATOR_STATE(
+      state, path, transmission_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
-    INTEGRATOR_STATE_WRITE(state, shadow_path, unshadowed_throughput) = throughput;
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unshadowed_throughput) = throughput;
   }
-
-  /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
 }
 #endif
 
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index d0dde815b5c..13a5e7bda05 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -71,7 +71,7 @@ typedef struct VolumeShaderCoefficients {
 
 /* Evaluate shader to get extinction coefficient at P. */
 ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
-                                                   IntegratorState state,
+                                                   IntegratorShadowState state,
                                                    ccl_private ShaderData *ccl_restrict sd,
                                                    ccl_private float3 *ccl_restrict extinction)
 {
@@ -187,7 +187,7 @@ ccl_device void volume_shadow_homogeneous(KernelGlobals kg, IntegratorState stat
 /* heterogeneous volume: integrate stepping through the volume until we
  * reach the end, get absorbed entirely, or run out of iterations */
 ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
-                                            IntegratorState state,
+                                            IntegratorShadowState state,
                                             ccl_private Ray *ccl_restrict ray,
                                             ccl_private ShaderData *ccl_restrict sd,
                                             ccl_private float3 *ccl_restrict throughput,
@@ -775,8 +775,11 @@ ccl_device_forceinline void integrate_volume_direct_light(
   light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray);
   const bool is_light = light_sample_is_light(ls);
 
+  /* Branch off shadow kernel. */
+  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+
   /* Write shadow ray and associated state to global memory. */
-  integrator_state_write_shadow_ray(kg, state, &ray);
+  integrator_state_write_shadow_ray(kg, shadow_state, &ray);
 
   /* Copy state from main path to shadow path. */
   const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
@@ -790,22 +793,34 @@ ccl_device_forceinline void integrate_volume_direct_light(
     const float3 diffuse_glossy_ratio = (bounce == 0) ?
                                             one_float3() :
                                             INTEGRATOR_STATE(state, path, diffuse_glossy_ratio);
-    INTEGRATOR_STATE_WRITE(state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
   }
 
-  INTEGRATOR_STATE_WRITE(state, shadow_path, flag) = shadow_flag;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, bounce) = bounce;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, transparent_bounce) = transparent_bounce;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, throughput) = throughput_phase;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
+      state, path, render_pixel_index);
+  INTEGRATOR_STATE_WRITE(
+      shadow_state, shadow_path, rng_offset) = INTEGRATOR_STATE(state, path, rng_offset) -
+                                               PRNG_BOUNCE_NUM * transparent_bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, rng_hash) = INTEGRATOR_STATE(
+      state, path, rng_hash);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, sample) = INTEGRATOR_STATE(
+      state, path, sample);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, flag) = shadow_flag;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, bounce) = bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transparent_bounce) = transparent_bounce;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, diffuse_bounce) = INTEGRATOR_STATE(
+      state, path, diffuse_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, glossy_bounce) = INTEGRATOR_STATE(
+      state, path, glossy_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) = INTEGRATOR_STATE(
+      state, path, transmission_bounce);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput_phase;
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
-    INTEGRATOR_STATE_WRITE(state, shadow_path, unshadowed_throughput) = throughput;
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unshadowed_throughput) = throughput;
   }
 
-  integrator_state_copy_volume_stack_to_shadow(kg, state);
-
-  /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+  integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
 }
 #  endif
 
@@ -902,7 +917,7 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
 
   /* Step through volume. */
   const float step_size = volume_stack_step_size(
-      kg, state, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
+      kg, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
 
   /* TODO: expensive to zero closures? */
   VolumeIntegrateResult result = {};
diff --git a/intern/cycles/kernel/integrator/integrator_shadow_state_template.h b/intern/cycles/kernel/integrator/integrator_shadow_state_template.h
new file mode 100644
index 00000000000..bc35b644ee1
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shadow_state_template.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/********************************* Shadow Path State **************************/
+
+KERNEL_STRUCT_BEGIN(shadow_path)
+/* Index of a pixel within the device render buffer. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
+/* Current sample number. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
+/* Random number generator seed. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING)
+/* Random number dimension offset. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, rng_offset, KERNEL_FEATURE_PATH_TRACING)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current diffuse ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current glossy ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transmission ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel bit indicating queued kernels. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput for shadow pass. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Number of intersections found by ray-tracing. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_path)
+
+/********************************** Shadow Ray *******************************/
+
+KERNEL_STRUCT_BEGIN(shadow_ray)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_ray)
+
+/*********************** Shadow Intersection result **************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(shadow_isect)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END_ARRAY(shadow_isect,
+                        INTEGRATOR_SHADOW_ISECT_SIZE_CPU,
+                        INTEGRATOR_SHADOW_ISECT_SIZE_GPU)
+
+/**************************** Shadow Volume Stack *****************************/
+
+KERNEL_STRUCT_BEGIN(shadow_volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 3aab456a021..84f34c6b986 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -66,6 +66,25 @@ CCL_NAMESPACE_BEGIN
 /* Integrator State
  *
  * CPU rendering path state with AoS layout. */
+typedef struct IntegratorShadowStateCPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+  } \
+  name;
+#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
+  } \
+  name[cpu_size];
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
+#include "kernel/integrator/integrator_shadow_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+} IntegratorShadowStateCPU;
+
 typedef struct IntegratorStateCPU {
 #define KERNEL_STRUCT_BEGIN(name) struct {
 #define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
@@ -84,6 +103,8 @@ typedef struct IntegratorStateCPU {
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
 #undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+
+  IntegratorShadowStateCPU shadow;
 } IntegratorStateCPU;
 
 /* Path Queue
@@ -108,7 +129,11 @@ typedef struct IntegratorStateGPU {
   } \
   name[gpu_size];
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
+
 #include "kernel/integrator/integrator_state_template.h"
+
+#include "kernel/integrator/integrator_shadow_state_template.h"
+
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
@@ -122,7 +147,10 @@ typedef struct IntegratorStateGPU {
   /* Count number of kernels queued for specific shaders. */
   ccl_global int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
 
-  /* Index of path which will be used by a next shadow catcher split.  */
+  /* Index of shadow path which will be used by a next shadow path.  */
+  ccl_global int *next_shadow_path_index;
+
+  /* Index of main path which will be used by a next shadow catcher split.  */
   ccl_global int *next_shadow_catcher_path_index;
 } IntegratorStateGPU;
 
@@ -140,6 +168,8 @@ typedef struct IntegratorStateGPU {
 
 typedef IntegratorStateCPU *ccl_restrict IntegratorState;
 typedef const IntegratorStateCPU *ccl_restrict ConstIntegratorState;
+typedef IntegratorShadowStateCPU *ccl_restrict IntegratorShadowState;
+typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
 
 #  define INTEGRATOR_STATE_NULL nullptr
 
@@ -157,6 +187,8 @@ typedef const IntegratorStateCPU *ccl_restrict ConstIntegratorState;
 
 typedef const int IntegratorState;
 typedef const int ConstIntegratorState;
+typedef const int IntegratorShadowState;
+typedef const int ConstIntegratorShadowState;
 
 #  define INTEGRATOR_STATE_NULL -1
 
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
index 9829da875eb..df8fb5e0e46 100644
--- a/intern/cycles/kernel/integrator/integrator_state_flow.h
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -63,10 +63,12 @@ CCL_NAMESPACE_BEGIN
         &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
     INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 
-#  define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
+    IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
+        &kernel_integrator_state.next_shadow_path_index[0], 1); \
     atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
                                 1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
 #  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
     atomic_fetch_and_sub_uint32( \
         &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
@@ -127,8 +129,9 @@ CCL_NAMESPACE_BEGIN
       (void)current_kernel; \
     }
 
-#  define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel) \
+    IntegratorShadowState shadow_state = &state->shadow; \
+    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
 #  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
     { \
       INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; \
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
index d9801574d4f..b1a6fd36fae 100644
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -28,6 +28,8 @@ KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRA
 KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
 /* Current ray bounce depth. */
 KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
 /* Current diffuse ray bounce depth. */
 KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
 /* Current glossy ray bounce depth. */
@@ -38,8 +40,6 @@ KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TR
 KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING)
 /* Current volume bounds ray bounce depth. */
 KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING)
-/* Current transparent ray bounce depth. */
-KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
 /* DeviceKernel bit indicating queued kernels. */
 KERNEL_STRUCT_MEMBER(path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
 /* Random number generator seed. */
@@ -107,57 +107,3 @@ KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_END_ARRAY(volume_stack,
                         KERNEL_STRUCT_VOLUME_STACK_SIZE,
                         KERNEL_STRUCT_VOLUME_STACK_SIZE)
-
-/********************************* Shadow Path State **************************/
-
-KERNEL_STRUCT_BEGIN(shadow_path)
-/* Current ray bounce depth. */
-KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
-/* Current transparent ray bounce depth. */
-KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
-/* DeviceKernel bit indicating queued kernels. */
-KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
-/* enum PathRayFlag */
-KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
-/* Throughput. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
-/* Throughput for shadow pass. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
-/* Ratio of throughput to distinguish diffuse and glossy render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
-/* Number of intersections found by ray-tracing. */
-KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_END(shadow_path)
-
-/********************************** Shadow Ray *******************************/
-
-KERNEL_STRUCT_BEGIN(shadow_ray)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_END(shadow_ray)
-
-/*********************** Shadow Intersection result **************************/
-
-/* Result from scene intersection. */
-KERNEL_STRUCT_BEGIN(shadow_isect)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_END_ARRAY(shadow_isect,
-                        INTEGRATOR_SHADOW_ISECT_SIZE_CPU,
-                        INTEGRATOR_SHADOW_ISECT_SIZE_GPU)
-
-/**************************** Shadow Volume Stack *****************************/
-
-KERNEL_STRUCT_BEGIN(shadow_volume_stack)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
-                        KERNEL_STRUCT_VOLUME_STACK_SIZE,
-                        KERNEL_STRUCT_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index dacc21e6eeb..5bcb9cc2d67 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -50,7 +50,7 @@ ccl_device_forceinline void integrator_state_read_ray(KernelGlobals kg,
 /* Shadow Ray */
 
 ccl_device_forceinline void integrator_state_write_shadow_ray(
-    KernelGlobals kg, IntegratorState state, ccl_private const Ray *ccl_restrict ray)
+    KernelGlobals kg, IntegratorShadowState state, ccl_private const Ray *ccl_restrict ray)
 {
   INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray->P;
   INTEGRATOR_STATE_WRITE(state, shadow_ray, D) = ray->D;
@@ -60,7 +60,7 @@ ccl_device_forceinline void integrator_state_write_shadow_ray(
 }
 
 ccl_device_forceinline void integrator_state_read_shadow_ray(KernelGlobals kg,
-                                                             ConstIntegratorState state,
+                                                             ConstIntegratorShadowState state,
                                                              ccl_private Ray *ccl_restrict ray)
 {
   ray->P = INTEGRATOR_STATE(state, shadow_ray, P);
@@ -122,7 +122,9 @@ ccl_device_forceinline bool integrator_state_volume_stack_is_empty(KernelGlobals
 /* Shadow Intersection */
 
 ccl_device_forceinline void integrator_state_write_shadow_isect(
-    IntegratorState state, ccl_private const Intersection *ccl_restrict isect, const int index)
+    IntegratorShadowState state,
+    ccl_private const Intersection *ccl_restrict isect,
+    const int index)
 {
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, index, t) = isect->t;
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, index, u) = isect->u;
@@ -133,7 +135,9 @@ ccl_device_forceinline void integrator_state_write_shadow_isect(
 }
 
 ccl_device_forceinline void integrator_state_read_shadow_isect(
-    ConstIntegratorState state, ccl_private Intersection *ccl_restrict isect, const int index)
+    ConstIntegratorShadowState state,
+    ccl_private Intersection *ccl_restrict isect,
+    const int index)
 {
   isect->prim = INTEGRATOR_STATE_ARRAY(state, shadow_isect, index, prim);
   isect->object = INTEGRATOR_STATE_ARRAY(state, shadow_isect, index, object);
@@ -143,8 +147,8 @@ ccl_device_forceinline void integrator_state_read_shadow_isect(
   isect->t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, index, t);
 }
 
-ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(KernelGlobals kg,
-                                                                         IntegratorState state)
+ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(
+    KernelGlobals kg, IntegratorShadowState shadow_state, ConstIntegratorState state)
 {
   if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
     int index = 0;
@@ -152,9 +156,9 @@ ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(KernelG
     do {
       shader = INTEGRATOR_STATE_ARRAY(state, volume_stack, index, shader);
 
-      INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_volume_stack, index, object) =
+      INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_volume_stack, index, object) =
           INTEGRATOR_STATE_ARRAY(state, volume_stack, index, object);
-      INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_volume_stack, index, shader) = shader;
+      INTEGRATOR_STATE_ARRAY_WRITE(shadow_state, shadow_volume_stack, index, shader) = shader;
 
       ++index;
     } while (shader != OBJECT_NONE);
@@ -181,7 +185,7 @@ ccl_device_forceinline void integrator_state_copy_volume_stack(KernelGlobals kg,
 }
 
 ccl_device_forceinline VolumeStack
-integrator_state_read_shadow_volume_stack(ConstIntegratorState state, int i)
+integrator_state_read_shadow_volume_stack(ConstIntegratorShadowState state, int i)
 {
   VolumeStack entry = {INTEGRATOR_STATE_ARRAY(state, shadow_volume_stack, i, object),
                        INTEGRATOR_STATE_ARRAY(state, shadow_volume_stack, i, shader)};
@@ -189,14 +193,14 @@ integrator_state_read_shadow_volume_stack(ConstIntegratorState state, int i)
 }
 
 ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty(
-    KernelGlobals kg, ConstIntegratorState state)
+    KernelGlobals kg, ConstIntegratorShadowState state)
 {
   return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
              INTEGRATOR_STATE_ARRAY(state, shadow_volume_stack, 0, shader) == SHADER_NONE :
              true;
 }
 
-ccl_device_forceinline void integrator_state_write_shadow_volume_stack(IntegratorState state,
+ccl_device_forceinline void integrator_state_write_shadow_volume_stack(IntegratorShadowState state,
                                                                        int i,
                                                                        VolumeStack entry)
 {
@@ -259,7 +263,6 @@ ccl_device_inline void integrator_state_move(KernelGlobals kg,
   integrator_state_copy_only(kg, to_state, state);
 
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
 }
 
 #endif
@@ -270,12 +273,11 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
                                                              IntegratorState state)
 {
 #if defined(__KERNEL_GPU__)
-  const IntegratorState to_state = atomic_fetch_and_add_uint32(
+  ConstIntegratorState to_state = atomic_fetch_and_add_uint32(
       &kernel_integrator_state.next_shadow_catcher_path_index[0], 1);
 
   integrator_state_copy_only(kg, to_state, state);
 #else
-
   IntegratorStateCPU *ccl_restrict to_state = state + 1;
 
   /* Only copy the required subset, since shadow intersections are big and irrelevant here. */
@@ -283,10 +285,99 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg,
   to_state->ray = state->ray;
   to_state->isect = state->isect;
   integrator_state_copy_volume_stack(kg, to_state, state);
-  to_state->shadow_path = state->shadow_path;
 #endif
 
   INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS;
 }
 
+#ifdef __KERNEL_CPU__
+ccl_device_inline int integrator_state_bounce(ConstIntegratorState state, const int)
+{
+  return INTEGRATOR_STATE(state, path, bounce);
+}
+
+ccl_device_inline int integrator_state_bounce(ConstIntegratorShadowState state, const int)
+{
+  return INTEGRATOR_STATE(state, shadow_path, bounce);
+}
+
+ccl_device_inline int integrator_state_diffuse_bounce(ConstIntegratorState state, const int)
+{
+  return INTEGRATOR_STATE(state, path, diffuse_bounce);
+}
+
+ccl_device_inline int integrator_state_diffuse_bounce(ConstIntegratorShadowState state, const int)
+{
+  return INTEGRATOR_STATE(state, shadow_path, diffuse_bounce);
+}
+
+ccl_device_inline int integrator_state_glossy_bounce(ConstIntegratorState state, const int)
+{
+  return INTEGRATOR_STATE(state, path, glossy_bounce);
+}
+
+ccl_device_inline int integrator_state_glossy_bounce(ConstIntegratorShadowState state, const int)
+{
+  return INTEGRATOR_STATE(state, shadow_path, glossy_bounce);
+}
+
+ccl_device_inline int integrator_state_transmission_bounce(ConstIntegratorState state, const int)
+{
+  return INTEGRATOR_STATE(state, path, transmission_bounce);
+}
+
+ccl_device_inline int integrator_state_transmission_bounce(ConstIntegratorShadowState state,
+                                                           const int)
+{
+  return INTEGRATOR_STATE(state, shadow_path, transmission_bounce);
+}
+
+ccl_device_inline int integrator_state_transparent_bounce(ConstIntegratorState state, const int)
+{
+  return INTEGRATOR_STATE(state, path, transparent_bounce);
+}
+
+ccl_device_inline int integrator_state_transparent_bounce(ConstIntegratorShadowState state,
+                                                          const int)
+{
+  return INTEGRATOR_STATE(state, shadow_path, transparent_bounce);
+}
+#else
+ccl_device_inline int integrator_state_bounce(ConstIntegratorShadowState state,
+                                              const uint32_t path_flag)
+{
+  return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, bounce) :
+                                         INTEGRATOR_STATE(state, path, bounce);
+}
+
+ccl_device_inline int integrator_state_diffuse_bounce(ConstIntegratorShadowState state,
+                                                      const uint32_t path_flag)
+{
+  return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, diffuse_bounce) :
+                                         INTEGRATOR_STATE(state, path, diffuse_bounce);
+}
+
+ccl_device_inline int integrator_state_glossy_bounce(ConstIntegratorShadowState state,
+                                                     const uint32_t path_flag)
+{
+  return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, glossy_bounce) :
+                                         INTEGRATOR_STATE(state, path, glossy_bounce);
+}
+
+ccl_device_inline int integrator_state_transmission_bounce(ConstIntegratorShadowState state,
+                                                           const uint32_t path_flag)
+{
+  return (path_flag & PATH_RAY_SHADOW) ?
+             INTEGRATOR_STATE(state, shadow_path, transmission_bounce) :
+             INTEGRATOR_STATE(state, path, transmission_bounce);
+}
+
+ccl_device_inline int integrator_state_transparent_bounce(ConstIntegratorShadowState state,
+                                                          const uint32_t path_flag)
+{
+  return (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, transparent_bounce) :
+                                         INTEGRATOR_STATE(state, path, transparent_bounce);
+}
+#endif
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h
index e3a4546508f..cf69826ffff 100644
--- a/intern/cycles/kernel/integrator/integrator_volume_stack.h
+++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h
@@ -98,7 +98,7 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
 }
 
 ccl_device void shadow_volume_stack_enter_exit(KernelGlobals kg,
-                                               IntegratorState state,
+                                               IntegratorShadowState state,
                                                ccl_private const ShaderData *sd)
 {
   volume_stack_enter_exit(
@@ -136,9 +136,7 @@ ccl_device_inline void volume_stack_clean(KernelGlobals kg, IntegratorState stat
 }
 
 template<typename StackReadOp>
-ccl_device float volume_stack_step_size(KernelGlobals kg,
-                                        IntegratorState state,
-                                        StackReadOp stack_read)
+ccl_device float volume_stack_step_size(KernelGlobals kg, StackReadOp stack_read)
 {
   float step_size = FLT_MAX;
 
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 5f32150d33c..848aaa18aae 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -393,17 +393,20 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(KernelGlobals kg
 
 /* Write light contribution to render buffer. */
 ccl_device_inline void kernel_accum_light(KernelGlobals kg,
-                                          ConstIntegratorState state,
+                                          ConstIntegratorShadowState state,
                                           ccl_global float *ccl_restrict render_buffer)
 {
   /* The throughput for shadow paths already contains the light shader evaluation. */
   float3 contribution = INTEGRATOR_STATE(state, shadow_path, throughput);
   kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, shadow_path, bounce));
 
-  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, shadow_path, render_pixel_index);
+  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+                                        kernel_data.film.pass_stride;
+  ccl_global float *buffer = render_buffer + render_buffer_offset;
 
   const uint32_t path_flag = INTEGRATOR_STATE(state, shadow_path, flag);
-  const int sample = INTEGRATOR_STATE(state, path, sample);
+  const int sample = INTEGRATOR_STATE(state, shadow_path, sample);
 
   kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
 
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 66eb468fdca..fa8de14916e 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -26,7 +26,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init_queues(IntegratorState state)
 {
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+#ifdef __KERNEL_CPU__
+  INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
+#endif
 }
 
 /* Minimalistic initialization of the path state, which is needed for early outputs in the
@@ -293,16 +295,15 @@ ccl_device_inline void path_state_rng_load(ConstIntegratorState state,
   rng_state->sample = INTEGRATOR_STATE(state, path, sample);
 }
 
-ccl_device_inline void shadow_path_state_rng_load(ConstIntegratorState state,
+ccl_device_inline void shadow_path_state_rng_load(ConstIntegratorShadowState state,
                                                   ccl_private RNGState *rng_state)
 {
-  const uint shadow_bounces = INTEGRATOR_STATE(state, shadow_path, transparent_bounce) -
-                              INTEGRATOR_STATE(state, path, transparent_bounce);
+  const uint shadow_bounces = INTEGRATOR_STATE(state, shadow_path, transparent_bounce);
 
-  rng_state->rng_hash = INTEGRATOR_STATE(state, path, rng_hash);
-  rng_state->rng_offset = INTEGRATOR_STATE(state, path, rng_offset) +
+  rng_state->rng_hash = INTEGRATOR_STATE(state, shadow_path, rng_hash);
+  rng_state->rng_offset = INTEGRATOR_STATE(state, shadow_path, rng_offset) +
                           PRNG_BOUNCE_NUM * shadow_bounces;
-  rng_state->sample = INTEGRATOR_STATE(state, path, sample);
+  rng_state->sample = INTEGRATOR_STATE(state, shadow_path, sample);
 }
 
 ccl_device_inline float path_state_rng_1D(KernelGlobals kg,
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 4a5a5309c61..d25191b72cf 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -575,9 +575,9 @@ ccl_device float3 shader_holdout_apply(KernelGlobals kg, ccl_private ShaderData
 
 /* Surface Evaluation */
 
-template<uint node_feature_mask>
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
 ccl_device void shader_eval_surface(KernelGlobals kg,
-                                    ConstIntegratorState state,
+                                    ConstIntegratorGenericState state,
                                     ccl_private ShaderData *ccl_restrict sd,
                                     ccl_global float *ccl_restrict buffer,
                                     uint32_t path_flag)
@@ -753,9 +753,9 @@ ccl_device int shader_phase_sample_closure(KernelGlobals kg,
 
 /* Volume Evaluation */
 
-template<const bool shadow, typename StackReadOp>
+template<const bool shadow, typename StackReadOp, typename ConstIntegratorGenericState>
 ccl_device_inline void shader_eval_volume(KernelGlobals kg,
-                                          ConstIntegratorState state,
+                                          ConstIntegratorGenericState state,
                                           ccl_private ShaderData *ccl_restrict sd,
                                           const uint32_t path_flag,
                                           StackReadOp stack_read)
@@ -831,8 +831,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals kg,
 
 /* Displacement Evaluation */
 
+template<typename ConstIntegratorGenericState>
 ccl_device void shader_eval_displacement(KernelGlobals kg,
-                                         ConstIntegratorState state,
+                                         ConstIntegratorGenericState state,
                                          ccl_private ShaderData *sd)
 {
   sd->num_closure = 0;
diff --git a/intern/cycles/kernel/kernel_shadow_catcher.h b/intern/cycles/kernel/kernel_shadow_catcher.h
index 00dddb5b198..9bed140b395 100644
--- a/intern/cycles/kernel/kernel_shadow_catcher.h
+++ b/intern/cycles/kernel/kernel_shadow_catcher.h
@@ -62,7 +62,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals
 ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
                                                             ConstIntegratorState state)
 {
-  if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) {
+  if (INTEGRATOR_PATH_IS_TERMINATED) {
     return false;
   }
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 3e276c24cdd..edae158f403 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -812,6 +812,7 @@ typedef struct ccl_align(16) ShaderData
 #ifdef __OSL__
   const struct KernelGlobalsCPU *osl_globals;
   const struct IntegratorStateCPU *osl_path_state;
+  const struct IntegratorShadowStateCPU *osl_shadow_path_state;
 #endif
 
   /* LCG state for closures that require additional random numbers. */
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index bb7655fbe9a..cbe1bf1bfc0 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -1015,31 +1015,44 @@ bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
   else if (name == u_path_ray_depth) {
     /* Ray Depth */
     const IntegratorStateCPU *state = sd->osl_path_state;
-    int f = state->path.bounce;
+    const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
+    int f = (state) ? state->path.bounce : (shadow_state) ? shadow_state->shadow_path.bounce : 0;
     return set_attribute_int(f, type, derivatives, val);
   }
   else if (name == u_path_diffuse_depth) {
     /* Diffuse Ray Depth */
     const IntegratorStateCPU *state = sd->osl_path_state;
-    int f = state->path.diffuse_bounce;
+    const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
+    int f = (state)        ? state->path.diffuse_bounce :
+            (shadow_state) ? shadow_state->shadow_path.diffuse_bounce :
+                             0;
     return set_attribute_int(f, type, derivatives, val);
   }
   else if (name == u_path_glossy_depth) {
     /* Glossy Ray Depth */
     const IntegratorStateCPU *state = sd->osl_path_state;
-    int f = state->path.glossy_bounce;
+    const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
+    int f = (state)        ? state->path.glossy_bounce :
+            (shadow_state) ? shadow_state->shadow_path.glossy_bounce :
+                             0;
     return set_attribute_int(f, type, derivatives, val);
   }
   else if (name == u_path_transmission_depth) {
     /* Transmission Ray Depth */
     const IntegratorStateCPU *state = sd->osl_path_state;
-    int f = state->path.transmission_bounce;
+    const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
+    int f = (state)        ? state->path.transmission_bounce :
+            (shadow_state) ? shadow_state->shadow_path.transmission_bounce :
+                             0;
     return set_attribute_int(f, type, derivatives, val);
   }
   else if (name == u_path_transparent_depth) {
     /* Transparent Ray Depth */
     const IntegratorStateCPU *state = sd->osl_path_state;
-    int f = state->path.transparent_bounce;
+    const IntegratorShadowStateCPU *shadow_state = sd->osl_shadow_path_state;
+    int f = (state)        ? state->path.transparent_bounce :
+            (shadow_state) ? shadow_state->shadow_path.transparent_bounce :
+                             0;
     return set_attribute_int(f, type, derivatives, val);
   }
   else if (name == u_ndc) {
@@ -1228,34 +1241,38 @@ bool OSLRenderServices::texture(ustring filename,
       /* Bevel shader hack. */
       if (nchannels >= 3) {
         const IntegratorStateCPU *state = sd->osl_path_state;
-        int num_samples = (int)s;
-        float radius = t;
-        float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
-        result[0] = N.x;
-        result[1] = N.y;
-        result[2] = N.z;
-        status = true;
+        if (state) {
+          int num_samples = (int)s;
+          float radius = t;
+          float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
+          result[0] = N.x;
+          result[1] = N.y;
+          result[2] = N.z;
+          status = true;
+        }
       }
       break;
     }
     case OSLTextureHandle::AO: {
       /* AO shader hack. */
       const IntegratorStateCPU *state = sd->osl_path_state;
-      int num_samples = (int)s;
-      float radius = t;
-      float3 N = make_float3(dsdx, dtdx, dsdy);
-      int flags = 0;
-      if ((int)dtdy) {
-        flags |= NODE_AO_INSIDE;
-      }
-      if ((int)options.sblur) {
-        flags |= NODE_AO_ONLY_LOCAL;
-      }
-      if ((int)options.tblur) {
-        flags |= NODE_AO_GLOBAL_RADIUS;
+      if (state) {
+        int num_samples = (int)s;
+        float radius = t;
+        float3 N = make_float3(dsdx, dtdx, dsdy);
+        int flags = 0;
+        if ((int)dtdy) {
+          flags |= NODE_AO_INSIDE;
+        }
+        if ((int)options.sblur) {
+          flags |= NODE_AO_ONLY_LOCAL;
+        }
+        if ((int)options.tblur) {
+          flags |= NODE_AO_GLOBAL_RADIUS;
+        }
+        result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
+        status = true;
       }
-      result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
-      status = true;
       break;
     }
     case OSLTextureHandle::SVM: {
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 4c067e88ab6..fba207e7230 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -89,7 +89,7 @@ void OSLShader::thread_free(KernelGlobalsCPU *kg)
 
 static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
                                         ShaderData *sd,
-                                        const IntegratorStateCPU *state,
+                                        const void *state,
                                         uint32_t path_flag,
                                         OSLThreadData *tdata)
 {
@@ -134,7 +134,12 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
 
   /* Used by render-services. */
   sd->osl_globals = kg;
-  sd->osl_path_state = state;
+  if (path_flag & PATH_RAY_SHADOW) {
+    sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
+  }
+  else {
+    sd->osl_path_state = (const IntegratorStateCPU *)state;
+  }
 }
 
 /* Surface */
@@ -175,7 +180,7 @@ static void flatten_surface_closure_tree(ShaderData *sd,
 }
 
 void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
-                             const IntegratorStateCPU *state,
+                             const void *state,
                              ShaderData *sd,
                              uint32_t path_flag)
 {
@@ -283,7 +288,7 @@ static void flatten_background_closure_tree(ShaderData *sd,
 }
 
 void OSLShader::eval_background(const KernelGlobalsCPU *kg,
-                                const IntegratorStateCPU *state,
+                                const void *state,
                                 ShaderData *sd,
                                 uint32_t path_flag)
 {
@@ -341,7 +346,7 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 }
 
 void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
-                            const IntegratorStateCPU *state,
+                            const void *state,
                             ShaderData *sd,
                             uint32_t path_flag)
 {
@@ -366,9 +371,7 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
 
 /* Displacement */
 
-void OSLShader::eval_displacement(const KernelGlobalsCPU *kg,
-                                  const IntegratorStateCPU *state,
-                                  ShaderData *sd)
+void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
 {
   /* setup shader globals from shader data */
   OSLThreadData *tdata = kg->osl_tdata;
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 2b3810b0a33..037a18a1f19 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -55,20 +55,18 @@ class OSLShader {
 
   /* eval */
   static void eval_surface(const KernelGlobalsCPU *kg,
-                           const IntegratorStateCPU *state,
+                           const void *state,
                            ShaderData *sd,
                            uint32_t path_flag);
   static void eval_background(const KernelGlobalsCPU *kg,
-                              const IntegratorStateCPU *state,
+                              const void *state,
                               ShaderData *sd,
                               uint32_t path_flag);
   static void eval_volume(const KernelGlobalsCPU *kg,
-                          const IntegratorStateCPU *state,
+                          const void *state,
                           ShaderData *sd,
                           uint32_t path_flag);
-  static void eval_displacement(const KernelGlobalsCPU *kg,
-                                const IntegratorStateCPU *state,
-                                ShaderData *sd);
+  static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
 
   /* attributes */
   static int find_attribute(const KernelGlobalsCPU *kg,
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 57879dc238f..472f3517839 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -225,9 +225,9 @@ CCL_NAMESPACE_END
 CCL_NAMESPACE_BEGIN
 
 /* Main Interpreter Loop */
-template<uint node_feature_mask, ShaderType type>
+template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
 ccl_device void svm_eval_nodes(KernelGlobals kg,
-                               ConstIntegratorState state,
+                               ConstIntegratorGenericState state,
                                ShaderData *sd,
                                ccl_global float *render_buffer,
                                uint32_t path_flag)
diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h
index a1efd2f0a43..4cfef7bc204 100644
--- a/intern/cycles/kernel/svm/svm_ao.h
+++ b/intern/cycles/kernel/svm/svm_ao.h
@@ -21,17 +21,17 @@ CCL_NAMESPACE_BEGIN
 #ifdef __SHADER_RAYTRACE__
 
 #  ifdef __KERNEL_OPTIX__
-extern "C" __device__ float __direct_callable__svm_node_ao(KernelGlobals kg,
-                                                           ConstIntegratorState state,
+extern "C" __device__ float __direct_callable__svm_node_ao(
 #  else
-ccl_device float svm_ao(KernelGlobals kg,
-                        ConstIntegratorState state,
+ccl_device float svm_ao(
 #  endif
-                                                           ccl_private ShaderData *sd,
-                                                           float3 N,
-                                                           float max_dist,
-                                                           int num_samples,
-                                                           int flags)
+    KernelGlobals kg,
+    ConstIntegratorState state,
+    ccl_private ShaderData *sd,
+    float3 N,
+    float max_dist,
+    int num_samples,
+    int flags)
 {
   if (flags & NODE_AO_GLOBAL_RADIUS) {
     max_dist = kernel_data.integrator.ao_bounces_distance;
@@ -91,7 +91,7 @@ ccl_device float svm_ao(KernelGlobals kg,
   return ((float)unoccluded) / num_samples;
 }
 
-template<uint node_feature_mask>
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
 #  if defined(__KERNEL_OPTIX__)
 ccl_device_inline
 #  else
@@ -99,7 +99,7 @@ ccl_device_noinline
 #  endif
     void
     svm_node_ao(KernelGlobals kg,
-                ConstIntegratorState state,
+                ConstIntegratorGenericState state,
                 ccl_private ShaderData *sd,
                 ccl_private float *stack,
                 uint4 node)
diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h
index 0d6395d52c0..833a6443b3c 100644
--- a/intern/cycles/kernel/svm/svm_aov.h
+++ b/intern/cycles/kernel/svm/svm_aov.h
@@ -26,9 +26,9 @@ ccl_device_inline bool svm_node_aov_check(const uint32_t path_flag,
   return ((render_buffer != NULL) && is_primary);
 }
 
-template<uint node_feature_mask>
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
 ccl_device void svm_node_aov_color(KernelGlobals kg,
-                                   ConstIntegratorState state,
+                                   ConstIntegratorGenericState state,
                                    ccl_private ShaderData *sd,
                                    ccl_private float *stack,
                                    uint4 node,
@@ -46,9 +46,9 @@ ccl_device void svm_node_aov_color(KernelGlobals kg,
   }
 }
 
-template<uint node_feature_mask>
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
 ccl_device void svm_node_aov_value(KernelGlobals kg,
-                                   ConstIntegratorState state,
+                                   ConstIntegratorGenericState state,
                                    ccl_private ShaderData *sd,
                                    ccl_private float *stack,
                                    uint4 node,
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index 3ce3af20795..292887beedf 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -99,15 +99,15 @@ ccl_device void svm_bevel_cubic_sample(const float radius,
  */
 
 #  ifdef __KERNEL_OPTIX__
-extern "C" __device__ float3 __direct_callable__svm_node_bevel(KernelGlobals kg,
-                                                               ConstIntegratorState state,
+extern "C" __device__ float3 __direct_callable__svm_node_bevel(
 #  else
-ccl_device float3 svm_bevel(KernelGlobals kg,
-                            ConstIntegratorState state,
+ccl_device float3 svm_bevel(
 #  endif
-                                                               ccl_private ShaderData *sd,
-                                                               float radius,
-                                                               int num_samples)
+    KernelGlobals kg,
+    ConstIntegratorState state,
+    ccl_private ShaderData *sd,
+    float radius,
+    int num_samples)
 {
   /* Early out if no sampling needed. */
   if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) {
@@ -282,7 +282,7 @@ ccl_device float3 svm_bevel(KernelGlobals kg,
   return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N;
 }
 
-template<uint node_feature_mask>
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
 #  if defined(__KERNEL_OPTIX__)
 ccl_device_inline
 #  else
@@ -290,7 +290,7 @@ ccl_device_noinline
 #  endif
     void
     svm_node_bevel(KernelGlobals kg,
-                   ConstIntegratorState state,
+                   ConstIntegratorGenericState state,
                    ccl_private ShaderData *sd,
                    ccl_private float *stack,
                    uint4 node)
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index c61ace9757a..5e1fc4f671c 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -18,9 +18,9 @@ CCL_NAMESPACE_BEGIN
 
 /* Light Path Node */
 
-template<uint node_feature_mask>
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
 ccl_device_noinline void svm_node_light_path(KernelGlobals kg,
-                                             ConstIntegratorState state,
+                                             ConstIntegratorGenericState state,
                                              ccl_private const ShaderData *sd,
                                              ccl_private float *stack,
                                              uint type,
@@ -64,48 +64,43 @@ ccl_device_noinline void svm_node_light_path(KernelGlobals kg,
       /* Read bounce from difference location depending if this is a shadow
        * path. It's a bit dubious to have integrate state details leak into
        * this function but hard to avoid currently. */
-      int bounce = 0;
       IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
       {
-        bounce = (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(state, shadow_path, bounce) :
-                                                 INTEGRATOR_STATE(state, path, bounce);
+        info = (float)integrator_state_bounce(state, path_flag);
       }
 
       /* For background, light emission and shadow evaluation we from a
        * surface or volume we are effective one bounce further. */
       if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
-        bounce++;
+        info += 1.0f;
       }
-
-      info = (float)bounce;
       break;
     }
-      /* TODO */
     case NODE_LP_ray_transparent: {
-      int bounce = 0;
       IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
       {
-        bounce = (path_flag & PATH_RAY_SHADOW) ?
-                     INTEGRATOR_STATE(state, shadow_path, transparent_bounce) :
-                     INTEGRATOR_STATE(state, path, transparent_bounce);
+        info = (float)integrator_state_transparent_bounce(state, path_flag);
       }
-
-      info = (float)bounce;
       break;
     }
-#if 0
     case NODE_LP_ray_diffuse:
-      info = (float)state->diffuse_bounce;
+      IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
+      {
+        info = (float)integrator_state_diffuse_bounce(state, path_flag);
+      }
       break;
     case NODE_LP_ray_glossy:
-      info = (float)state->glossy_bounce;
+      IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
+      {
+        info = (float)integrator_state_glossy_bounce(state, path_flag);
+      }
       break;
-#endif
-#if 0
     case NODE_LP_ray_transmission:
-      info = (float)state->transmission_bounce;
+      IF_KERNEL_NODES_FEATURE(LIGHT_PATH)
+      {
+        info = (float)integrator_state_transmission_bounce(state, path_flag);
+      }
       break;
-#endif
   }
 
   stack_store_float(stack, out_offset, info);