Diffstat (limited to 'intern/cycles/integrator/path_trace_work_gpu.cpp')
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 272
1 file changed, 195 insertions, 77 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 02830a00405..36f275e1075 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -52,7 +52,11 @@ static size_t estimate_single_state_size()
   * For until then use common value. Currently this size is only used for logging, but is weak to
   * rely on this. */
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
+
 #include "kernel/integrator/integrator_state_template.h"
+
+#include "kernel/integrator/integrator_shadow_state_template.h"
+
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
@@ -74,22 +78,25 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
       integrator_shader_raytrace_sort_counter_(
           device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
-      integrator_next_shadow_catcher_path_index_(
-          device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+      integrator_shader_sort_prefix_sum_(
+          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
+      integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
+      integrator_next_shadow_path_index_(
+          device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
       queued_paths_(device, "queued_paths", MEM_READ_WRITE),
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
       display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
       max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
-      min_num_active_paths_(queue_->num_concurrent_busy_states()),
-      max_active_path_index_(0)
+      min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
+      max_active_main_path_index_(0)
 {
   memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
 
   /* Limit number of active paths to the half of the overall state. This is due to the logic in the
    * path compaction which relies on the fact that regeneration does not happen sooner than half of
    * the states are available again. */
-  min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
+  min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
 }
 
 void PathTraceWorkGPU::alloc_integrator_soa()
@@ -138,7 +145,11 @@ void PathTraceWorkGPU::alloc_integrator_soa()
     } \
   }
 #define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
+
 #include "kernel/integrator/integrator_state_template.h"
+
+#include "kernel/integrator/integrator_shadow_state_template.h"
+
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
@@ -190,6 +201,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
   integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
   integrator_shader_raytrace_sort_counter_.zero_to_device();
 
+  integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+  integrator_shader_sort_prefix_sum_.zero_to_device();
+
   integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
       (int *)integrator_shader_sort_counter_.device_pointer;
   integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
@@ -199,16 +213,22 @@ void PathTraceWorkGPU::alloc_integrator_path_split()
 {
-  if (integrator_next_shadow_catcher_path_index_.size() != 0) {
-    return;
+  if (integrator_next_shadow_path_index_.size() == 0) {
+    integrator_next_shadow_path_index_.alloc(1);
+    integrator_next_shadow_path_index_.zero_to_device();
+
+    integrator_state_gpu_.next_shadow_path_index =
+        (int *)integrator_next_shadow_path_index_.device_pointer;
   }
 
-  integrator_next_shadow_catcher_path_index_.alloc(1);
-  /* TODO(sergey): Use queue? */
-  integrator_next_shadow_catcher_path_index_.zero_to_device();
+  if (integrator_next_main_path_index_.size() == 0) {
+    integrator_next_main_path_index_.alloc(1);
+    integrator_next_shadow_path_index_.data()[0] = 0;
+    integrator_next_main_path_index_.zero_to_device();
 
-  integrator_state_gpu_.next_shadow_catcher_path_index =
-      (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+    integrator_state_gpu_.next_main_path_index =
+        (int *)integrator_next_main_path_index_.device_pointer;
+  }
 }
 
 void PathTraceWorkGPU::alloc_work_memory()
@@ -282,7 +302,7 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
       break;
     }
 
-    num_busy_accum += get_num_active_paths();
+    num_busy_accum += num_active_main_paths_paths();
     ++num_iterations;
   }
 
@@ -341,54 +361,65 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
     return false;
   }
 
-  /* Finish shadows before potentially adding more shadow rays. We can only
-   * store one shadow ray in the integrator state.
-   *
-   * When there is a shadow catcher in the scene finish shadow rays before invoking interesect
-   * closest kernel since so that the shadow paths are writing to the pre-split state. */
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
-      (has_shadow_catcher() && kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST)) {
-    if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
-      enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
-      return true;
+  /* For kernels that add shadow paths, check if there is enough space available.
+   * If not, schedule shadow kernels first to clear out the shadow paths. */
+  int num_paths_limit = INT_MAX;
+
+  if (kernel_creates_shadow_paths(kernel)) {
+    compact_shadow_paths();
+
+    const int available_shadow_paths = max_num_paths_ -
+                                       integrator_next_shadow_path_index_.data()[0];
+    if (available_shadow_paths < queue_counter->num_queued[kernel]) {
+      if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+        enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+        return true;
+      }
+      else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+        enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+        return true;
+      }
     }
-    else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
-      enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
-      return true;
+    else if (kernel_creates_ao_paths(kernel)) {
+      /* AO kernel creates two shadow paths, so limit number of states to schedule. */
+      num_paths_limit = available_shadow_paths / 2;
     }
   }
 
   /* Schedule kernel with maximum number of queued items. */
-  enqueue_path_iteration(kernel);
+  enqueue_path_iteration(kernel, num_paths_limit);
+
+  /* Update next shadow path index for kernels that can add shadow paths. */
+  if (kernel_creates_shadow_paths(kernel)) {
+    queue_->copy_from_device(integrator_next_shadow_path_index_);
+  }
+
   return true;
 }
 
-void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit)
 {
   void *d_path_index = (void *)NULL;
 
   /* Create array of path indices for which this kernel is queued to be executed. */
-  int work_size = max_active_path_index_;
+  int work_size = kernel_max_active_main_path_index(kernel);
 
   IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
   int num_queued = queue_counter->num_queued[kernel];
 
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+  if (kernel_uses_sorting(kernel)) {
     /* Compute array of active paths, sorted by shader. */
     work_size = num_queued;
     d_path_index = (void *)queued_paths_.device_pointer;
 
-    compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+    compute_sorted_queued_paths(
+        DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
   }
   else if (num_queued < work_size) {
     work_size = num_queued;
     d_path_index = (void *)queued_paths_.device_pointer;
 
-    if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-        kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+    if (kernel_is_shadow_path(kernel)) {
       /* Compute array of active shadow paths for specific kernel. */
       compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
     }
@@ -398,6 +429,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     }
   }
 
+  work_size = min(work_size, num_paths_limit);
+
   DCHECK_LE(work_size, max_num_paths_);
 
   switch (kernel) {
@@ -432,17 +465,20 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
   }
 }
 
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
+                                                   DeviceKernel queued_kernel,
+                                                   const int num_paths_limit)
 {
   int d_queued_kernel = queued_kernel;
   void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
-  assert(d_counter != nullptr);
+  void *d_prefix_sum = (void *)integrator_shader_sort_prefix_sum_.device_pointer;
+  assert(d_counter != nullptr && d_prefix_sum != nullptr);
 
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
     int max_shaders = device_scene_->data.max_shaders;
-    void *args[] = {&d_counter, &max_shaders};
+    void *args[] = {&d_counter, &d_prefix_sum, &max_shaders};
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
 
@@ -451,29 +487,24 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKe
   /* Launch kernel to fill the active paths arrays. */
   {
     /* TODO: this could be smaller for terminated paths based on amount of work we want
-     * to schedule. */
-    const int work_size = max_active_path_index_;
+     * to schedule, and also based on num_paths_limit.
+     *
+     * Also, when the number paths is limited it may be better to prefer paths from the
+     * end of the array since compaction would need to do less work. */
+    const int work_size = kernel_max_active_main_path_index(queued_kernel);
 
     void *d_queued_paths = (void *)queued_paths_.device_pointer;
     void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
     void *args[] = {const_cast<int *>(&work_size),
+                    const_cast<int *>(&num_paths_limit),
                     &d_queued_paths,
                     &d_num_queued_paths,
                     &d_counter,
+                    &d_prefix_sum,
                     &d_queued_kernel};
 
     queue_->enqueue(kernel, work_size, args);
   }
-
-  if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
-    queue_->zero_to_device(integrator_shader_sort_counter_);
-  }
-  else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
-    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
-  }
-  else {
-    assert(0);
-  }
 }
 
 void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
@@ -481,7 +512,7 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
   int d_queued_kernel = queued_kernel;
 
   /* Launch kernel to fill the active paths arrays. */
-  const int work_size = max_active_path_index_;
+  const int work_size = kernel_max_active_main_path_index(queued_kernel);
   void *d_queued_paths = (void *)queued_paths_.device_pointer;
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
   void *args[] = {
@@ -491,18 +522,76 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
   queue_->enqueue(kernel, work_size, args);
 }
 
-void PathTraceWorkGPU::compact_states(const int num_active_paths)
+void PathTraceWorkGPU::compact_main_paths(const int num_active_paths)
 {
+  /* Early out if there is nothing that needs to be compacted. */
   if (num_active_paths == 0) {
-    max_active_path_index_ = 0;
+    max_active_main_path_index_ = 0;
+    return;
   }
 
-  /* Compact fragmented path states into the start of the array, moving any paths
-   * with index higher than the number of active paths into the gaps. */
-  if (max_active_path_index_ == num_active_paths) {
+  const int min_compact_paths = 32;
+  if (max_active_main_path_index_ == num_active_paths ||
+      max_active_main_path_index_ < min_compact_paths) {
+    return;
+  }
+
+  /* Compact. */
+  compact_paths(num_active_paths,
+                max_active_main_path_index_,
+                DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES);
+
+  /* Adjust max active path index now we know which part of the array is actually used. */
+  max_active_main_path_index_ = num_active_paths;
+}
+
+void PathTraceWorkGPU::compact_shadow_paths()
+{
+  IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+  const int num_active_paths =
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
+
+  /* Early out if there is nothing that needs to be compacted. */
+  if (num_active_paths == 0) {
+    if (integrator_next_shadow_path_index_.data()[0] != 0) {
+      integrator_next_shadow_path_index_.data()[0] = 0;
+      queue_->copy_to_device(integrator_next_shadow_path_index_);
+    }
     return;
   }
 
+  /* Compact if we can reduce the space used by half. Not always since
+   * compaction has a cost. */
+  const float shadow_compact_ratio = 0.5f;
+  const int min_compact_paths = 32;
+  if (integrator_next_shadow_path_index_.data()[0] < num_active_paths * shadow_compact_ratio ||
+      integrator_next_shadow_path_index_.data()[0] < min_compact_paths) {
+    return;
+  }
+
+  /* Compact. */
+  compact_paths(num_active_paths,
+                integrator_next_shadow_path_index_.data()[0],
+                DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES);
+
+  /* Adjust max active path index now we know which part of the array is actually used. */
+  integrator_next_shadow_path_index_.data()[0] = num_active_paths;
+  queue_->copy_to_device(integrator_next_shadow_path_index_);
+}
+
+void PathTraceWorkGPU::compact_paths(const int num_active_paths,
+                                     const int max_active_path_index,
+                                     DeviceKernel terminated_paths_kernel,
+                                     DeviceKernel compact_paths_kernel,
+                                     DeviceKernel compact_kernel)
+{
+  /* Compact fragmented path states into the start of the array, moving any paths
+   * with index higher than the number of active paths into the gaps. */
   void *d_compact_paths = (void *)queued_paths_.device_pointer;
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
 
@@ -513,17 +602,17 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
     int work_size = num_active_paths;
     void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
     queue_->zero_to_device(num_queued_paths_);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+    queue_->enqueue(terminated_paths_kernel, work_size, args);
   }
 
   /* Create array of paths that we need to compact, where the path index is bigger
    * than the number of active paths. */
   {
-    int work_size = max_active_path_index_;
+    int work_size = max_active_path_index;
    void *args[] = {
         &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
     queue_->zero_to_device(num_queued_paths_);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+    queue_->enqueue(compact_paths_kernel, work_size, args);
   }
 
   queue_->copy_from_device(num_queued_paths_);
@@ -538,13 +627,8 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
     int terminated_states_offset = num_active_paths;
     void *args[] = {
         &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+    queue_->enqueue(compact_kernel, work_size, args);
   }
-
-  queue_->synchronize();
-
-  /* Adjust max active path index now we know which part of the array is actually used. */
-  max_active_path_index_ = num_active_paths;
 }
 
 bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
@@ -558,7 +642,7 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
     return false;
   }
 
-  int num_active_paths = get_num_active_paths();
+  int num_active_paths = num_active_main_paths_paths();
 
   /* Don't schedule more work if canceling. */
   if (is_cancel_requested()) {
@@ -598,7 +682,7 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
   /* Schedule when we're out of paths or there are too few paths to keep the
    * device occupied. */
   int num_paths = num_active_paths;
-  if (num_paths == 0 || num_paths < min_num_active_paths_) {
+  if (num_paths == 0 || num_paths < min_num_active_main_paths_) {
     /* Get work tiles until the maximum number of path is reached. */
     while (num_paths < max_num_camera_paths) {
       KernelWorkTile work_tile;
@@ -625,11 +709,11 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
 
   /* Compact state array when number of paths becomes small relative to the
    * known maximum path index, which makes computing active index arrays slow. */
-  compact_states(num_active_paths);
+  compact_main_paths(num_active_paths);
 
   if (has_shadow_catcher()) {
-    integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
-    queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+    integrator_next_main_path_index_.data()[0] = num_paths;
+    queue_->copy_to_device(integrator_next_main_path_index_);
   }
 
   enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
@@ -682,12 +766,11 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
 
   queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
 
-  max_active_path_index_ = path_index_offset + num_predicted_splits;
+  max_active_main_path_index_ = path_index_offset + num_predicted_splits;
 }
 
-int PathTraceWorkGPU::get_num_active_paths()
+int PathTraceWorkGPU::num_active_main_paths_paths()
 {
-  /* TODO: this is wrong, does not account for duplicates with shadow! */
   IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
 
   int num_paths = 0;
@@ -695,7 +778,10 @@ int PathTraceWorkGPU::get_num_active_paths()
     DCHECK_GE(queue_counter->num_queued[i], 0)
         << "Invalid number of queued states for kernel "
        << device_kernel_as_string(static_cast<DeviceKernel>(i));
-    num_paths += queue_counter->num_queued[i];
+
+    if (!kernel_is_shadow_path((DeviceKernel)i)) {
+      num_paths += queue_counter->num_queued[i];
+    }
   }
 
   return num_paths;
@@ -960,7 +1046,7 @@ bool PathTraceWorkGPU::has_shadow_catcher() const
 
 int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
 {
-  if (max_active_path_index_ == 0) {
+  if (max_active_main_path_index_ == 0) {
     return 0;
   }
 
@@ -970,7 +1056,7 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
 
   queue_->zero_to_device(num_queued_paths_);
 
-  const int work_size = max_active_path_index_;
+  const int work_size = max_active_main_path_index_;
 
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
   void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
@@ -981,4 +1067,36 @@ int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
   return num_queued_paths_.data()[0];
 }
 
+bool PathTraceWorkGPU::kernel_uses_sorting(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
+bool PathTraceWorkGPU::kernel_creates_shadow_paths(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+}
+
+bool PathTraceWorkGPU::kernel_creates_ao_paths(DeviceKernel kernel)
+{
+  return (device_scene_->data.film.pass_ao != PASS_UNUSED) &&
+         (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
+bool PathTraceWorkGPU::kernel_is_shadow_path(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+}
+
+int PathTraceWorkGPU::kernel_max_active_main_path_index(DeviceKernel kernel)
+{
+  return (kernel_is_shadow_path(kernel)) ? integrator_next_shadow_path_index_.data()[0] :
+                                           max_active_main_path_index_;
+}
+
 CCL_NAMESPACE_END
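
Editor's note: the sorting change above replaces the per-kernel counter reset with an explicit prefix-sum buffer (integrator_shader_sort_prefix_sum_) that is passed to the sorted-paths-array kernel together with num_paths_limit. The following is a minimal CPU-side sketch of that idea, not the actual GPU kernels: given per-shader counts of queued paths, an exclusive prefix sum yields each shader's output offset, and path indices are then scattered into a contiguous array grouped by shader. The names sort_queued_paths_by_shader and QueuedPath are hypothetical and exist only for this illustration.

// Minimal CPU sketch of sort-by-shader via counting + exclusive prefix sum.
// Conceptually mirrors DEVICE_KERNEL_PREFIX_SUM followed by
// DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY; all names here are hypothetical.
#include <cstdio>
#include <vector>

struct QueuedPath {
  int path_index; /* Index into the path state array. */
  int shader;     /* Sort key: shader of the queued path. */
};

static std::vector<int> sort_queued_paths_by_shader(const std::vector<QueuedPath> &queued,
                                                    const int max_shaders)
{
  /* Count how many queued paths use each shader (the "sort counter"). */
  std::vector<int> counter(max_shaders, 0);
  for (const QueuedPath &path : queued) {
    counter[path.shader]++;
  }

  /* Exclusive prefix sum gives the first output slot for each shader. */
  std::vector<int> prefix_sum(max_shaders, 0);
  for (int shader = 1; shader < max_shaders; shader++) {
    prefix_sum[shader] = prefix_sum[shader - 1] + counter[shader - 1];
  }

  /* Scatter path indices to their slots; paths with the same shader end up
   * adjacent, which keeps GPU shader evaluation coherent. */
  std::vector<int> sorted_indices(queued.size());
  std::vector<int> offset = prefix_sum;
  for (const QueuedPath &path : queued) {
    sorted_indices[offset[path.shader]++] = path.path_index;
  }
  return sorted_indices;
}

int main()
{
  const std::vector<QueuedPath> queued = {{10, 2}, {11, 0}, {12, 2}, {13, 1}, {14, 0}};
  for (const int path_index : sort_queued_paths_by_shader(queued, 3)) {
    printf("%d ", path_index); /* Prints: 11 14 13 10 12 */
  }
  printf("\n");
  return 0;
}

In the patch itself the per-shader counters live in integrator_shader_sort_counter_ (filled by the shading kernels), the offsets in integrator_shader_sort_prefix_sum_, and num_paths_limit caps how many queued states the sorted-paths kernel may emit so that newly spawned shadow paths always fit in the remaining state slots.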