-rw-r--r--  intern/cycles/device/cuda/queue.cpp                      |  2
-rw-r--r--  intern/cycles/device/device_kernel.cpp                   |  6
-rw-r--r--  intern/cycles/device/hip/queue.cpp                       |  2
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp         | 98
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h           |  8
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 | 41
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state_util.h  | 56
-rw-r--r--  intern/cycles/kernel/kernel_types.h                      |  3
8 files changed, 186 insertions, 30 deletions
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 6b2c9a40082..09352a84181 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -113,6 +113,8 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
       /* See parall_active_index.h for why this amount of shared memory is needed. */
       shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
       break;
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index e0833331b77..1e282aac57e 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -64,6 +64,12 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "integrator_compact_paths_array";
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
       return "integrator_compact_states";
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+      return "integrator_terminated_shadow_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+      return "integrator_compact_shadow_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
+      return "integrator_compact_shadow_states";
     case DEVICE_KERNEL_INTEGRATOR_RESET:
       return "integrator_reset";
     case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index a612f59fb32..0f053ccbeb5 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -113,6 +113,8 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
       /* See parall_active_index.h for why this amount of shared memory is needed. */
       shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
       break;
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 2c71b1cf876..36f275e1075 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -361,26 +361,13 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
     return false;
   }
 
-  /* If the number of shadow kernels dropped to zero, set the next shadow path
-   * index to zero as well.
-   *
-   * TODO: use shadow path compaction to lower it more often instead of letting
-   * it fill up entirely? */
-  const int num_queued_shadow =
-      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
-      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
-  if (num_queued_shadow == 0) {
-    if (integrator_next_shadow_path_index_.data()[0] != 0) {
-      integrator_next_shadow_path_index_.data()[0] = 0;
-      queue_->copy_to_device(integrator_next_shadow_path_index_);
-    }
-  }
-
   /* For kernels that add shadow paths, check if there is enough space available.
    * If not, schedule shadow kernels first to clear out the shadow paths. */
   int num_paths_limit = INT_MAX;
 
   if (kernel_creates_shadow_paths(kernel)) {
+    compact_shadow_paths();
+
     const int available_shadow_paths = max_num_paths_ -
                                        integrator_next_shadow_path_index_.data()[0];
     if (available_shadow_paths < queue_counter->num_queued[kernel]) {
@@ -535,18 +522,76 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
   queue_->enqueue(kernel, work_size, args);
 }
 
-void PathTraceWorkGPU::compact_states(const int num_active_paths)
+void PathTraceWorkGPU::compact_main_paths(const int num_active_paths)
 {
+  /* Early out if there is nothing that needs to be compacted. */
   if (num_active_paths == 0) {
     max_active_main_path_index_ = 0;
+    return;
   }
 
-  /* Compact fragmented path states into the start of the array, moving any paths
-   * with index higher than the number of active paths into the gaps. */
-  if (max_active_main_path_index_ == num_active_paths) {
+  const int min_compact_paths = 32;
+  if (max_active_main_path_index_ == num_active_paths ||
+      max_active_main_path_index_ < min_compact_paths) {
+    return;
+  }
+
+  /* Compact. */
+  compact_paths(num_active_paths,
+                max_active_main_path_index_,
+                DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES);
+
+  /* Adjust max active path index now we know which part of the array is actually used. */
+  max_active_main_path_index_ = num_active_paths;
+}
+
+void PathTraceWorkGPU::compact_shadow_paths()
+{
+  IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+  const int num_active_paths =
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
+
+  /* Early out if there is nothing that needs to be compacted. */
+  if (num_active_paths == 0) {
+    if (integrator_next_shadow_path_index_.data()[0] != 0) {
+      integrator_next_shadow_path_index_.data()[0] = 0;
+      queue_->copy_to_device(integrator_next_shadow_path_index_);
+    }
+    return;
+  }
+
+  /* Compact if we can reduce the space used by half. Not always since
+   * compaction has a cost. */
+  const float shadow_compact_ratio = 0.5f;
+  const int min_compact_paths = 32;
+  if (integrator_next_shadow_path_index_.data()[0] < num_active_paths * shadow_compact_ratio ||
+      integrator_next_shadow_path_index_.data()[0] < min_compact_paths) {
     return;
   }
 
+  /* Compact. */
+  compact_paths(num_active_paths,
+                integrator_next_shadow_path_index_.data()[0],
+                DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES);
+
+  /* Adjust max active path index now we know which part of the array is actually used. */
+  integrator_next_shadow_path_index_.data()[0] = num_active_paths;
+  queue_->copy_to_device(integrator_next_shadow_path_index_);
+}
+
+void PathTraceWorkGPU::compact_paths(const int num_active_paths,
+                                     const int max_active_path_index,
+                                     DeviceKernel terminated_paths_kernel,
+                                     DeviceKernel compact_paths_kernel,
+                                     DeviceKernel compact_kernel)
+{
+  /* Compact fragmented path states into the start of the array, moving any paths
+   * with index higher than the number of active paths into the gaps. */
   void *d_compact_paths = (void *)queued_paths_.device_pointer;
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
 
@@ -557,17 +602,17 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
     int work_size = num_active_paths;
     void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
     queue_->zero_to_device(num_queued_paths_);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+    queue_->enqueue(terminated_paths_kernel, work_size, args);
   }
 
   /* Create array of paths that we need to compact, where the path index is bigger
    * than the number of active paths. */
   {
-    int work_size = max_active_main_path_index_;
+    int work_size = max_active_path_index;
     void *args[] = {
         &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
     queue_->zero_to_device(num_queued_paths_);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+    queue_->enqueue(compact_paths_kernel, work_size, args);
   }
 
   queue_->copy_from_device(num_queued_paths_);
@@ -582,13 +627,8 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
     int terminated_states_offset = num_active_paths;
     void *args[] = {
         &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+    queue_->enqueue(compact_kernel, work_size, args);
   }
-
-  queue_->synchronize();
-
-  /* Adjust max active path index now we know which part of the array is actually used. */
-  max_active_main_path_index_ = num_active_paths;
 }
 
 bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
@@ -669,7 +709,7 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
 
   /* Compact state array when number of paths becomes small relative to the
    * known maximum path index, which makes computing active index arrays slow. */
-  compact_states(num_active_paths);
+  compact_main_paths(num_active_paths);
 
   if (has_shadow_catcher()) {
     integrator_next_main_path_index_.data()[0] = num_paths;
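For readers following the new compact_paths() flow above, here is a minimal single-threaded sketch of the same idea, assuming a toy PathSlot array instead of the real SoA integrator state (all names in the sketch are illustrative, not Cycles API): terminated slots below the active-path count are collected, active slots above it are collected, and the latter are moved into the former so the used range of the array becomes dense again.

/* Single-threaded sketch of the compaction performed by the three kernels that
 * PathTraceWorkGPU::compact_paths() enqueues. PathSlot, compact_paths_sketch and
 * main are illustrative only. */

#include <cassert>
#include <vector>

struct PathSlot {
  int queued_kernel = 0; /* 0 means the slot is terminated/free. */
  int payload = 0;       /* Stands in for the rest of the path state. */
};

static void compact_paths_sketch(std::vector<PathSlot> &slots, const int num_active_paths)
{
  /* Step 1: terminated slots in [0, num_active_paths), i.e. gaps that can be refilled.
   * This mirrors the terminated_paths_kernel pass. */
  std::vector<int> terminated;
  for (int i = 0; i < num_active_paths; i++) {
    if (slots[i].queued_kernel == 0) {
      terminated.push_back(i);
    }
  }

  /* Step 2: active slots at or above num_active_paths, i.e. paths that must move down.
   * This mirrors the compact_paths_kernel pass. */
  std::vector<int> active;
  for (int i = num_active_paths; i < (int)slots.size(); i++) {
    if (slots[i].queued_kernel != 0) {
      active.push_back(i);
    }
  }

  /* Step 3: move each high active slot into a low gap, mirroring the compact_kernel
   * pass. With exactly num_active_paths active slots in total, the counts match. */
  assert(active.size() == terminated.size());
  for (size_t i = 0; i < active.size(); i++) {
    slots[terminated[i]] = slots[active[i]];
    slots[active[i]].queued_kernel = 0;
  }
}

int main()
{
  /* Fragmented state: 3 active paths spread over 6 slots, compacted into [0, 3). */
  std::vector<PathSlot> slots(6);
  slots[0].queued_kernel = 1;
  slots[3].queued_kernel = 1;
  slots[5].queued_kernel = 1;
  compact_paths_sketch(slots, 3);
  return (slots[0].queued_kernel && slots[1].queued_kernel && slots[2].queued_kernel) ? 0 : 1;
}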
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index e16c491695b..8734d2c2852 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -86,7 +86,13 @@ class PathTraceWorkGPU : public PathTraceWork {
                                    DeviceKernel queued_kernel,
                                    const int num_paths_limit);
 
-  void compact_states(const int num_active_paths);
+  void compact_main_paths(const int num_active_paths);
+  void compact_shadow_paths();
+  void compact_paths(const int num_active_paths,
+                     const int max_active_path_index,
+                     DeviceKernel terminated_paths_kernel,
+                     DeviceKernel compact_paths_kernel,
+                     DeviceKernel compact_kernel);
 
   int num_active_main_paths_paths();
 
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index fcb398f7e6d..eeac09d4b29 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
       });
 }
 
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
+                                                        int *indices,
+                                                        int *num_indices,
+                                                        int indices_offset)
+{
+  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+      num_states, indices + indices_offset, num_indices, [](const int state) {
+        return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+      });
+}
+
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
     kernel_gpu_integrator_sorted_paths_array(int num_states,
                                              int num_states_limit,
@@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
   }
 }
 
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
+                                                     int *indices,
+                                                     int *num_indices,
+                                                     int num_active_paths)
+{
+  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+      num_states, indices, num_indices, [num_active_paths](const int state) {
+        return (state >= num_active_paths) &&
+               (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
+      });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
+                                                const int active_states_offset,
+                                                const int terminated_states_offset,
+                                                const int work_size)
+{
+  const int global_index = ccl_gpu_global_id_x();
+
+  if (global_index < work_size) {
+    const int from_state = active_terminated_states[active_states_offset + global_index];
+    const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+    integrator_shadow_state_move(NULL, to_state, from_state);
+  }
+}
+
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
     kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
 {
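The two new *_shadow_paths_array kernels above both delegate to gpu_parallel_active_index_array with a different predicate. As a rough serial stand-in for what that gather produces (the real implementation is a parallel, block-wide operation; everything below, including active_index_sketch, is illustrative only):

/* Serial stand-in for the index gathering done by gpu_parallel_active_index_array:
 * append every state index that satisfies a predicate to `indices` and return how
 * many were written. Names here are illustrative, not the Cycles implementation. */

#include <functional>
#include <vector>

static int active_index_sketch(const int num_states,
                               std::vector<int> &indices,
                               const std::function<bool(int)> &predicate)
{
  int num_indices = 0;
  for (int state = 0; state < num_states; state++) {
    if (predicate(state)) {
      indices[num_indices++] = state; /* The GPU version uses a block-wide scan instead. */
    }
  }
  return num_indices;
}

int main()
{
  /* Per-state shadow queued kernel: 0 = terminated, nonzero = active. */
  const std::vector<int> shadow_queued_kernel = {4, 0, 0, 4, 0, 5};
  const int num_states = (int)shadow_queued_kernel.size();
  const int num_active_paths = 3;

  /* Predicate of integrator_terminated_shadow_paths_array: free slots. */
  std::vector<int> terminated(num_states);
  const int num_terminated = active_index_sketch(
      num_states, terminated, [&](int state) { return shadow_queued_kernel[state] == 0; });

  /* Predicate of integrator_compact_shadow_paths_array: active slots that sit at or
   * above the target size and therefore need to be moved down. */
  std::vector<int> to_move(num_states);
  const int num_to_move = active_index_sketch(num_states, to_move, [&](int state) {
    return state >= num_active_paths && shadow_queued_kernel[state] != 0;
  });

  return (num_terminated == 3 && num_to_move == 2) ? 0 : 1;
}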
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index 6da41cddcf8..6e6b7f8a40f 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -265,6 +265,62 @@ ccl_device_inline void integrator_state_move(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 }
 
+ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg,
+                                                         ConstIntegratorShadowState to_state,
+                                                         ConstIntegratorShadowState state)
+{
+  int index;
+
+  /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+#  define KERNEL_STRUCT_BEGIN(name) \
+    index = 0; \
+    do {
+
+#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+    if (kernel_integrator_state.parent_struct.name != nullptr) { \
+      kernel_integrator_state.parent_struct.name[to_state] = \
+          kernel_integrator_state.parent_struct.name[state]; \
+    }
+
+#  define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+    if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+      kernel_integrator_state.parent_struct[index].name[to_state] = \
+          kernel_integrator_state.parent_struct[index].name[state]; \
+    }
+
+#  define KERNEL_STRUCT_END(name) \
+    } \
+    while (false) \
+      ;
+
+#  define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
+    ++index; \
+    } \
+    while (index < gpu_array_size) \
+      ;
+
+#  define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
+
+#  include "kernel/integrator/integrator_shadow_state_template.h"
+
+#  undef KERNEL_STRUCT_BEGIN
+#  undef KERNEL_STRUCT_MEMBER
+#  undef KERNEL_STRUCT_ARRAY_MEMBER
+#  undef KERNEL_STRUCT_END
+#  undef KERNEL_STRUCT_END_ARRAY
+#  undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+}
+
+ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,
+                                                    ConstIntegratorState to_state,
+                                                    ConstIntegratorState state)
+{
+  integrator_shadow_state_copy_only(kg, to_state, state);
+
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
 #endif
 
 /* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 4bdd8185ca6..5cbe2939dfc 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1458,6 +1458,9 @@ typedef enum DeviceKernel {
   DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+  DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES,
   DEVICE_KERNEL_INTEGRATOR_RESET,
   DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,
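Finally, the macro-driven integrator_shadow_state_copy_only() added in integrator_state_util.h above expands, per member of the shadow state template, into a guarded element copy between two SoA slots. The following hand-written sketch shows the rough shape of that expansion for a made-up SoA struct (ShadowPathSoA and its fields are not real Cycles names):

/* Rough picture of what the KERNEL_STRUCT_* expansion in
 * integrator_shadow_state_copy_only() boils down to: each per-member device
 * array copies one element from `state` to `to_state`, skipping arrays that
 * were never allocated. All names below are made up for the sketch. */

#include <cstddef>

struct ShadowPathSoA {
  /* One array per struct member, indexed by shadow state; may be null when the
   * corresponding feature is disabled. */
  int *queued_kernel = nullptr;
  float *throughput_x = nullptr;
  float *throughput_y = nullptr;
  float *throughput_z = nullptr;
};

static void shadow_state_copy_only_sketch(const ShadowPathSoA &soa, int to_state, int state)
{
  if (soa.queued_kernel != nullptr) {
    soa.queued_kernel[to_state] = soa.queued_kernel[state];
  }
  if (soa.throughput_x != nullptr) {
    soa.throughput_x[to_state] = soa.throughput_x[state];
  }
  if (soa.throughput_y != nullptr) {
    soa.throughput_y[to_state] = soa.throughput_y[state];
  }
  if (soa.throughput_z != nullptr) {
    soa.throughput_z[to_state] = soa.throughput_z[state];
  }
}

static void shadow_state_move_sketch(const ShadowPathSoA &soa, int to_state, int state)
{
  shadow_state_copy_only_sketch(soa, to_state, state);
  if (soa.queued_kernel != nullptr) {
    soa.queued_kernel[state] = 0; /* Mark the source slot as terminated. */
  }
}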