git.blender.org/blender.git
author     Brecht Van Lommel <brecht>               2021-10-21 16:14:30 +0300
committer  Brecht Van Lommel <brecht@blender.org>   2021-10-21 16:38:03 +0300
commit     df004637643241136a3294a63c7d4ca865cdea98 (patch)
tree       cfd103da7148e930b2b6bfdf1f4848824b3e6d64 /intern/cycles
parent     fd560ef2af6aef06e6dad00854bfdd3fd81a8d6f (diff)
Cycles: add shadow path compaction for GPU rendering
Similar to the main path compaction that happens before adding work tiles, this compacts shadow paths before launching kernels that may add shadow paths. Compaction is only done when more than 50% of the space is wasted.

It's not a clear win in all scenes; some are up to 1.5% slower, likely because the different order of scheduling kernels has an unpredictable performance impact. Still, compaction feels like the right thing to do, to avoid cases where a few shadow paths can hold up a lot of main paths.

Differential Revision: https://developer.blender.org/D12944
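
The compaction criterion described above can be restated as a small host-side check. The sketch below is a plain-C++ paraphrase, not code from this patch: the function name and its parameters are invented for illustration, while the 0.5 ratio and the 32-path minimum mirror the shadow_compact_ratio and min_compact_paths constants introduced in path_trace_work_gpu.cpp further down.

/* Hypothetical restatement of the criterion from the commit message: compact
 * the shadow state array only when more than half of the used slots are
 * wasted, and only once enough slots are in use for compaction to pay off. */
bool should_compact_shadow_paths(const int num_active_shadow_paths, /* queued intersect/shade shadow work */
                                 const int next_free_slot)          /* high-water mark of the shadow array */
{
  const float compact_ratio = 0.5f; /* mirrors shadow_compact_ratio */
  const int min_compact_paths = 32; /* mirrors min_compact_paths */

  if (num_active_shadow_paths == 0) {
    /* Nothing active; the patch instead resets the next-free index to zero here. */
    return false;
  }
  if (next_free_slot < min_compact_paths) {
    /* Too little space in use for the fixed cost of compaction to be worth it. */
    return false;
  }
  /* More than 50% wasted: active paths occupy less than half of the used range. */
  return num_active_shadow_paths < static_cast<int>(next_free_slot * compact_ratio);
}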
Diffstat (limited to 'intern/cycles')
-rw-r--r--  intern/cycles/device/cuda/queue.cpp                        2
-rw-r--r--  intern/cycles/device/device_kernel.cpp                     6
-rw-r--r--  intern/cycles/device/hip/queue.cpp                         2
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp          98
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h              8
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                   41
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state_util.h   56
-rw-r--r--  intern/cycles/kernel/kernel_types.h                         3
8 files changed, 186 insertions(+), 30 deletions(-)
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 6b2c9a40082..09352a84181 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -113,6 +113,8 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
/* See parall_active_index.h for why this amount of shared memory is needed. */
shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
break;
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index e0833331b77..1e282aac57e 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -64,6 +64,12 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
return "integrator_compact_states";
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+ return "integrator_terminated_shadow_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+ return "integrator_compact_shadow_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
+ return "integrator_compact_shadow_states";
case DEVICE_KERNEL_INTEGRATOR_RESET:
return "integrator_reset";
case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index a612f59fb32..0f053ccbeb5 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -113,6 +113,8 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
/* See parall_active_index.h for why this amount of shared memory is needed. */
shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
break;
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 2c71b1cf876..36f275e1075 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -361,26 +361,13 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return false;
}
- /* If the number of shadow kernels dropped to zero, set the next shadow path
- * index to zero as well.
- *
- * TODO: use shadow path compaction to lower it more often instead of letting
- * it fill up entirely? */
- const int num_queued_shadow =
- queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
- queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
- if (num_queued_shadow == 0) {
- if (integrator_next_shadow_path_index_.data()[0] != 0) {
- integrator_next_shadow_path_index_.data()[0] = 0;
- queue_->copy_to_device(integrator_next_shadow_path_index_);
- }
- }
-
/* For kernels that add shadow paths, check if there is enough space available.
* If not, schedule shadow kernels first to clear out the shadow paths. */
int num_paths_limit = INT_MAX;
if (kernel_creates_shadow_paths(kernel)) {
+ compact_shadow_paths();
+
const int available_shadow_paths = max_num_paths_ -
integrator_next_shadow_path_index_.data()[0];
if (available_shadow_paths < queue_counter->num_queued[kernel]) {
@@ -535,18 +522,76 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
queue_->enqueue(kernel, work_size, args);
}
-void PathTraceWorkGPU::compact_states(const int num_active_paths)
+void PathTraceWorkGPU::compact_main_paths(const int num_active_paths)
{
+ /* Early out if there is nothing that needs to be compacted. */
if (num_active_paths == 0) {
max_active_main_path_index_ = 0;
+ return;
}
- /* Compact fragmented path states into the start of the array, moving any paths
- * with index higher than the number of active paths into the gaps. */
- if (max_active_main_path_index_ == num_active_paths) {
+ const int min_compact_paths = 32;
+ if (max_active_main_path_index_ == num_active_paths ||
+ max_active_main_path_index_ < min_compact_paths) {
+ return;
+ }
+
+ /* Compact. */
+ compact_paths(num_active_paths,
+ max_active_main_path_index_,
+ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES);
+
+ /* Adjust max active path index now we know which part of the array is actually used. */
+ max_active_main_path_index_ = num_active_paths;
+}
+
+void PathTraceWorkGPU::compact_shadow_paths()
+{
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+ const int num_active_paths =
+ queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
+ queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
+
+ /* Early out if there is nothing that needs to be compacted. */
+ if (num_active_paths == 0) {
+ if (integrator_next_shadow_path_index_.data()[0] != 0) {
+ integrator_next_shadow_path_index_.data()[0] = 0;
+ queue_->copy_to_device(integrator_next_shadow_path_index_);
+ }
+ return;
+ }
+
+ /* Compact if we can reduce the space used by half. Not always since
+ * compaction has a cost. */
+ const float shadow_compact_ratio = 0.5f;
+ const int min_compact_paths = 32;
+ if (integrator_next_shadow_path_index_.data()[0] < num_active_paths * shadow_compact_ratio ||
+ integrator_next_shadow_path_index_.data()[0] < min_compact_paths) {
return;
}
+ /* Compact. */
+ compact_paths(num_active_paths,
+ integrator_next_shadow_path_index_.data()[0],
+ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES);
+
+ /* Adjust max active path index now we know which part of the array is actually used. */
+ integrator_next_shadow_path_index_.data()[0] = num_active_paths;
+ queue_->copy_to_device(integrator_next_shadow_path_index_);
+}
+
+void PathTraceWorkGPU::compact_paths(const int num_active_paths,
+ const int max_active_path_index,
+ DeviceKernel terminated_paths_kernel,
+ DeviceKernel compact_paths_kernel,
+ DeviceKernel compact_kernel)
+{
+ /* Compact fragmented path states into the start of the array, moving any paths
+ * with index higher than the number of active paths into the gaps. */
void *d_compact_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
@@ -557,17 +602,17 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
int work_size = num_active_paths;
void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
queue_->zero_to_device(num_queued_paths_);
- queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+ queue_->enqueue(terminated_paths_kernel, work_size, args);
}
/* Create array of paths that we need to compact, where the path index is bigger
* than the number of active paths. */
{
- int work_size = max_active_main_path_index_;
+ int work_size = max_active_path_index;
void *args[] = {
&work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
queue_->zero_to_device(num_queued_paths_);
- queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+ queue_->enqueue(compact_paths_kernel, work_size, args);
}
queue_->copy_from_device(num_queued_paths_);
@@ -582,13 +627,8 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
int terminated_states_offset = num_active_paths;
void *args[] = {
&d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
- queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+ queue_->enqueue(compact_kernel, work_size, args);
}
-
- queue_->synchronize();
-
- /* Adjust max active path index now we know which part of the array is actually used. */
- max_active_main_path_index_ = num_active_paths;
}
bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
@@ -669,7 +709,7 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
/* Compact state array when number of paths becomes small relative to the
* known maximum path index, which makes computing active index arrays slow. */
- compact_states(num_active_paths);
+ compact_main_paths(num_active_paths);
if (has_shadow_catcher()) {
integrator_next_main_path_index_.data()[0] = num_paths;
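
To make the scheduling change in enqueue_path_iteration() easier to follow, here is a hedged host-side sketch of the flow after this patch. ShadowPool, paths_limit_for_kernel and the other names are invented and do not exist in Cycles; only the order of operations mirrors the hunk above: compact first, then bound the launch by the shadow-path space that is actually left.

#include <algorithm>
#include <climits>

/* Invented stand-in for the shadow portion of the integrator state. */
struct ShadowPool {
  int max_num_paths = 0;   /* capacity of the shadow state array */
  int next_free_index = 0; /* analogous to integrator_next_shadow_path_index_ */
};

/* Returns how many main paths a kernel that spawns shadow paths may process
 * this launch, after compaction has reclaimed terminated shadow slots. */
int paths_limit_for_kernel(const ShadowPool &pool, const int num_queued_for_kernel)
{
  /* 1. compact_shadow_paths() runs first in the patch (omitted in this sketch). */

  /* 2. See how many new shadow paths can still be allocated. */
  const int available_shadow_paths = pool.max_num_paths - pool.next_free_index;

  /* 3. If the queued work could overflow the shadow array, limit the number of
   *    paths; per the comment in the hunk, the patch also schedules the shadow
   *    kernels first in that case to clear out shadow paths. */
  if (available_shadow_paths < num_queued_for_kernel) {
    return std::max(available_shadow_paths, 0);
  }
  return INT_MAX;
}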
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index e16c491695b..8734d2c2852 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -86,7 +86,13 @@ class PathTraceWorkGPU : public PathTraceWork {
DeviceKernel queued_kernel,
const int num_paths_limit);
- void compact_states(const int num_active_paths);
+ void compact_main_paths(const int num_active_paths);
+ void compact_shadow_paths();
+ void compact_paths(const int num_active_paths,
+ const int max_active_path_index,
+ DeviceKernel terminated_paths_kernel,
+ DeviceKernel compact_paths_kernel,
+ DeviceKernel compact_kernel);
int num_active_main_paths_paths();
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index fcb398f7e6d..eeac09d4b29 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
});
}
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+ });
+}
+
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
kernel_gpu_integrator_sorted_paths_array(int num_states,
int num_states_limit,
@@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
}
}
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) &&
+ (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_shadow_state_move(NULL, to_state, from_state);
+ }
+}
+
extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
{
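
As a reading aid, the following is a hypothetical serial reference of what the three new shadow kernels compute together. It operates on a plain vector of queued_kernel flags instead of the real SoA integrator state, and every name in it is illustrative.

#include <cstddef>
#include <vector>

/* Serial reference: queued_kernel[i] == 0 marks a terminated (free) shadow slot. */
void compact_shadow_states_reference(std::vector<int> &queued_kernel, const int num_active_paths)
{
  /* integrator_terminated_shadow_paths_array: free slots below num_active_paths. */
  std::vector<int> holes;
  for (int state = 0; state < num_active_paths; ++state) {
    if (queued_kernel[state] == 0) {
      holes.push_back(state);
    }
  }

  /* integrator_compact_shadow_paths_array: live states at or above num_active_paths. */
  std::vector<int> movable;
  for (int state = num_active_paths; state < static_cast<int>(queued_kernel.size()); ++state) {
    if (queued_kernel[state] != 0) {
      movable.push_back(state);
    }
  }

  /* integrator_compact_shadow_states: pair them up and move, so that afterwards
   * all live shadow states fit inside [0, num_active_paths). */
  for (size_t i = 0; i < movable.size() && i < holes.size(); ++i) {
    const int from_state = movable[i];
    const int to_state = holes[i];
    queued_kernel[to_state] = queued_kernel[from_state]; /* the real move copies every SoA field */
    queued_kernel[from_state] = 0;                       /* and clears queued_kernel on the source */
  }
}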
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index 6da41cddcf8..6e6b7f8a40f 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -265,6 +265,62 @@ ccl_device_inline void integrator_state_move(KernelGlobals kg,
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
}
+ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg,
+ ConstIntegratorShadowState to_state,
+ ConstIntegratorShadowState state)
+{
+ int index;
+
+ /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+# define KERNEL_STRUCT_BEGIN(name) \
+ index = 0; \
+ do {
+
+# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct.name != nullptr) { \
+ kernel_integrator_state.parent_struct.name[to_state] = \
+ kernel_integrator_state.parent_struct.name[state]; \
+ }
+
+# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+ kernel_integrator_state.parent_struct[index].name[to_state] = \
+ kernel_integrator_state.parent_struct[index].name[state]; \
+ }
+
+# define KERNEL_STRUCT_END(name) \
+ } \
+ while (false) \
+ ;
+
+# define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
+ ++index; \
+ } \
+ while (index < gpu_array_size) \
+ ;
+
+# define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
+
+# include "kernel/integrator/integrator_shadow_state_template.h"
+
+# undef KERNEL_STRUCT_BEGIN
+# undef KERNEL_STRUCT_MEMBER
+# undef KERNEL_STRUCT_ARRAY_MEMBER
+# undef KERNEL_STRUCT_END
+# undef KERNEL_STRUCT_END_ARRAY
+# undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+}
+
+ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,
+ ConstIntegratorState to_state,
+ ConstIntegratorState state)
+{
+ integrator_shadow_state_copy_only(kg, to_state, state);
+
+ INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
#endif
/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
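
The macro block added above expands the shadow state template into one per-field copy. A hypothetical, hand-written expansion for a two-field SoA is sketched below; ExampleShadowStateSOA and its members are invented and only illustrate the pattern the macros generate.

/* Invented two-field SoA standing in for the generated shadow state arrays. */
struct ExampleShadowStateSOA {
  float *throughput = nullptr; /* stands in for any shadow_path member */
  int *queued_kernel = nullptr;
};

inline void example_shadow_state_move(ExampleShadowStateSOA &soa, const int to_state, const int state)
{
  /* copy_only part: each non-null field array copies element `state` into `to_state`. */
  if (soa.throughput != nullptr) {
    soa.throughput[to_state] = soa.throughput[state];
  }
  if (soa.queued_kernel != nullptr) {
    soa.queued_kernel[to_state] = soa.queued_kernel[state];
  }
  /* move part: the source slot is marked free by clearing its queued kernel. */
  if (soa.queued_kernel != nullptr) {
    soa.queued_kernel[state] = 0;
  }
}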
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 4bdd8185ca6..5cbe2939dfc 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1458,6 +1458,9 @@ typedef enum DeviceKernel {
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES,
DEVICE_KERNEL_INTEGRATOR_RESET,
DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,