Cycles: add shadow path compaction for GPU rendering

Similar to main path compaction that happens before adding work tiles, this compacts shadow paths before launching kernels that may add shadow paths. Only do it when more than 50% of space is wasted. It's not a clear win in all scenes, some are up to 1.5% slower. Likely caused by different order of scheduling kernels having an unpredictable performance impact. Still feels like compaction is just the right thing to avoid cases where a few shadow paths can hold up a lot of main paths. Differential Revision: https://developer.blender.org/D12944
author: Brecht Van Lommel <brecht> 2021-10-21 16:14:30 +0300
committer: Brecht Van Lommel <brecht@blender.org> 2021-10-21 16:38:03 +0300
commit: df004637643241136a3294a63c7d4ca865cdea98 (patch)
tree: cfd103da7148e930b2b6bfdf1f4848824b3e6d64 /intern/cycles/kernel
parent: fd560ef2af6aef06e6dad00854bfdd3fd81a8d6f (diff)
3 files changed, 100 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index fcb398f7e6d..eeac09d4b29 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
       });
 }
 
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
+                                                        int *indices,
+                                                        int *num_indices,
+                                                        int indices_offset)
+{
+  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+      num_states, indices + indices_offset, num_indices, [](const int state) {
+        return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+      });
+}
+
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
     kernel_gpu_integrator_sorted_paths_array(int num_states,
                                              int num_states_limit,
@@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
   }
 }
 
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
+                                                     int *indices,
+                                                     int *num_indices,
+                                                     int num_active_paths)
+{
+  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+      num_states, indices, num_indices, [num_active_paths](const int state) {
+        return (state >= num_active_paths) &&
+               (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
+      });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
+                                                const int active_states_offset,
+                                                const int terminated_states_offset,
+                                                const int work_size)
+{
+  const int global_index = ccl_gpu_global_id_x();
+
+  if (global_index < work_size) {
+    const int from_state = active_terminated_states[active_states_offset + global_index];
+    const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+    integrator_shadow_state_move(NULL, to_state, from_state);
+  }
+}
+
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
     kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
 {
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index 6da41cddcf8..6e6b7f8a40f 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -265,6 +265,62 @@ ccl_device_inline void integrator_state_move(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 }
 
+ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg,
+                                                         ConstIntegratorShadowState to_state,
+                                                         ConstIntegratorShadowState state)
+{
+  int index;
+
+  /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+#  define KERNEL_STRUCT_BEGIN(name) \
+    index = 0; \
+    do {
+
+#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+    if (kernel_integrator_state.parent_struct.name != nullptr) { \
+      kernel_integrator_state.parent_struct.name[to_state] = \
+          kernel_integrator_state.parent_struct.name[state]; \
+    }
+
+#  define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+    if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+      kernel_integrator_state.parent_struct[index].name[to_state] = \
+          kernel_integrator_state.parent_struct[index].name[state]; \
+    }
+
+#  define KERNEL_STRUCT_END(name) \
+    } \
+    while (false) \
+      ;
+
+#  define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
+    ++index; \
+    } \
+    while (index < gpu_array_size) \
+      ;
+
+#  define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
+
+#  include "kernel/integrator/integrator_shadow_state_template.h"
+
+#  undef KERNEL_STRUCT_BEGIN
+#  undef KERNEL_STRUCT_MEMBER
+#  undef KERNEL_STRUCT_ARRAY_MEMBER
+#  undef KERNEL_STRUCT_END
+#  undef KERNEL_STRUCT_END_ARRAY
+#  undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+}
+
+ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,
+                                                    ConstIntegratorState to_state,
+                                                    ConstIntegratorState state)
+{
+  integrator_shadow_state_copy_only(kg, to_state, state);
+
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
 #endif
 
 /* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 4bdd8185ca6..5cbe2939dfc 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1458,6 +1458,9 @@ typedef enum DeviceKernel {
   DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+  DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES,
   DEVICE_KERNEL_INTEGRATOR_RESET,
   DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,
author	Brecht Van Lommel <brecht>	2021-10-21 16:14:30 +0300
committer	Brecht Van Lommel <brecht@blender.org>	2021-10-21 16:38:03 +0300
commit	df004637643241136a3294a63c7d4ca865cdea98 (patch)
tree	cfd103da7148e930b2b6bfdf1f4848824b3e6d64 /intern/cycles/kernel
parent	fd560ef2af6aef06e6dad00854bfdd3fd81a8d6f (diff)