diff options
author | Brecht Van Lommel <brecht> | 2021-10-21 16:14:30 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2021-10-21 16:38:03 +0300 |
commit | df004637643241136a3294a63c7d4ca865cdea98 (patch) | |
tree | cfd103da7148e930b2b6bfdf1f4848824b3e6d64 /intern/cycles/kernel | |
parent | fd560ef2af6aef06e6dad00854bfdd3fd81a8d6f (diff) |
Cycles: add shadow path compaction for GPU rendering
Similar to main path compaction that happens before adding work tiles, this
compacts shadow paths before launching kernels that may add shadow paths.
Only do it when more than 50% of space is wasted.
It's not a clear win in all scenes, some are up to 1.5% slower. Likely caused
by different order of scheduling kernels having an unpredictable performance
impact. Still feels like compaction is just the right thing to avoid cases
where a few shadow paths can hold up a lot of main paths.
Differential Revision: https://developer.blender.org/D12944
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r-- | intern/cycles/kernel/device/gpu/kernel.h | 41 | ||||
-rw-r--r-- | intern/cycles/kernel/integrator/integrator_state_util.h | 56 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_types.h | 3 |
3 files changed, 100 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index fcb398f7e6d..eeac09d4b29 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B }); } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + }); +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) kernel_gpu_integrator_sorted_paths_array(int num_states, int num_states_limit, @@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B } } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_shadow_state_move(NULL, to_state, from_state); + } +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) { diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h index 6da41cddcf8..6e6b7f8a40f 100644 --- a/intern/cycles/kernel/integrator/integrator_state_util.h +++ b/intern/cycles/kernel/integrator/integrator_state_util.h @@ -265,6 +265,62 @@ ccl_device_inline void integrator_state_move(KernelGlobals kg, INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; } +ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg, + ConstIntegratorShadowState to_state, + ConstIntegratorShadowState state) +{ + int index; + + /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */ + +# define KERNEL_STRUCT_BEGIN(name) \ + index = 0; \ + do { + +# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \ + if (kernel_integrator_state.parent_struct.name != nullptr) { \ + kernel_integrator_state.parent_struct.name[to_state] = \ + kernel_integrator_state.parent_struct.name[state]; \ + } + +# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \ + if (kernel_integrator_state.parent_struct[index].name != nullptr) { \ + kernel_integrator_state.parent_struct[index].name[to_state] = \ + kernel_integrator_state.parent_struct[index].name[state]; \ + } + +# define KERNEL_STRUCT_END(name) \ + } \ + while (false) \ + ; + +# define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \ + ++index; \ + } \ + while (index < gpu_array_size) \ + ; + +# define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size + +# include "kernel/integrator/integrator_shadow_state_template.h" + +# undef KERNEL_STRUCT_BEGIN +# undef KERNEL_STRUCT_MEMBER +# undef KERNEL_STRUCT_ARRAY_MEMBER +# undef KERNEL_STRUCT_END +# undef KERNEL_STRUCT_END_ARRAY +# undef KERNEL_STRUCT_VOLUME_STACK_SIZE +} + +ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg, + ConstIntegratorState to_state, + ConstIntegratorState state) +{ + integrator_shadow_state_copy_only(kg, to_state, state); + + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; +} + #endif /* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 4bdd8185ca6..5cbe2939dfc 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -1458,6 +1458,9 @@ typedef enum DeviceKernel { DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, + DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY, + DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES, DEVICE_KERNEL_INTEGRATOR_RESET, DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, |