diff options
author | Brecht Van Lommel <brecht> | 2021-10-21 16:14:30 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2021-10-21 16:38:03 +0300 |
commit | df004637643241136a3294a63c7d4ca865cdea98 (patch) | |
tree | cfd103da7148e930b2b6bfdf1f4848824b3e6d64 /intern/cycles/kernel/device | |
parent | fd560ef2af6aef06e6dad00854bfdd3fd81a8d6f (diff) |
Cycles: add shadow path compaction for GPU rendering
Similar to main path compaction that happens before adding work tiles, this
compacts shadow paths before launching kernels that may add shadow paths.
Only do it when more than 50% of space is wasted.
It's not a clear win in all scenes, some are up to 1.5% slower. Likely caused
by different order of scheduling kernels having an unpredictable performance
impact. Still feels like compaction is just the right thing to avoid cases
where a few shadow paths can hold up a lot of main paths.
Differential Revision: https://developer.blender.org/D12944
Diffstat (limited to 'intern/cycles/kernel/device')
-rw-r--r-- | intern/cycles/kernel/device/gpu/kernel.h | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index fcb398f7e6d..eeac09d4b29 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B }); } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + }); +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) kernel_gpu_integrator_sorted_paths_array(int num_states, int num_states_limit, @@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B } } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_shadow_state_move(NULL, to_state, from_state); + } +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) { |