From 3065d2609700d14100490a16c91152a6e71790e8 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 20:43:06 +0200 Subject: Cycles: optimize volume stack copying for shadow catcher/compaction Only copy the number of items used instead of the max items. Ref D12889 --- intern/cycles/kernel/device/gpu/kernel.h | 2 +- .../kernel/integrator/integrator_state_util.h | 56 +++++++++++++++------- 2 files changed, 40 insertions(+), 18 deletions(-) (limited to 'intern/cycles/kernel') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 56beaf1fd91..b5ecab2a4db 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -321,7 +321,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(to_state, from_state); + integrator_state_move(NULL, to_state, from_state); } } diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h index fee59e451d9..bb372f9e984 100644 --- a/intern/cycles/kernel/integrator/integrator_state_util.h +++ b/intern/cycles/kernel/integrator/integrator_state_util.h @@ -173,6 +173,25 @@ ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(KernelG } } +ccl_device_forceinline void integrator_state_copy_volume_stack(KernelGlobals kg, + IntegratorState to_state, + ConstIntegratorState state) +{ + if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) { + int index = 0; + int shader; + do { + shader = INTEGRATOR_STATE_ARRAY(state, volume_stack, index, shader); + + INTEGRATOR_STATE_ARRAY_WRITE(to_state, volume_stack, index, object) = INTEGRATOR_STATE_ARRAY( + state, volume_stack, index, object); + INTEGRATOR_STATE_ARRAY_WRITE(to_state, volume_stack, index, shader) = shader; + + ++index; + } while (shader != OBJECT_NONE); + } +} + ccl_device_forceinline VolumeStack integrator_state_read_shadow_volume_stack(ConstIntegratorState state, int i) { @@ -198,8 +217,9 @@ ccl_device_forceinline void integrator_state_write_shadow_volume_stack(Integrato } #if defined(__KERNEL_GPU__) -ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state, - const IntegratorState state) +ccl_device_inline void integrator_state_copy_only(KernelGlobals kg, + ConstIntegratorState to_state, + ConstIntegratorState state) { int index; @@ -232,7 +252,8 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state while (index < gpu_array_size) \ ; -# define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size +/* Don't copy volume stack here, do it after with just the number of items needed. */ +# define KERNEL_STRUCT_VOLUME_STACK_SIZE 0 # include "kernel/integrator/integrator_state_template.h" @@ -242,12 +263,15 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state # undef KERNEL_STRUCT_END # undef KERNEL_STRUCT_END_ARRAY # undef KERNEL_STRUCT_VOLUME_STACK_SIZE + + integrator_state_copy_volume_stack(kg, to_state, state); } -ccl_device_inline void integrator_state_move(const IntegratorState to_state, - const IntegratorState state) +ccl_device_inline void integrator_state_move(KernelGlobals kg, + ConstIntegratorState to_state, + ConstIntegratorState state) { - integrator_state_copy_only(to_state, state); + integrator_state_copy_only(kg, to_state, state); INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; @@ -264,22 +288,20 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg, const IntegratorState to_state = atomic_fetch_and_add_uint32( &kernel_integrator_state.next_shadow_catcher_path_index[0], 1); - integrator_state_copy_only(to_state, state); - - kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS; + integrator_state_copy_only(kg, to_state, state); #else - IntegratorStateCPU *ccl_restrict split_state = state + 1; + IntegratorStateCPU *ccl_restrict to_state = state + 1; /* Only copy the required subset, since shadow intersections are big and irrelevant here. */ - split_state->path = state->path; - split_state->ray = state->ray; - split_state->isect = state->isect; - memcpy(split_state->volume_stack, state->volume_stack, sizeof(state->volume_stack)); - split_state->shadow_path = state->shadow_path; - - split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS; + to_state->path = state->path; + to_state->ray = state->ray; + to_state->isect = state->isect; + integrator_state_copy_volume_stack(kg, to_state, state); + to_state->shadow_path = state->shadow_path; #endif + + INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS; } CCL_NAMESPACE_END -- cgit v1.2.3