diff options
author | Brecht Van Lommel <brecht@blender.org> | 2021-10-17 21:43:06 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2021-10-18 20:02:10 +0300 |
commit | 3065d2609700d14100490a16c91152a6e71790e8 (patch) | |
tree | 66b248fd07c84064ee1907cb6890d2b4d76d71a2 /intern | |
parent | a184d0dd023cc0b6fee5e02510addb91d66f8e01 (diff) |
Cycles: optimize volume stack copying for shadow catcher/compaction
Only copy the number of items used instead of the max items.
Ref D12889
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/kernel/device/gpu/kernel.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/integrator/integrator_state_util.h | 56 |
2 files changed, 40 insertions, 18 deletions
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 56beaf1fd91..b5ecab2a4db 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -321,7 +321,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(to_state, from_state); + integrator_state_move(NULL, to_state, from_state); } } diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h index fee59e451d9..bb372f9e984 100644 --- a/intern/cycles/kernel/integrator/integrator_state_util.h +++ b/intern/cycles/kernel/integrator/integrator_state_util.h @@ -173,6 +173,25 @@ ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(KernelG } } +ccl_device_forceinline void integrator_state_copy_volume_stack(KernelGlobals kg, + IntegratorState to_state, + ConstIntegratorState state) +{ + if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) { + int index = 0; + int shader; + do { + shader = INTEGRATOR_STATE_ARRAY(state, volume_stack, index, shader); + + INTEGRATOR_STATE_ARRAY_WRITE(to_state, volume_stack, index, object) = INTEGRATOR_STATE_ARRAY( + state, volume_stack, index, object); + INTEGRATOR_STATE_ARRAY_WRITE(to_state, volume_stack, index, shader) = shader; + + ++index; + } while (shader != OBJECT_NONE); + } +} + ccl_device_forceinline VolumeStack integrator_state_read_shadow_volume_stack(ConstIntegratorState state, int i) { @@ -198,8 +217,9 @@ ccl_device_forceinline void integrator_state_write_shadow_volume_stack(Integrato } #if defined(__KERNEL_GPU__) -ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state, - const IntegratorState state) +ccl_device_inline void integrator_state_copy_only(KernelGlobals kg, + ConstIntegratorState to_state, + ConstIntegratorState state) { int index; @@ -232,7 +252,8 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state while (index < gpu_array_size) \ ; -# define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size +/* Don't copy volume stack here, do it after with just the number of items needed. */ +# define KERNEL_STRUCT_VOLUME_STACK_SIZE 0 # include "kernel/integrator/integrator_state_template.h" @@ -242,12 +263,15 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state # undef KERNEL_STRUCT_END # undef KERNEL_STRUCT_END_ARRAY # undef KERNEL_STRUCT_VOLUME_STACK_SIZE + + integrator_state_copy_volume_stack(kg, to_state, state); } -ccl_device_inline void integrator_state_move(const IntegratorState to_state, - const IntegratorState state) +ccl_device_inline void integrator_state_move(KernelGlobals kg, + ConstIntegratorState to_state, + ConstIntegratorState state) { - integrator_state_copy_only(to_state, state); + integrator_state_copy_only(kg, to_state, state); INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; @@ -264,22 +288,20 @@ ccl_device_inline void integrator_state_shadow_catcher_split(KernelGlobals kg, const IntegratorState to_state = atomic_fetch_and_add_uint32( &kernel_integrator_state.next_shadow_catcher_path_index[0], 1); - integrator_state_copy_only(to_state, state); - - kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS; + integrator_state_copy_only(kg, to_state, state); #else - IntegratorStateCPU *ccl_restrict split_state = state + 1; + IntegratorStateCPU *ccl_restrict to_state = state + 1; /* Only copy the required subset, since shadow intersections are big and irrelevant here. */ - split_state->path = state->path; - split_state->ray = state->ray; - split_state->isect = state->isect; - memcpy(split_state->volume_stack, state->volume_stack, sizeof(state->volume_stack)); - split_state->shadow_path = state->shadow_path; - - split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS; + to_state->path = state->path; + to_state->ray = state->ray; + to_state->isect = state->isect; + integrator_state_copy_volume_stack(kg, to_state, state); + to_state->shadow_path = state->shadow_path; #endif + + INTEGRATOR_STATE_WRITE(to_state, path, flag) |= PATH_RAY_SHADOW_CATCHER_PASS; } CCL_NAMESPACE_END |