Diffstat (limited to 'intern/cycles/kernel/integrator/state_flow.h')
-rw-r--r-- | intern/cycles/kernel/integrator/state_flow.h | 273
1 file changed, 172 insertions(+), 101 deletions(-)
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index fed74d49434..4b03c665e17 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -10,125 +10,196 @@ CCL_NAMESPACE_BEGIN
 /* Control Flow
  *
- * Utilities for control flow between kernels. The implementation may differ per device
- * or even be handled on the host side. To abstract such differences, experiment with
- * different implementations and for debugging, this is abstracted using macros.
+ * Utilities for control flow between kernels. The implementation differs between CPU and
+ * GPU devices. For the latter, part of the logic is handled on the host side with wavefronts.
  *
  * There is a main path for regular path tracing of camera rays. Shadows for next
  * event estimation branch off from this into their own path, which may be computed in
- * parallel while the main path continues.
+ * parallel while the main path continues. Additionally, shading kernels are sorted using
+ * a key for coherence.
  *
  * Each kernel on the main path must call one of these functions. These may not be called
  * multiple times from the same kernel.
  *
- * INTEGRATOR_PATH_INIT(next_kernel)
- * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
- * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ * integrator_path_init(kg, state, next_kernel)
+ * integrator_path_next(kg, state, current_kernel, next_kernel)
+ * integrator_path_terminate(kg, state, current_kernel)
  *
  * For the shadow path, similar functions are used, and again each shadow kernel must call
  * one of them, and only once.
  */
 
-#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(state, path, queued_kernel) == 0)
-#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED \
-  (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0)
+ccl_device_forceinline bool integrator_path_is_terminated(ConstIntegratorState state)
+{
+  return INTEGRATOR_STATE(state, path, queued_kernel) == 0;
+}
+
+ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegratorShadowState state)
+{
+  return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0;
+}
 
 #ifdef __KERNEL_GPU__
 
-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
-        &kernel_integrator_state.next_shadow_path_index[0], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
-
-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_sub_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32(
+      &kernel_integrator_state.next_shadow_path_index[0], 1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
+/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
+#  define INTEGRATOR_SORT_KEY(key, state) \
+    (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
+
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}
 
 #else
 
-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-    }
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-      (void)current_kernel; \
-    }
-
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = &state->shadow_type; \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+}
+
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+  (void)current_kernel;
+}
 
 #endif
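
The queue bookkeeping that these functions perform on the GPU can be modeled in isolation. The sketch below is not Cycles code: QueueCounter, PathState, and the kernel ids are hypothetical stand-ins, and only the num_queued[] / queued_kernel protocol from the diff is reproduced. A path is created with integrator_path_init, migrates between per-kernel queues with integrator_path_next, and counts as terminated once queued_kernel is 0.

// Standalone model of the main-path state machine above; all types and
// kernel ids here are hypothetical, only the counter protocol is mirrored.
#include <atomic>
#include <cassert>
#include <cstdint>

enum DeviceKernel : uint32_t { KERNEL_NONE = 0, KERNEL_INTERSECT, KERNEL_SHADE, NUM_KERNELS };

struct QueueCounter {
  std::atomic<uint32_t> num_queued[NUM_KERNELS];
  QueueCounter() {
    for (auto &q : num_queued) q.store(0);
  }
};

struct PathState {
  uint32_t queued_kernel = KERNEL_NONE;
};

// Analogue of integrator_path_init: queue the first kernel of a new path.
void path_init(QueueCounter &qc, PathState &state, DeviceKernel next) {
  qc.num_queued[next].fetch_add(1);
  state.queued_kernel = next;
}

// Analogue of integrator_path_next: move the path from one queue to another.
void path_next(QueueCounter &qc, PathState &state, DeviceKernel current, DeviceKernel next) {
  qc.num_queued[current].fetch_sub(1);
  qc.num_queued[next].fetch_add(1);
  state.queued_kernel = next;
}

// Analogue of integrator_path_terminate: queued_kernel == 0 marks the path dead.
void path_terminate(QueueCounter &qc, PathState &state, DeviceKernel current) {
  qc.num_queued[current].fetch_sub(1);
  state.queued_kernel = KERNEL_NONE;
}

int main() {
  QueueCounter qc;
  PathState state;
  path_init(qc, state, KERNEL_INTERSECT);
  path_next(qc, state, KERNEL_INTERSECT, KERNEL_SHADE);
  path_terminate(qc, state, KERNEL_SHADE);
  assert(qc.num_queued[KERNEL_INTERSECT] == 0 && qc.num_queued[KERNEL_SHADE] == 0);
  assert(state.queued_kernel == KERNEL_NONE);  // integrator_path_is_terminated
}

The host side reads the num_queued[] counters between launches to decide which wavefront kernel to schedule next, which is the "handled on the host side with wavefronts" part of the updated header comment.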
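The two integrator_shadow_path_init variants also differ in where the shadow state lives. On the GPU a fresh slot is claimed from the global next_shadow_path_index counter, so the shadow path becomes an independent queued work item; on the CPU the state is simply the ao or shadow sub-state embedded in the main path state. A minimal sketch of the GPU-style slot allocation follows, with hypothetical names (ShadowPool, MAX_SHADOW_PATHS are not Cycles identifiers):

// Sketch of claiming an independent shadow-path slot, GPU style.
// A single atomic counter hands out indices into a preallocated array,
// so many threads can branch off shadow paths without locking.
#include <atomic>
#include <cassert>
#include <cstdint>

constexpr uint32_t MAX_SHADOW_PATHS = 1024;  // hypothetical capacity

struct ShadowPathState {
  uint32_t queued_kernel = 0;
};

struct ShadowPool {
  std::atomic<uint32_t> next_index{0};
  ShadowPathState paths[MAX_SHADOW_PATHS];

  ShadowPathState *claim(uint32_t next_kernel) {
    const uint32_t i = next_index.fetch_add(1);
    assert(i < MAX_SHADOW_PATHS);  // the real state is sized from queue limits
    paths[i].queued_kernel = next_kernel;
    return &paths[i];
  }
};

int main() {
  ShadowPool pool;
  ShadowPathState *s = pool.claim(/*next_kernel=*/7);
  assert(s == &pool.paths[0] && s->queued_kernel == 7);
}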
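INTEGRATOR_SORT_KEY is worth a worked example: it buckets paths first by a truncated state index and then by shader key, so sorting by bucket groups paths that are both close together in memory and about to run the same shader. The values of max_shaders and sort_partition_divisor below are made up for illustration only:

// Worked example of the sort-key layout:
//   bucket = key + max_shaders * (state / sort_partition_divisor)
// States in the same partition running the same shader share a bucket.
#include <cstdio>

int main() {
  const int max_shaders = 4;               // hypothetical shader count
  const int sort_partition_divisor = 256;  // hypothetical partition size
  const int examples[][2] = {
      {10, 2}, {11, 2},  // neighboring states, same shader -> same bucket (2)
      {300, 2},          // same shader, next partition     -> bucket 6
      {12, 3},           // same partition, other shader    -> bucket 3
  };
  for (const auto &e : examples) {
    const int state = e[0], key = e[1];
    const int bucket = key + max_shaders * (state / sort_partition_divisor);
    std::printf("state %3d shader %d -> bucket %d\n", state, key, bucket);
  }
}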