-rw-r--r--  intern/cycles/device/cuda/queue.cpp                      |  2
-rw-r--r--  intern/cycles/device/device_kernel.cpp                   |  6
-rw-r--r--  intern/cycles/device/hip/queue.cpp                       |  2
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp         | 98
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h           |  8
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 | 41
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state_util.h  | 56
-rw-r--r--  intern/cycles/kernel/kernel_types.h                      |  3
8 files changed, 186 insertions, 30 deletions
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 6b2c9a40082..09352a84181 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -113,6 +113,8 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
       /* See parall_active_index.h for why this amount of shared memory is needed. */
       shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
       break;
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index e0833331b77..1e282aac57e 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -64,6 +64,12 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "integrator_compact_paths_array";
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
       return "integrator_compact_states";
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+      return "integrator_terminated_shadow_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+      return "integrator_compact_shadow_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
+      return "integrator_compact_shadow_states";
     case DEVICE_KERNEL_INTEGRATOR_RESET:
       return "integrator_reset";
     case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index a612f59fb32..0f053ccbeb5 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -113,6 +113,8 @@ bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *arg
     case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
       /* See parall_active_index.h for why this amount of shared memory is needed. */
       shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
       break;
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 2c71b1cf876..36f275e1075 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -361,26 +361,13 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
     return false;
   }
 
-  /* If the number of shadow kernels dropped to zero, set the next shadow path
-   * index to zero as well.
-   *
-   * TODO: use shadow path compaction to lower it more often instead of letting
-   * it fill up entirely? */
-  const int num_queued_shadow =
-      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
-      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
-  if (num_queued_shadow == 0) {
-    if (integrator_next_shadow_path_index_.data()[0] != 0) {
-      integrator_next_shadow_path_index_.data()[0] = 0;
-      queue_->copy_to_device(integrator_next_shadow_path_index_);
-    }
-  }
-
   /* For kernels that add shadow paths, check if there is enough space available.
    * If not, schedule shadow kernels first to clear out the shadow paths. */
   int num_paths_limit = INT_MAX;
 
   if (kernel_creates_shadow_paths(kernel)) {
+    compact_shadow_paths();
+
     const int available_shadow_paths = max_num_paths_ -
                                        integrator_next_shadow_path_index_.data()[0];
     if (available_shadow_paths < queue_counter->num_queued[kernel]) {
@@ -535,18 +522,76 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel qu
   queue_->enqueue(kernel, work_size, args);
 }
 
-void PathTraceWorkGPU::compact_states(const int num_active_paths)
+void PathTraceWorkGPU::compact_main_paths(const int num_active_paths)
 {
+  /* Early out if there is nothing that needs to be compacted. */
   if (num_active_paths == 0) {
     max_active_main_path_index_ = 0;
+    return;
   }
 
-  /* Compact fragmented path states into the start of the array, moving any paths
-   * with index higher than the number of active paths into the gaps. */
-  if (max_active_main_path_index_ == num_active_paths) {
+  const int min_compact_paths = 32;
+  if (max_active_main_path_index_ == num_active_paths ||
+      max_active_main_path_index_ < min_compact_paths) {
+    return;
+  }
+
+  /* Compact. */
+  compact_paths(num_active_paths,
+                max_active_main_path_index_,
+                DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES);
+
+  /* Adjust max active path index now we know which part of the array is actually used. */
+  max_active_main_path_index_ = num_active_paths;
+}
+
+void PathTraceWorkGPU::compact_shadow_paths()
+{
+  IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+  const int num_active_paths =
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] +
+      queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW];
+
+  /* Early out if there is nothing that needs to be compacted. */
+  if (num_active_paths == 0) {
+    if (integrator_next_shadow_path_index_.data()[0] != 0) {
+      integrator_next_shadow_path_index_.data()[0] = 0;
+      queue_->copy_to_device(integrator_next_shadow_path_index_);
+    }
+    return;
+  }
+
+  /* Compact if we can reduce the space used by half. Not always since
+   * compaction has a cost. */
+  const float shadow_compact_ratio = 0.5f;
+  const int min_compact_paths = 32;
+  if (integrator_next_shadow_path_index_.data()[0] < num_active_paths * shadow_compact_ratio ||
+      integrator_next_shadow_path_index_.data()[0] < min_compact_paths) {
     return;
   }
 
+  /* Compact. */
+  compact_paths(num_active_paths,
+                integrator_next_shadow_path_index_.data()[0],
+                DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+                DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES);
+
+  /* Adjust max active path index now we know which part of the array is actually used. */
+  integrator_next_shadow_path_index_.data()[0] = num_active_paths;
+  queue_->copy_to_device(integrator_next_shadow_path_index_);
+}
+
+void PathTraceWorkGPU::compact_paths(const int num_active_paths,
+                                     const int max_active_path_index,
+                                     DeviceKernel terminated_paths_kernel,
+                                     DeviceKernel compact_paths_kernel,
+                                     DeviceKernel compact_kernel)
+{
+  /* Compact fragmented path states into the start of the array, moving any paths
+   * with index higher than the number of active paths into the gaps. */
   void *d_compact_paths = (void *)queued_paths_.device_pointer;
   void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
 
@@ -557,17 +602,17 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
     int work_size = num_active_paths;
     void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
     queue_->zero_to_device(num_queued_paths_);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+    queue_->enqueue(terminated_paths_kernel, work_size, args);
   }
 
   /* Create array of paths that we need to compact, where the path index is bigger
    * than the number of active paths. */
   {
-    int work_size = max_active_main_path_index_;
+    int work_size = max_active_path_index;
     void *args[] = {
         &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
     queue_->zero_to_device(num_queued_paths_);
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+    queue_->enqueue(compact_paths_kernel, work_size, args);
   }
 
   queue_->copy_from_device(num_queued_paths_);
@@ -582,13 +627,8 @@ void PathTraceWorkGPU::compact_states(const int num_active_paths)
     int terminated_states_offset = num_active_paths;
     void *args[] = {
         &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
-    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+    queue_->enqueue(compact_kernel, work_size, args);
   }
-
-  queue_->synchronize();
-
-  /* Adjust max active path index now we know which part of the array is actually used. */
-  max_active_main_path_index_ = num_active_paths;
 }
 
 bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
@@ -669,7 +709,7 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
 
   /* Compact state array when number of paths becomes small relative to the
    * known maximum path index, which makes computing active index arrays slow. */
-  compact_states(num_active_paths);
+  compact_main_paths(num_active_paths);
 
   if (has_shadow_catcher()) {
     integrator_next_main_path_index_.data()[0] = num_paths;
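For readers following the new compact_paths() flow above, here is a minimal single-threaded sketch of the same idea, assuming a toy PathSlot array instead of the real SoA integrator state (all names in the sketch are illustrative, not Cycles API): terminated slots below the active-path count are collected, active slots above it are collected, and the latter are moved into the former so the used range of the array becomes dense again.

/* Single-threaded sketch of the compaction performed by the three kernels that
 * PathTraceWorkGPU::compact_paths() enqueues. PathSlot, compact_paths_sketch and
 * main are illustrative only. */

#include <cassert>
#include <vector>

struct PathSlot {
  int queued_kernel = 0; /* 0 means the slot is terminated/free. */
  int payload = 0;       /* Stands in for the rest of the path state. */
};

static void compact_paths_sketch(std::vector<PathSlot> &slots, const int num_active_paths)
{
  /* Step 1: terminated slots in [0, num_active_paths), i.e. gaps that can be refilled.
   * This mirrors the terminated_paths_kernel pass. */
  std::vector<int> terminated;
  for (int i = 0; i < num_active_paths; i++) {
    if (slots[i].queued_kernel == 0) {
      terminated.push_back(i);
    }
  }

  /* Step 2: active slots at or above num_active_paths, i.e. paths that must move down.
   * This mirrors the compact_paths_kernel pass. */
  std::vector<int> active;
  for (int i = num_active_paths; i < (int)slots.size(); i++) {
    if (slots[i].queued_kernel != 0) {
      active.push_back(i);
    }
  }

  /* Step 3: move each high active slot into a low gap, mirroring the compact_kernel
   * pass. With exactly num_active_paths active slots in total, the counts match. */
  assert(active.size() == terminated.size());
  for (size_t i = 0; i < active.size(); i++) {
    slots[terminated[i]] = slots[active[i]];
    slots[active[i]].queued_kernel = 0;
  }
}

int main()
{
  /* Fragmented state: 3 active paths spread over 6 slots, compacted into [0, 3). */
  std::vector<PathSlot> slots(6);
  slots[0].queued_kernel = 1;
  slots[3].queued_kernel = 1;
  slots[5].queued_kernel = 1;
  compact_paths_sketch(slots, 3);
  return (slots[0].queued_kernel && slots[1].queued_kernel && slots[2].queued_kernel) ? 0 : 1;
}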
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index e16c491695b..8734d2c2852 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -86,7 +86,13 @@ class PathTraceWorkGPU : public PathTraceWork {
                                    DeviceKernel queued_kernel,
                                    const int num_paths_limit);
 
-  void compact_states(const int num_active_paths);
+  void compact_main_paths(const int num_active_paths);
+  void compact_shadow_paths();
+  void compact_paths(const int num_active_paths,
+                     const int max_active_path_index,
+                     DeviceKernel terminated_paths_kernel,
+                     DeviceKernel compact_paths_kernel,
+                     DeviceKernel compact_kernel);
 
   int num_active_main_paths_paths();
 
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index fcb398f7e6d..eeac09d4b29 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B
       });
 }
 
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
+                                                        int *indices,
+                                                        int *num_indices,
+                                                        int indices_offset)
+{
+  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+      num_states, indices + indices_offset, num_indices, [](const int state) {
+        return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+      });
+}
+
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
     kernel_gpu_integrator_sorted_paths_array(int num_states,
                                              int num_states_limit,
@@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
   }
 }
 
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
+                                                     int *indices,
+                                                     int *num_indices,
+                                                     int num_active_paths)
+{
+  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+      num_states, indices, num_indices, [num_active_paths](const int state) {
+        return (state >= num_active_paths) &&
+               (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
+      });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
+                                                const int active_states_offset,
+                                                const int terminated_states_offset,
+                                                const int work_size)
+{
+  const int global_index = ccl_gpu_global_id_x();
+
+  if (global_index < work_size) {
+    const int from_state = active_terminated_states[active_states_offset + global_index];
+    const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+    integrator_shadow_state_move(NULL, to_state, from_state);
+  }
+}
+
 extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
     kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
 {
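The two new *_shadow_paths_array kernels above both delegate to gpu_parallel_active_index_array with a different predicate. As a rough serial stand-in for what that gather produces (the real implementation is a parallel, block-wide operation; everything below, including active_index_sketch, is illustrative only):

/* Serial stand-in for the index gathering done by gpu_parallel_active_index_array:
 * append every state index that satisfies a predicate to `indices` and return how
 * many were written. Names here are illustrative, not the Cycles implementation. */

#include <functional>
#include <vector>

static int active_index_sketch(const int num_states,
                               std::vector<int> &indices,
                               const std::function<bool(int)> &predicate)
{
  int num_indices = 0;
  for (int state = 0; state < num_states; state++) {
    if (predicate(state)) {
      indices[num_indices++] = state; /* The GPU version uses a block-wide scan instead. */
    }
  }
  return num_indices;
}

int main()
{
  /* Per-state shadow queued kernel: 0 = terminated, nonzero = active. */
  const std::vector<int> shadow_queued_kernel = {4, 0, 0, 4, 0, 5};
  const int num_states = (int)shadow_queued_kernel.size();
  const int num_active_paths = 3;

  /* Predicate of integrator_terminated_shadow_paths_array: free slots. */
  std::vector<int> terminated(num_states);
  const int num_terminated = active_index_sketch(
      num_states, terminated, [&](int state) { return shadow_queued_kernel[state] == 0; });

  /* Predicate of integrator_compact_shadow_paths_array: active slots that sit at or
   * above the target size and therefore need to be moved down. */
  std::vector<int> to_move(num_states);
  const int num_to_move = active_index_sketch(num_states, to_move, [&](int state) {
    return state >= num_active_paths && shadow_queued_kernel[state] != 0;
  });

  return (num_terminated == 3 && num_to_move == 2) ? 0 : 1;
}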
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index 6da41cddcf8..6e6b7f8a40f 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -265,6 +265,62 @@ ccl_device_inline void integrator_state_move(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
 }
 
+ccl_device_inline void integrator_shadow_state_copy_only(KernelGlobals kg,
+                                                         ConstIntegratorShadowState to_state,
+                                                         ConstIntegratorShadowState state)
+{
+  int index;
+
+  /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+#  define KERNEL_STRUCT_BEGIN(name) \
+    index = 0; \
+    do {
+
+#  define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+    if (kernel_integrator_state.parent_struct.name != nullptr) { \
+      kernel_integrator_state.parent_struct.name[to_state] = \
+          kernel_integrator_state.parent_struct.name[state]; \
+    }
+
+#  define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+    if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+      kernel_integrator_state.parent_struct[index].name[to_state] = \
+          kernel_integrator_state.parent_struct[index].name[state]; \
+    }
+
+#  define KERNEL_STRUCT_END(name) \
+    } \
+    while (false) \
+      ;
+
+#  define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
+    ++index; \
+    } \
+    while (index < gpu_array_size) \
+      ;
+
+#  define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
+
+#  include "kernel/integrator/integrator_shadow_state_template.h"
+
+#  undef KERNEL_STRUCT_BEGIN
+#  undef KERNEL_STRUCT_MEMBER
+#  undef KERNEL_STRUCT_ARRAY_MEMBER
+#  undef KERNEL_STRUCT_END
+#  undef KERNEL_STRUCT_END_ARRAY
+#  undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+}
+
+ccl_device_inline void integrator_shadow_state_move(KernelGlobals kg,
+                                                    ConstIntegratorState to_state,
+                                                    ConstIntegratorState state)
+{
+  integrator_shadow_state_copy_only(kg, to_state, state);
+
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
 #endif
 
 /* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 4bdd8185ca6..5cbe2939dfc 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1458,6 +1458,9 @@ typedef enum DeviceKernel {
   DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
   DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+  DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES,
   DEVICE_KERNEL_INTEGRATOR_RESET,
   DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,
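Finally, the macro-driven integrator_shadow_state_copy_only() added in integrator_state_util.h above expands, per member of the shadow state template, into a guarded element copy between two SoA slots. The following hand-written sketch shows the rough shape of that expansion for a made-up SoA struct (ShadowPathSoA and its fields are not real Cycles names):

/* Rough picture of what the KERNEL_STRUCT_* expansion in
 * integrator_shadow_state_copy_only() boils down to: each per-member device
 * array copies one element from `state` to `to_state`, skipping arrays that
 * were never allocated. All names below are made up for the sketch. */

#include <cstddef>

struct ShadowPathSoA {
  /* One array per struct member, indexed by shadow state; may be null when the
   * corresponding feature is disabled. */
  int *queued_kernel = nullptr;
  float *throughput_x = nullptr;
  float *throughput_y = nullptr;
  float *throughput_z = nullptr;
};

static void shadow_state_copy_only_sketch(const ShadowPathSoA &soa, int to_state, int state)
{
  if (soa.queued_kernel != nullptr) {
    soa.queued_kernel[to_state] = soa.queued_kernel[state];
  }
  if (soa.throughput_x != nullptr) {
    soa.throughput_x[to_state] = soa.throughput_x[state];
  }
  if (soa.throughput_y != nullptr) {
    soa.throughput_y[to_state] = soa.throughput_y[state];
  }
  if (soa.throughput_z != nullptr) {
    soa.throughput_z[to_state] = soa.throughput_z[state];
  }
}

static void shadow_state_move_sketch(const ShadowPathSoA &soa, int to_state, int state)
{
  shadow_state_copy_only_sketch(soa, to_state, state);
  if (soa.queued_kernel != nullptr) {
    soa.queued_kernel[state] = 0; /* Mark the source slot as terminated. */
  }
}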