diff options
author | Michael Jones <michael_p_jones@apple.com> | 2022-07-13 22:56:57 +0300 |
---|---|---|
committer | Michael Jones <michael_p_jones@apple.com> | 2022-07-14 16:26:18 +0300 |
commit | 4b1d315017ef103f3034160d349b3c3c21a4cd6a (patch) | |
tree | 779dd8c27d37e710d3014911e962027b56049084 /intern/cycles/integrator/path_trace_work_gpu.cpp | |
parent | 47d4ce498e3f5a11a0210b1efd57053f0b1c85bd (diff) |
Cycles: Improve cache usage on Apple GPUs by chunking active indices
This patch partitions the active indices into chunks prior to sorting by material in order to trade off some material coherence for better locality. On Apple Silicon GPUs (particularly higher end M1-family GPUs), we observe overall render time speedups of up to 15%. The partitioning is implemented by repeating the range of `shader_sort_key` for each partition, and encoding a "locator" key which distributes the indices into sorted chunks.
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D15331
Diffstat (limited to 'intern/cycles/integrator/path_trace_work_gpu.cpp')
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 21 |
1 files changed, 13 insertions, 8 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index e262c252ce3..d51e8a28bb4 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -182,18 +182,19 @@ void PathTraceWorkGPU::alloc_integrator_queue() void PathTraceWorkGPU::alloc_integrator_sorting() { /* Allocate arrays for shader sorting. */ - const int max_shaders = device_scene_->data.max_shaders; - if (integrator_shader_sort_counter_.size() < max_shaders) { - integrator_shader_sort_counter_.alloc(max_shaders); + num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size()); + const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_; + if (integrator_shader_sort_counter_.size() < sort_buckets) { + integrator_shader_sort_counter_.alloc(sort_buckets); integrator_shader_sort_counter_.zero_to_device(); - integrator_shader_raytrace_sort_counter_.alloc(max_shaders); + integrator_shader_raytrace_sort_counter_.alloc(sort_buckets); integrator_shader_raytrace_sort_counter_.zero_to_device(); - integrator_shader_mnee_sort_counter_.alloc(max_shaders); + integrator_shader_mnee_sort_counter_.alloc(sort_buckets); integrator_shader_mnee_sort_counter_.zero_to_device(); - integrator_shader_sort_prefix_sum_.alloc(max_shaders); + integrator_shader_sort_prefix_sum_.alloc(sort_buckets); integrator_shader_sort_prefix_sum_.zero_to_device(); integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = @@ -237,6 +238,10 @@ void PathTraceWorkGPU::init_execution() { queue_->init_execution(); + /* Setup sort partitioning divisor for better cache utilization. */ + integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_, + num_sort_partitions_); + /* Copy to device side struct in constant memory. 
*/ device_->const_copy_to( "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_)); @@ -486,9 +491,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, /* Compute prefix sum of number of active paths with each shader. */ { const int work_size = 1; - int max_shaders = device_scene_->data.max_shaders; + int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_; - DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders); + DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets); queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args); } |