diff options
author | Brecht Van Lommel <brecht@blender.org> | 2022-07-14 17:42:43 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2022-07-15 14:42:47 +0300 |
commit | 523bbf7065547a67e7c23f67f546a5ed6433f809 (patch) | |
tree | a054c6619cc5a1c1f330e65db92e45d4ded515c2 /intern/cycles/integrator | |
parent | da4ef05e4dfb700a61910e6d8e02183d7c272963 (diff) |
Cycles: generalize shader sorting / locality heuristic to all GPU devices
This was added for Metal, but also gives good results with CUDA and OptiX.
Also enable it for future Apple GPUs instead of only M1 and M2, since this has
been shown to help across multiple GPUs, so the better bet seems to be to enable
rather than disable it.
Also moves some of the logic outside of the Metal device code, and always
enables the code in the kernel, since other devices do not do dynamic compilation.
Time per sample with OptiX + RTX A6000:
new old
barbershop_interior 0.0730s 0.0727s
bmw27 0.0047s 0.0053s
classroom 0.0428s 0.0464s
fishy_cat 0.0102s 0.0108s
junkshop 0.0366s 0.0395s
koro 0.0567s 0.0578s
monster 0.0206s 0.0223s
pabellon 0.0158s 0.0174s
sponza 0.0088s 0.0100s
spring 0.1267s 0.1280s
victor 0.0524s 0.0531s
wdas_cloud 0.0817s 0.0816s
Ref D15331, T87836
Diffstat (limited to 'intern/cycles/integrator')
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 53 |
1 files changed, 35 insertions, 18 deletions
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index d51e8a28bb4..fa313f6460a 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -181,28 +181,45 @@ void PathTraceWorkGPU::alloc_integrator_queue() void PathTraceWorkGPU::alloc_integrator_sorting() { + /* Compute sort partitions, to balance between memory locality and coherence. + * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a + * more sophisticated heuristic we simply disable sort partitioning if the shader count is high. + */ + num_sort_partitions_ = 1; + if (device_scene_->data.max_shaders < 300) { + const int num_elements = queue_->num_sort_partition_elements(); + if (num_elements) { + num_sort_partitions_ = max(max_num_paths_ / num_elements, 1); + } + } + + integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_, + num_sort_partitions_); + /* Allocate arrays for shader sorting. 
*/ - num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size()); const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_; if (integrator_shader_sort_counter_.size() < sort_buckets) { integrator_shader_sort_counter_.alloc(sort_buckets); integrator_shader_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = + (int *)integrator_shader_sort_counter_.device_pointer; - integrator_shader_raytrace_sort_counter_.alloc(sort_buckets); - integrator_shader_raytrace_sort_counter_.zero_to_device(); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + integrator_shader_raytrace_sort_counter_.alloc(sort_buckets); + integrator_shader_raytrace_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = + (int *)integrator_shader_raytrace_sort_counter_.device_pointer; + } - integrator_shader_mnee_sort_counter_.alloc(sort_buckets); - integrator_shader_mnee_sort_counter_.zero_to_device(); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) { + integrator_shader_mnee_sort_counter_.alloc(sort_buckets); + integrator_shader_mnee_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] = + (int *)integrator_shader_mnee_sort_counter_.device_pointer; + } integrator_shader_sort_prefix_sum_.alloc(sort_buckets); integrator_shader_sort_prefix_sum_.zero_to_device(); - - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = - (int *)integrator_shader_sort_counter_.device_pointer; - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = - (int *)integrator_shader_raytrace_sort_counter_.device_pointer; - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] = - (int *)integrator_shader_mnee_sort_counter_.device_pointer; } } @@ -238,10 +255,6 @@ 
void PathTraceWorkGPU::init_execution() { queue_->init_execution(); - /* Setup sort partitioning divisor for better cache utilization. */ - integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_, - num_sort_partitions_); - /* Copy to device side struct in constant memory. */ device_->const_copy_to( "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_)); @@ -338,8 +351,12 @@ void PathTraceWorkGPU::enqueue_reset() queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args); queue_->zero_to_device(integrator_queue_counter_); queue_->zero_to_device(integrator_shader_sort_counter_); - queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); - queue_->zero_to_device(integrator_shader_mnee_sort_counter_); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) { + queue_->zero_to_device(integrator_shader_mnee_sort_counter_); + } /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the * counter on the host side because `zero_to_device()` is not doing it. */ |