diff options
-rw-r--r-- | intern/cycles/device/metal/device_impl.mm | 4 |
-rw-r--r-- | intern/cycles/device/metal/queue.h | 2 |
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 16 |
-rw-r--r-- | intern/cycles/device/metal/util.mm | 5 |
-rw-r--r-- | intern/cycles/device/queue.h | 9 |
-rw-r--r-- | intern/cycles/integrator/path_trace_work_gpu.cpp | 53 |
-rw-r--r-- | intern/cycles/kernel/integrator/state_flow.h | 8 |
7 files changed, 45 insertions, 52 deletions
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index d8bb3b867cd..d1250b83d22 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -229,10 +229,6 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n"; } - if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) { - build_options += " -D__KERNEL_SORT_PARTITIONING__ "; - } - if (use_metalrt) { global_defines += "#define __METALRT__\n"; if (motion_blur) { diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index 836289172f7..fc32740f3e1 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -24,7 +24,7 @@ class MetalDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t) const override; virtual int num_concurrent_busy_states() const override; - virtual int num_sort_partitions(const size_t) const override; + virtual int num_sort_partition_elements() const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 6a9cc552098..5ac63a16c61 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -293,21 +293,9 @@ int MetalDeviceQueue::num_concurrent_busy_states() const return result; } -int MetalDeviceQueue::num_sort_partitions(const size_t state_size) const +int MetalDeviceQueue::num_sort_partition_elements() const { - /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a - * more sophisticated heuristic we simply disable sort partitioning if the shader count is high. 
- */ - if (metal_device_->launch_params.data.max_shaders >= 300) { - return 1; - } - - const int optimal_partition_elements = MetalInfo::optimal_sort_partition_elements( - metal_device_->mtlDevice); - if (optimal_partition_elements) { - return num_concurrent_states(state_size) / optimal_partition_elements; - } - return 1; + return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice); } void MetalDeviceQueue::init_execution() diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index c336dc310c8..65c67c400fe 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -82,10 +82,7 @@ int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device) * sorting each partition by material. Partitioning into chunks of 65536 elements results in an * overall render time speedup of up to 15%. */ if (get_device_vendor(device) == METAL_GPU_APPLE) { - AppleGPUArchitecture arch = get_apple_gpu_architecture(device); - if (arch == APPLE_M1 || arch == APPLE_M2) { - return 65536; - } + return 65536; } return 0; } diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h index 20308e4a106..808431af401 100644 --- a/intern/cycles/device/queue.h +++ b/intern/cycles/device/queue.h @@ -105,12 +105,11 @@ class DeviceQueue { * value. */ virtual int num_concurrent_busy_states() const = 0; - /* Number of partitions within which active indices are sorted by material ID. - * Using more partitions lets us trade off material coherence for better integrator state fetch - * locality. */ - virtual int num_sort_partitions(const size_t /*state_size*/) const + /* Number of elements in a partition of sorted shaders, that improves memory locality of + * integrator state fetch at the cost of decreased coherence for shader kernel execution. */ + virtual int num_sort_partition_elements() const { - return 1; + return 65536; } /* Initialize execution of kernels on this queue. 
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index d51e8a28bb4..fa313f6460a 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ -181,28 +181,45 @@ void PathTraceWorkGPU::alloc_integrator_queue() void PathTraceWorkGPU::alloc_integrator_sorting() { + /* Compute sort partitions, to balance between memory locality and coherence. + * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a + * more sophisticated heuristic we simply disable sort partitioning if the shader count is high. + */ + num_sort_partitions_ = 1; + if (device_scene_->data.max_shaders < 300) { + const int num_elements = queue_->num_sort_partition_elements(); + if (num_elements) { + num_sort_partitions_ = max(max_num_paths_ / num_elements, 1); + } + } + + integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_, + num_sort_partitions_); + /* Allocate arrays for shader sorting. 
*/ - num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size()); const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_; if (integrator_shader_sort_counter_.size() < sort_buckets) { integrator_shader_sort_counter_.alloc(sort_buckets); integrator_shader_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = + (int *)integrator_shader_sort_counter_.device_pointer; - integrator_shader_raytrace_sort_counter_.alloc(sort_buckets); - integrator_shader_raytrace_sort_counter_.zero_to_device(); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + integrator_shader_raytrace_sort_counter_.alloc(sort_buckets); + integrator_shader_raytrace_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = + (int *)integrator_shader_raytrace_sort_counter_.device_pointer; + } - integrator_shader_mnee_sort_counter_.alloc(sort_buckets); - integrator_shader_mnee_sort_counter_.zero_to_device(); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) { + integrator_shader_mnee_sort_counter_.alloc(sort_buckets); + integrator_shader_mnee_sort_counter_.zero_to_device(); + integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] = + (int *)integrator_shader_mnee_sort_counter_.device_pointer; + } integrator_shader_sort_prefix_sum_.alloc(sort_buckets); integrator_shader_sort_prefix_sum_.zero_to_device(); - - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = - (int *)integrator_shader_sort_counter_.device_pointer; - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] = - (int *)integrator_shader_raytrace_sort_counter_.device_pointer; - integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] = - (int *)integrator_shader_mnee_sort_counter_.device_pointer; } } @@ -238,10 +255,6 @@ 
void PathTraceWorkGPU::init_execution() { queue_->init_execution(); - /* Setup sort partitioning divisor for better cache utilization. */ - integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_, - num_sort_partitions_); - /* Copy to device side struct in constant memory. */ device_->const_copy_to( "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_)); @@ -338,8 +351,12 @@ void PathTraceWorkGPU::enqueue_reset() queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args); queue_->zero_to_device(integrator_queue_counter_); queue_->zero_to_device(integrator_shader_sort_counter_); - queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); - queue_->zero_to_device(integrator_shader_mnee_sort_counter_); + if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) { + queue_->zero_to_device(integrator_shader_raytrace_sort_counter_); + } + if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) { + queue_->zero_to_device(integrator_shader_mnee_sort_counter_); + } /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the * counter on the host side because `zero_to_device()` is not doing it. */ diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h index 1ae746022d0..4b03c665e17 100644 --- a/intern/cycles/kernel/integrator/state_flow.h +++ b/intern/cycles/kernel/integrator/state_flow.h @@ -99,13 +99,9 @@ ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg, INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; } -# ifdef __KERNEL_SORT_PARTITIONING__ /* Sort first by truncated state index (for good locality), then by key (for good coherence). 
*/ -# define INTEGRATOR_SORT_KEY(key, state) \ - (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor)) -# else -# define INTEGRATOR_SORT_KEY(key, state) (key) -# endif +# define INTEGRATOR_SORT_KEY(key, state) \ + (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor)) ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg, IntegratorState state, |