 intern/cycles/device/metal/device_impl.mm        |  6
 intern/cycles/device/metal/queue.h               |  1
 intern/cycles/device/metal/queue.mm              | 21
 intern/cycles/device/metal/util.h                |  1
 intern/cycles/device/metal/util.mm               | 18
 intern/cycles/device/queue.h                     |  8
 intern/cycles/integrator/path_trace.cpp          |  2
 intern/cycles/integrator/path_trace_work_gpu.cpp | 21
 intern/cycles/integrator/path_trace_work_gpu.h   |  3
 intern/cycles/kernel/integrator/state.h          |  3
 intern/cycles/kernel/integrator/state_flow.h     | 12
 11 files changed, 82 insertions(+), 14 deletions(-)
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 87c83242240..ba9317e3204 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -217,6 +217,10 @@ string MetalDevice::get_source(const uint kernel_features)
     build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
   }
 
+  if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) {
+    build_options += " -D__KERNEL_SORT_PARTITIONING__ ";
+  }
+
   if (use_metalrt) {
     build_options += "-D__METALRT__ ";
     if (motion_blur) {
@@ -652,7 +656,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
   /* Update data storage pointers in launch parameters. */
   if (strcmp(name, "integrator_state") == 0) {
     /* IntegratorStateGPU is contiguous pointers */
-    const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+    const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
     update_launch_pointers(
         offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
   }
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index b0bd487c86d..836289172f7 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t) const override;
   virtual int num_concurrent_busy_states() const override;
+  virtual int num_sort_partitions(const size_t) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 03e60b6bb6e..6a9cc552098 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,6 +293,23 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
   return result;
 }
 
+int MetalDeviceQueue::num_sort_partitions(const size_t state_size) const
+{
+  /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
+   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
+   */
+  if (metal_device_->launch_params.data.max_shaders >= 300) {
+    return 1;
+  }
+
+  const int optimal_partition_elements = MetalInfo::optimal_sort_partition_elements(
+      metal_device_->mtlDevice);
+  if (optimal_partition_elements) {
+    return num_concurrent_states(state_size) / optimal_partition_elements;
+  }
+  return 1;
+}
+
 void MetalDeviceQueue::init_execution()
 {
   /* Synchronize all textures and memory copies before executing task. */
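Note: a quick worked example of the num_sort_partitions() heuristic above, as a standalone C++ sketch. The shader count and concurrent-state count are assumed for illustration, not queried from a real device.

#include <cstdio>

/* Standalone re-statement of the heuristic: disable partitioning for
 * shader-heavy scenes, otherwise derive the partition count from how many
 * concurrent states fit into one optimally-sized partition. */
static int num_sort_partitions_sketch(int max_shaders,
                                      int num_concurrent_states,
                                      int optimal_partition_elements)
{
  if (max_shaders >= 300) {
    return 1;
  }
  if (optimal_partition_elements > 0) {
    return num_concurrent_states / optimal_partition_elements;
  }
  return 1;
}

int main()
{
  /* E.g. ~1M concurrent states with the 65536-element optimum reported for
   * M1/M2 below yields 16 partitions. */
  printf("%d\n", num_sort_partitions_sketch(100, 1048576, 65536)); /* prints 16 */
  return 0;
}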
@@ -359,7 +376,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
   /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
   size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
-                                        sizeof(IntegratorStateGPU);
+                                        offsetof(IntegratorStateGPU, sort_partition_divisor);
   size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
   memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
          (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
@@ -416,7 +433,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
 
     /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
     const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
-                                     sizeof(IntegratorStateGPU);
+                                     offsetof(IntegratorStateGPU, sort_partition_divisor);
     for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
       int pointer_index = int(offset / sizeof(device_ptr));
       MetalDevice::MetalMem *mmem = *(
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index fd32d8a260f..a988d01d361 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -37,6 +37,7 @@ struct MetalInfo {
   static int get_apple_gpu_core_count(id<MTLDevice> device);
   static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
   static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
+  static int optimal_sort_partition_elements(id<MTLDevice> device);
   static string get_device_name(id<MTLDevice> device);
 };
 
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index a7a5b596b8f..c336dc310c8 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -72,6 +72,24 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
   return METAL_GPU_UNKNOWN;
 }
 
+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+  if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+    return atoi(str);
+  }
+
+  /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+   * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+   * overall render time speedup of up to 15%. */
+  if (get_device_vendor(device) == METAL_GPU_APPLE) {
+    AppleGPUArchitecture arch = get_apple_gpu_architecture(device);
+    if (arch == APPLE_M1 || arch == APPLE_M2) {
+      return 65536;
+    }
+  }
+  return 0;
+}
+
 vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
 {
   static vector<id<MTLDevice>> usable_devices;
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 14a5db3a204..20308e4a106 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -105,6 +105,14 @@ class DeviceQueue {
    * value. */
   virtual int num_concurrent_busy_states() const = 0;
 
+  /* Number of partitions within which active indices are sorted by material ID.
+   * Using more partitions lets us trade off material coherence for better integrator state fetch
+   * locality. */
+  virtual int num_sort_partitions(const size_t /*state_size*/) const
+  {
+    return 1;
+  }
+
   /* Initialize execution of kernels on this queue.
    *
    * Will, for example, load all data required by the kernels from Device to global or path state.
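Note: the sizeof -> offsetof changes above work because sort_partition_divisor is deliberately placed as the first non-pointer member of IntegratorStateGPU, terminating the contiguous block of device pointers that the launch-parameter patching walks. A minimal standalone sketch of that layout trick, using a hypothetical struct rather than the real IntegratorStateGPU:

#include <cstddef>
#include <cstdint>
#include <cstdio>

/* Hypothetical stand-in for IntegratorStateGPU: a run of device pointers
 * terminated by the first non-pointer member. */
struct StateGPU {
  int *path;
  int *shadow_path;
  int *queue_counter;
  uint32_t sort_partition_divisor; /* first non-pointer member */
};

int main()
{
  /* Pointer-patching loops must stop before the divisor, so the pointer block
   * is delimited by offsetof(first non-pointer member), not sizeof(struct). */
  const size_t pointer_block_size = offsetof(StateGPU, sort_partition_divisor);
  printf("pointer block holds %zu pointers\n", pointer_block_size / sizeof(void *));
  return 0; /* prints 3 on a typical 64-bit target */
}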
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 6912bf928cd..ed278821b46 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -373,7 +373,7 @@ void PathTrace::path_trace(RenderWork &render_work)
         work_balance_infos_[i].time_spent += work_time;
         work_balance_infos_[i].occupancy = statistics.occupancy;
-        VLOG_WORK << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+        VLOG_INFO << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
                   << work_time / num_samples
                   << " seconds per sample), occupancy: " << statistics.occupancy;
       });
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index e262c252ce3..d51e8a28bb4 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -182,18 +182,19 @@ void PathTraceWorkGPU::alloc_integrator_queue()
 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
   /* Allocate arrays for shader sorting. */
-  const int max_shaders = device_scene_->data.max_shaders;
-  if (integrator_shader_sort_counter_.size() < max_shaders) {
-    integrator_shader_sort_counter_.alloc(max_shaders);
+  num_sort_partitions_ = queue_->num_sort_partitions(estimate_single_state_size());
+  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+  if (integrator_shader_sort_counter_.size() < sort_buckets) {
+    integrator_shader_sort_counter_.alloc(sort_buckets);
     integrator_shader_sort_counter_.zero_to_device();
 
-    integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+    integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
     integrator_shader_raytrace_sort_counter_.zero_to_device();
 
-    integrator_shader_mnee_sort_counter_.alloc(max_shaders);
+    integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
     integrator_shader_mnee_sort_counter_.zero_to_device();
 
-    integrator_shader_sort_prefix_sum_.alloc(max_shaders);
+    integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
     integrator_shader_sort_prefix_sum_.zero_to_device();
 
     integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
@@ -237,6 +238,10 @@ void PathTraceWorkGPU::init_execution()
 {
   queue_->init_execution();
 
+  /* Setup sort partitioning divisor for better cache utilization. */
+  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
+                                                                num_sort_partitions_);
+
   /* Copy to device side struct in constant memory. */
   device_->const_copy_to(
       "integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
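Note: a standalone sketch of the divisor and bucket arithmetic introduced above, with assumed scene/device values; divide_up mirrors the Cycles utility of the same name.

#include <cstdio>

/* Mirrors Cycles' divide_up(): integer ceil(x / y). */
static int divide_up(int x, int y)
{
  return (x + y - 1) / y;
}

int main()
{
  const int max_num_paths = 1048576;  /* assumed concurrent state count */
  const int num_sort_partitions = 16; /* assumed, from num_sort_partitions() */
  const int max_shaders = 100;        /* assumed scene shader count */

  /* Each partition covers a contiguous range of state indices of this size... */
  const int sort_partition_divisor = divide_up(max_num_paths, num_sort_partitions);

  /* ...and the sort counters get one bucket per (partition, shader) pair. */
  const int sort_buckets = max_shaders * num_sort_partitions;

  printf("divisor=%d buckets=%d\n", sort_partition_divisor, sort_buckets);
  return 0; /* prints divisor=65536 buckets=1600 */
}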
@@ -486,9 +491,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
   /* Compute prefix sum of number of active paths with each shader. */
   {
     const int work_size = 1;
-    int max_shaders = device_scene_->data.max_shaders;
+    int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
 
-    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
+    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);
 
     queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
   }
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 4c10a221a30..a805258d1b5 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -156,6 +156,9 @@ class PathTraceWorkGPU : public PathTraceWork {
   bool interop_use_checked_ = false;
   bool interop_use_ = false;
 
+  /* Number of partitions to sort state indices into prior to material sort. */
+  int num_sort_partitions_;
+
   /* Maximum number of concurrent integrator states. */
   int max_num_paths_;
 
diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h
index d6fef27f344..d10d31e930e 100644
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -127,6 +127,9 @@ typedef struct IntegratorStateGPU {
 
   /* Index of main path which will be used by a next shadow catcher split. */
   ccl_global int *next_main_path_index;
+
+  /* Divisor used to partition active indices by locality when sorting by material. */
+  uint sort_partition_divisor;
 } IntegratorStateGPU;
 
 /* Abstraction
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index fed74d49434..d397ef385e7 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -67,9 +67,17 @@ CCL_NAMESPACE_BEGIN
         &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
     INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
 
+#  ifdef __KERNEL_SORT_PARTITIONING__
+/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
+#    define INTEGRATOR_SORT_KEY(key, state) \
+      (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
+#  else
+#    define INTEGRATOR_SORT_KEY(key, state) (key)
+#  endif
+
 #  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
     { \
-      const int key_ = key; \
+      const int key_ = INTEGRATOR_SORT_KEY(key, state); \
       atomic_fetch_and_add_uint32( \
           &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
       INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
@@ -79,7 +87,7 @@ CCL_NAMESPACE_BEGIN
     }
 #  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
     { \
-      const int key_ = key; \
+      const int key_ = INTEGRATOR_SORT_KEY(key, state); \
       atomic_fetch_and_sub_uint32( \
           &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
       atomic_fetch_and_add_uint32( \
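Note: a worked example of the INTEGRATOR_SORT_KEY mapping above, as a standalone sketch with assumed values, showing how states first group by partition and then order by shader key within each partition.

#include <cstdio>

/* Mirrors INTEGRATOR_SORT_KEY: bucket = key + max_shaders * (state / divisor). */
static unsigned sort_key(unsigned key, unsigned state)
{
  const unsigned max_shaders = 100;              /* assumed scene shader count */
  const unsigned sort_partition_divisor = 65536; /* assumed partition width */
  return key + max_shaders * (state / sort_partition_divisor);
}

int main()
{
  /* States 10 and 20 share partition 0, so they order purely by shader key;
   * state 70000 falls in partition 1 and sorts after both, keeping integrator
   * state fetches within a partition local. */
  printf("%u %u %u\n",
         sort_key(7, 10),     /* 7 */
         sort_key(3, 20),     /* 3 */
         sort_key(3, 70000)); /* 103 */
  return 0;
}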