From 8dd7b5b26b394207b5941d49750f7e3abadaf82a Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Mon, 24 Oct 2022 10:23:56 +0100
Subject: Cycles: Metal integrator state size tuning

This patch tunes the integrator state sizing for Metal
(`num_concurrent_states` and `num_concurrent_busy_states`).

On all GPU architectures, we adjust the busy:total states ratio to 1:4,
which gives better rendering performance than the previous 1:16 ratio
(independent of total state count). This gives a small performance uplift
(e.g. 2-3% on M1 Ultra).

Additionally, for M2 architectures we double the overall number of states
if there is available memory headroom. Inclusive of the first change, we
can expect an uplift of close to 10% in future, as this results in larger
dispatch sizes and minimises work submission overheads.

In order to make an accurate determination of available headroom, we defer
the calculation of `num_concurrent_states` and `num_concurrent_busy_states`
until the time of integrator state allocation (i.e. after all of the scene
data has been allocated). We also refactor `alloc_integrator_soa` to
calculate an *exact* single-state size in a first pass, right before
allocating the integrator SoA buffers in a second pass.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D16313
---
 intern/cycles/device/cuda/queue.cpp              |  2 +-
 intern/cycles/device/cuda/queue.h                |  2 +-
 intern/cycles/device/hip/queue.cpp               |  2 +-
 intern/cycles/device/hip/queue.h                 |  2 +-
 intern/cycles/device/metal/device_impl.mm        |  2 +
 intern/cycles/device/metal/kernel.mm             |  7 ++++
 intern/cycles/device/metal/queue.h               |  2 +-
 intern/cycles/device/metal/queue.mm              | 51 +++++++++++++++---------
 intern/cycles/device/oneapi/queue.cpp            |  2 +-
 intern/cycles/device/oneapi/queue.h              |  2 +-
 intern/cycles/device/queue.h                     |  2 +-
 intern/cycles/integrator/path_trace_work_gpu.cpp | 31 +++++++++-----
 12 files changed, 70 insertions(+), 37 deletions(-)
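For illustration only (this is not code added by the patch), the sizing policy
described above can be condensed into a small standalone sketch. The
`SizingInputs` struct, the `sketch_*` function names, and the sample numbers are
invented stand-ins for the real Metal device queries shown in the queue.mm diff
below; the AMD path and the result caching from the real code are omitted.

/* Illustrative sketch of the sizing policy -- not part of the patch. */
#include <algorithm>
#include <cstddef>
#include <cstdio>

/* Hypothetical stand-ins for the Metal device queries used in queue.mm. */
struct SizingInputs {
  size_t state_size;                  /* exact size of one integrator state, in bytes */
  size_t system_ram;                  /* system_physical_ram() */
  size_t allocated_so_far;            /* [mtlDevice currentAllocatedSize] */
  size_t max_recommended_working_set; /* [mtlDevice recommendedMaxWorkingSetSize] */
  bool is_apple_m2;
};

static int sketch_num_concurrent_states(const SizingInputs &in)
{
  int result = 1048576 * 4; /* Apple GPU baseline used by the patch. */
  if (in.is_apple_m2) {
    /* Leave at least 1/8 of system RAM or 1 GiB free, whichever is larger. */
    const size_t min_headroom = std::max(in.system_ram / 8, size_t(1) << 30);
    const size_t total_state_size = size_t(result) * in.state_size;
    if (in.allocated_so_far + total_state_size * 2 + min_headroom <=
        in.max_recommended_working_set) {
      result *= 2; /* Double the state count when there is enough headroom. */
    }
  }
  return result;
}

static int sketch_num_concurrent_busy_states(const SizingInputs &in)
{
  /* The new 1:4 busy:total ratio, independent of the total state count. */
  return sketch_num_concurrent_states(in) / 4;
}

int main()
{
  const SizingInputs in = {448, size_t(64) << 30, size_t(8) << 30, size_t(48) << 30, true};
  printf("states = %d, busy threshold = %d\n",
         sketch_num_concurrent_states(in),
         sketch_num_concurrent_busy_states(in));
  return 0;
}

With these sample numbers the headroom check passes, so the sketch reports
8,388,608 states and a busy threshold of 2,097,152.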
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 84b0a1e0dd6..69fae03e32c 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -49,7 +49,7 @@ int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int CUDADeviceQueue::num_concurrent_busy_states() const
+int CUDADeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = cuda_device_->get_num_multiprocessors() *
                               cuda_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index b450f5b3592..7107afe70c9 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -23,7 +23,7 @@ class CUDADeviceQueue : public DeviceQueue {
   ~CUDADeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/hip/queue.cpp b/intern/cycles/device/hip/queue.cpp
index 3f8b6267100..e93a9b4df3a 100644
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -49,7 +49,7 @@ int HIPDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int HIPDeviceQueue::num_concurrent_busy_states() const
+int HIPDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = hip_device_->get_num_multiprocessors() *
                               hip_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h
index 729d8a19acb..df0678108af 100644
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -23,7 +23,7 @@ class HIPDeviceQueue : public DeviceQueue {
   ~HIPDeviceQueue();
 
   virtual int num_concurrent_states(const size_t state_size) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 4b929b6bc0a..6f1042b1e55 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
   }
 
   source = global_defines + source;
+#  if 0
   metal_printf("================\n%s================\n\%s================\n",
                global_defines.c_str(),
                baked_constants.c_str());
+#  endif
 
   /* Generate an MD5 from the source and include any baked constants. This is used when caching
    * PSOs. */
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 8ccc50e57a3..55938d1a03a 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
     }
   }
 
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+    if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) {
+      /* Skip shade_surface_mnee kernel if the scene doesn't require it. */
+      return false;
+    }
+  }
+
   if (pso_type != PSO_GENERIC) {
     /* Only specialize kernels where it can make an impact. */
     if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index fc32740f3e1..2a6c12e2a60 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue {
   ~MetalDeviceQueue();
 
   virtual int num_concurrent_states(const size_t) const override;
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t) const override;
   virtual int num_sort_partition_elements() const override;
 
   virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 5ac63a16c61..c0df2c8553f 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue()
   }
 }
 
-int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  /* TODO: must have at least num_threads_per_block. */
-  int result = 1048576;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
+  static int result = 0;
+  if (result) {
+    return result;
   }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
+
+  result = 1048576;
+  if (metal_device_->device_vendor == METAL_GPU_APPLE) {
     result *= 4;
+
+    if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) {
+      size_t system_ram = system_physical_ram();
+      size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
+      size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];
+
+      /* Determine whether we can double the state count, and leave enough GPU-available memory
+       * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
+       * keep dispatch sizes high and minimize work submission overheads. */
+      size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
+      size_t total_state_size = result * state_size;
+      if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) {
+        result *= 2;
+        metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
+      }
+    }
+  }
+  else if (metal_device_->device_vendor == METAL_GPU_AMD) {
+    /* METAL_WIP */
+    /* TODO: compute automatically. */
+    /* TODO: must have at least num_threads_per_block. */
+    result *= 2;
   }
   return result;
 }
 
-int MetalDeviceQueue::num_concurrent_busy_states() const
+int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
 {
-  /* METAL_WIP */
-  /* TODO: compute automatically. */
-  int result = 65536;
-  if (metal_device_->device_vendor == METAL_GPU_AMD) {
-    result *= 2;
-  }
-  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
-    result *= 4;
-  }
-  return result;
+  /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
+  return num_concurrent_states(state_size) / 4;
 }
 
 int MetalDeviceQueue::num_sort_partition_elements() const
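To make the headroom check in the queue.mm hunk above concrete, here is a small
worked example. Every figure below is an assumption chosen for illustration, not
a measurement from the patch.

/* Worked example of the headroom check in MetalDeviceQueue::num_concurrent_states().
 * All figures are assumed values, not measurements. */
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
  const size_t GiB = size_t(1) << 30;

  const size_t system_ram = 32 * GiB;                /* assumed unified memory size */
  const size_t allocated_so_far = 6 * GiB;           /* scene data already resident */
  const size_t max_recommended_working_set = 24 * GiB;
  const size_t state_size = 384;                     /* assumed bytes per integrator state */
  const size_t num_states = 1048576 * 4;             /* Apple GPU baseline */

  /* 1/8 of system RAM (4 GiB here) wins over the 1 GiB floor. */
  const size_t min_headroom = std::max(system_ram / 8, GiB);
  const size_t total_state_size = num_states * state_size; /* 1.5 GiB */

  /* Doubling the states needs 2 * 1.5 GiB; 6 + 3 + 4 GiB fits well inside 24 GiB. */
  const bool can_double = allocated_so_far + total_state_size * 2 + min_headroom <=
                          max_recommended_working_set;
  printf("can double state count: %s\n", can_double ? "yes" : "no");
  return 0;
}

The patch itself writes the comparison as a subtraction
(`max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom`);
the addition form above is equivalent when the working set is not already
exceeded and is simply easier to read in isolation.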
diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp
index 9632b14d485..3d019661aa8 100644
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -43,7 +43,7 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
   return num_states;
 }
 
-int OneapiDeviceQueue::num_concurrent_busy_states() const
+int OneapiDeviceQueue::num_concurrent_busy_states(const size_t /*state_size*/) const
 {
   const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
                               oneapi_device_->get_max_num_threads_per_multiprocessor();
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 32363bf2a6e..bbd947b49cb 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -25,7 +25,7 @@ class OneapiDeviceQueue : public DeviceQueue {
 
   virtual int num_concurrent_states(const size_t state_size) const override;
 
-  virtual int num_concurrent_busy_states() const override;
+  virtual int num_concurrent_busy_states(const size_t state_size) const override;
 
   virtual void init_execution() override;
 
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 1d6a8d736b7..e27e081a407 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -103,7 +103,7 @@ class DeviceQueue {
   /* Number of states which keeps the device occupied with work without losing performance.
    * The renderer will add more work (when available) when number of active paths falls below this
    * value. */
-  virtual int num_concurrent_busy_states() const = 0;
+  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
 
   /* Number of elements in a partition of sorted shaders, that improves memory locality of
    * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
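The comment on `num_concurrent_busy_states` in the queue.h hunk above describes
how the value is consumed. A minimal standalone sketch of that consumer follows;
`DeviceQueueLike`, `FixedRatioQueue`, and `should_add_work` are invented names
for this example, and the real scheduling logic lives in `PathTraceWorkGPU`.

/* Minimal sketch of the busy-state threshold in use -- illustrative only. */
#include <cstddef>
#include <cstdio>

class DeviceQueueLike {
 public:
  virtual ~DeviceQueueLike() = default;
  virtual int num_concurrent_states(const size_t state_size) const = 0;
  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
};

class FixedRatioQueue : public DeviceQueueLike {
 public:
  int num_concurrent_states(const size_t /*state_size*/) const override
  {
    return 1048576 * 4;
  }
  int num_concurrent_busy_states(const size_t state_size) const override
  {
    /* The 1:4 busy:total ratio introduced by this patch. */
    return num_concurrent_states(state_size) / 4;
  }
};

/* The renderer enqueues more work when the number of active paths drops below the threshold. */
static bool should_add_work(const DeviceQueueLike &queue,
                            const size_t state_size,
                            const int num_active_paths)
{
  return num_active_paths < queue.num_concurrent_busy_states(state_size);
}

int main()
{
  FixedRatioQueue queue;
  printf("add work at 500000 active paths? %s\n",
         should_add_work(queue, 384, 500000) ? "yes" : "no");
  return 0;
}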
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index ee250a6916b..48f6cf3c903 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -18,13 +18,15 @@
 
 CCL_NAMESPACE_BEGIN
 
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(const uint kernel_features)
 {
   size_t state_size = 0;
 
 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
-#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
 #define KERNEL_STRUCT_END(name) \
   break; \
   }
@@ -76,16 +78,11 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
       display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
-      max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
-      min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
+      max_num_paths_(0),
+      min_num_active_main_paths_(0),
       max_active_main_path_index_(0)
 {
   memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
-
-  /* Limit number of active paths to the half of the overall state. This is due to the logic in the
-   * path compaction which relies on the fact that regeneration does not happen sooner than half of
-   * the states are available again. */
-  min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
 }
 
 void PathTraceWorkGPU::alloc_integrator_soa()
@@ -103,6 +100,20 @@ void PathTraceWorkGPU::alloc_integrator_soa()
   integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
                                                 requested_volume_stack_size);
 
+  /* Determine the number of path states. Deferring this for as long as possible allows the backend
+   * to make better decisions about memory availability. */
+  if (max_num_paths_ == 0) {
+    size_t single_state_size = estimate_single_state_size(kernel_features);
+
+    max_num_paths_ = queue_->num_concurrent_states(single_state_size);
+    min_num_active_main_paths_ = queue_->num_concurrent_busy_states(single_state_size);
+
+    /* Limit number of active paths to the half of the overall state. This is due to the logic in
+     * the path compaction which relies on the fact that regeneration does not happen sooner than
+     * half of the states are available again. */
+    min_num_active_main_paths_ = min(min_num_active_main_paths_, max_num_paths_ / 2);
+  }
+
   /* Allocate a device only memory buffer before for each struct member, and then
    * write the pointers into a struct that resides in constant memory.
    *
-- 
cgit v1.2.3
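To illustrate the feature-gated sizing pass that `estimate_single_state_size`
performs in the last diff, here is a reduced standalone version of the same
X-macro pattern. The member list, feature flags, and names below are invented
for the example; only the shape of the macros mirrors the patch.

/* Reduced illustration of the feature-gated state-size pass -- not the real SoA layout. */
#include <cstddef>
#include <cstdint>
#include <cstdio>

enum : uint32_t {
  FEATURE_PATH = 1u << 0,
  FEATURE_VOLUME = 1u << 1,
  FEATURE_SHADOW = 1u << 2,
};

/* One row per SoA member: (type, feature flag that enables it). */
#define STATE_MEMBERS(X) \
  X(float, FEATURE_PATH) \
  X(uint32_t, FEATURE_PATH) \
  X(float, FEATURE_VOLUME) \
  X(uint64_t, FEATURE_SHADOW)

static size_t estimate_single_state_size(const uint32_t kernel_features)
{
  size_t state_size = 0;
/* Count a member only when the scene's kernel features enable it. */
#define X(type, feature) state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
  STATE_MEMBERS(X)
#undef X
  return state_size;
}

int main()
{
  /* Only path + shadow features requested: 4 + 4 + 8 = 16 bytes per state. */
  printf("state size = %zu\n", estimate_single_state_size(FEATURE_PATH | FEATURE_SHADOW));
  return 0;
}

With only FEATURE_PATH and FEATURE_SHADOW set, the volume member is excluded and
the exact per-state size is 16 bytes, which is the figure the backend would then
feed into `num_concurrent_states` and `num_concurrent_busy_states`.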