diff options
Diffstat (limited to 'intern/cycles/device/metal')
-rw-r--r-- | intern/cycles/device/metal/device_impl.mm | 10 | ||||
-rw-r--r-- | intern/cycles/device/metal/kernel.mm | 13 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 51 | ||||
-rw-r--r-- | intern/cycles/device/metal/util.mm | 6 |
5 files changed, 62 insertions, 20 deletions
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 6a16d4bb3b4..6f1042b1e55 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -296,9 +296,11 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat } source = global_defines + source; +# if 0 metal_printf("================\n%s================\n\%s================\n", global_defines.c_str(), baked_constants.c_str()); +# endif /* Generate an MD5 from the source and include any baked constants. This is used when caching * PSOs. */ @@ -339,6 +341,14 @@ bool MetalDevice::compile_and_load(MetalPipelineType pso_type) MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; +# if defined(MAC_OS_VERSION_13_0) + if (@available(macos 13.0, *)) { + if (device_vendor == METAL_GPU_INTEL) { + [options setOptimizationLevel:MTLLibraryOptimizationLevelSize]; + } + } +# endif + options.fastMathEnabled = YES; if (@available(macOS 12.0, *)) { options.languageVersion = MTLLanguageVersion2_4; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 5e0cb6d18f4..55938d1a03a 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -162,6 +162,13 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel, } } + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) { + if ((device->kernel_features & KERNEL_FEATURE_MNEE) == 0) { + /* Skip shade_surface_mnee kernel if the scene doesn't require it. */ + return false; + } + } + if (pso_type != PSO_GENERIC) { /* Only specialize kernels where it can make an impact. */ if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || @@ -317,6 +324,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const } } + /* Workaround for Intel GPU having issue using Binary Archives */ + MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice); + if (gpu_vendor == METAL_GPU_INTEL) { + return false; + } + if (pso_type == PSO_GENERIC) { /* Archive the generic kernels. */ return true; diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index fc32740f3e1..2a6c12e2a60 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -23,7 +23,7 @@ class MetalDeviceQueue : public DeviceQueue { ~MetalDeviceQueue(); virtual int num_concurrent_states(const size_t) const override; - virtual int num_concurrent_busy_states() const override; + virtual int num_concurrent_busy_states(const size_t) const override; virtual int num_sort_partition_elements() const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 5ac63a16c61..c0df2c8553f 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -264,33 +264,46 @@ MetalDeviceQueue::~MetalDeviceQueue() } } -int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const { - /* METAL_WIP */ - /* TODO: compute automatically. */ - /* TODO: must have at least num_threads_per_block. */ - int result = 1048576; - if (metal_device_->device_vendor == METAL_GPU_AMD) { - result *= 2; + static int result = 0; + if (result) { + return result; } - else if (metal_device_->device_vendor == METAL_GPU_APPLE) { + + result = 1048576; + if (metal_device_->device_vendor == METAL_GPU_APPLE) { result *= 4; + + if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) == APPLE_M2) { + size_t system_ram = system_physical_ram(); + size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize]; + size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize]; + + /* Determine whether we can double the state count, and leave enough GPU-available memory + * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to + * keep dispatch sizes high and minimize work submission overheads. */ + size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024)); + size_t total_state_size = result * state_size; + if (max_recommended_working_set - allocated_so_far - total_state_size * 2 >= min_headroom) { + result *= 2; + metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result); + } + } + } + else if (metal_device_->device_vendor == METAL_GPU_AMD) { + /* METAL_WIP */ + /* TODO: compute automatically. */ + /* TODO: must have at least num_threads_per_block. */ + result *= 2; } return result; } -int MetalDeviceQueue::num_concurrent_busy_states() const +int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const { - /* METAL_WIP */ - /* TODO: compute automatically. */ - int result = 65536; - if (metal_device_->device_vendor == METAL_GPU_AMD) { - result *= 2; - } - else if (metal_device_->device_vendor == METAL_GPU_APPLE) { - result *= 4; - } - return result; + /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */ + return num_concurrent_states(state_size) / 4; } int MetalDeviceQueue::num_sort_partition_elements() const diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index 65c67c400fe..f47638fac15 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -110,6 +110,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() usable |= (vendor == METAL_GPU_AMD); } +# if defined(MAC_OS_VERSION_13_0) + if (@available(macos 13.0, *)) { + usable |= (vendor == METAL_GPU_INTEL); + } +# endif + if (usable) { metal_printf("- %s\n", device_name.c_str()); [device retain]; |