diff options
Diffstat (limited to 'intern/cycles/device/metal')
-rw-r--r-- | intern/cycles/device/metal/device.mm | 14 | ||||
-rw-r--r-- | intern/cycles/device/metal/device_impl.h | 12 | ||||
-rw-r--r-- | intern/cycles/device/metal/device_impl.mm | 231 | ||||
-rw-r--r-- | intern/cycles/device/metal/kernel.h | 30 | ||||
-rw-r--r-- | intern/cycles/device/metal/kernel.mm | 221 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.h | 1 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 21 | ||||
-rw-r--r-- | intern/cycles/device/metal/util.h | 12 | ||||
-rw-r--r-- | intern/cycles/device/metal/util.mm | 74 |
9 files changed, 451 insertions, 165 deletions
diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm index d7f190fc01e..51e3323370a 100644 --- a/intern/cycles/device/metal/device.mm +++ b/intern/cycles/device/metal/device.mm @@ -34,7 +34,8 @@ void device_metal_info(vector<DeviceInfo> &devices) int device_index = 0; for (id<MTLDevice> &device : usable_devices) { /* Compute unique ID for persistent user preferences. */ - string device_name = [device.name UTF8String]; + string device_name = MetalInfo::get_device_name(device); + string id = string("METAL_") + device_name; /* Hardware ID might not be unique, add device number in that case. */ @@ -48,12 +49,6 @@ void device_metal_info(vector<DeviceInfo> &devices) info.type = DEVICE_METAL; info.description = string_remove_trademark(string(device_name)); - /* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and - * GPU */ - if (info.description == system_cpu_brand_string()) { - info.description += " (GPU)"; - } - info.num = device_index; /* We don't know if it's used for display, but assume it is. */ info.display_device = true; @@ -69,14 +64,15 @@ string device_metal_capabilities() { string result = ""; auto allDevices = MTLCopyAllDevices(); - uint32_t num_devices = allDevices.count; + uint32_t num_devices = (uint32_t)allDevices.count; if (num_devices == 0) { return "No Metal devices found\n"; } result += string_printf("Number of devices: %u\n", num_devices); for (id<MTLDevice> device in allDevices) { - result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]); + string device_name = MetalInfo::get_device_name(device); + result += string_printf("\t\tDevice: %s\n", device_name.c_str()); } return result; diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index 0e6817d94f8..99e60d3a788 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -42,7 +42,6 @@ class MetalDevice : public Device { nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */ /*---------------------------------------------------*/ - string device_name; MetalGPUVendor device_vendor; uint kernel_features; @@ -76,7 +75,8 @@ class MetalDevice : public Device { std::vector<id<MTLTexture>> texture_slot_map; bool use_metalrt = false; - bool use_function_specialisation = false; + MetalPipelineType kernel_specialization_level = PSO_GENERIC; + std::atomic_bool async_compile_and_load = false; virtual BVHLayoutMask get_bvh_layout_mask() const override; @@ -92,9 +92,7 @@ class MetalDevice : public Device { bool use_adaptive_compilation(); - string get_source(const uint kernel_features); - - string compile_kernel(const uint kernel_features, const char *name); + void make_source(MetalPipelineType pso_type, const uint kernel_features); virtual bool load_kernels(const uint kernel_features) override; @@ -112,7 +110,9 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - id<MTLLibrary> compile(string const &source); + virtual void optimize_for_scene(Scene *scene) override; + + bool compile_and_load(MetalPipelineType pso_type); /* ------------------------------------------------------------------ */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 086bf0af979..d1250b83d22 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -6,9 +6,12 @@ # include "device/metal/device_impl.h" # include "device/metal/device.h" +# include "scene/scene.h" + # include "util/debug.h" # include "util/md5.h" # include "util/path.h" +# include "util/time.h" CCL_NAMESPACE_BEGIN @@ -35,7 +38,7 @@ void MetalDevice::set_error(const string &error) } MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) + : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) { mtlDevId = info.num; @@ -43,10 +46,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile auto usable_devices = MetalInfo::get_usable_devices(); assert(mtlDevId < usable_devices.size()); mtlDevice = usable_devices[mtlDevId]; - device_name = [mtlDevice.name UTF8String]; - device_vendor = MetalInfo::get_vendor_from_device_name(device_name); + device_vendor = MetalInfo::get_device_vendor(mtlDevice); assert(device_vendor != METAL_GPU_UNKNOWN); - metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str()); + metal_printf("Creating new Cycles device for Metal: %s\n", info.description.c_str()); /* determine default storage mode based on whether UMA is supported */ @@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile case METAL_GPU_APPLE: { max_threads_per_threadgroup = 512; use_metalrt = info.use_metalrt; + + /* Specialize the intersection kernels on Apple GPUs by default as these can be built very + * quickly. */ + kernel_specialization_level = PSO_SPECIALIZED_INTERSECT; break; } } @@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile capture_enabled = true; } + if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) { + kernel_specialization_level = (MetalPipelineType)atoi(envstr); + } + metal_printf("kernel_specialization_level = %s\n", + kernel_type_as_string( + (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1))); + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation() return DebugFlags().metal.adaptive_compile; } -string MetalDevice::get_source(const uint kernel_features) +void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features) { - string build_options; - + string global_defines; if (use_adaptive_compilation()) { - build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n"; } if (use_metalrt) { - build_options += "-D__METALRT__ "; + global_defines += "#define __METALRT__\n"; if (motion_blur) { - build_options += "-D__METALRT_MOTION__ "; + global_defines += "#define __METALRT_MOTION__\n"; } } # ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; + global_defines += "#define __KERNEL_DEBUG__\n"; # endif switch (device_vendor) { default: break; case METAL_GPU_INTEL: - build_options += "-D__KERNEL_METAL_INTEL__ "; + global_defines += "#define __KERNEL_METAL_INTEL__\n"; break; case METAL_GPU_AMD: - build_options += "-D__KERNEL_METAL_AMD__ "; + global_defines += "#define __KERNEL_METAL_AMD__\n"; break; case METAL_GPU_APPLE: - build_options += "-D__KERNEL_METAL_APPLE__ "; + global_defines += "#define __KERNEL_METAL_APPLE__\n"; break; } - /* reformat -D defines list into compilable form */ - vector<string> components; - string_replace(build_options, "-D", ""); - string_split(components, build_options, " "); + string &source = this->source[pso_type]; + source = "\n#include \"kernel/device/metal/kernel.metal\"\n"; + source = path_source_replace_includes(source, path_get("source")); - string globalDefines; - for (const string &component : components) { - vector<string> assignments; - string_split(assignments, component, "="); - if (assignments.size() == 2) - globalDefines += string_printf( - "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str()); - else - globalDefines += string_printf("#define %s\n", assignments[0].c_str()); + /* Perform any required specialization on the source. + * With Metal function constants we can generate a single variant of the kernel source which can + * be repeatedly respecialized. + */ + string baked_constants; + + /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of + * the same character length. Build a string of all active constant values which is then hashed + * in order to identify the PSO. + */ + if (pso_type != PSO_GENERIC) { + const double starttime = time_dt(); + +# define KERNEL_STRUCT_BEGIN(name, parent) \ + string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_"); + + /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */ +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + baked_constants += string(#parent "." #name "=") + \ + to_string(_type(launch_params.data.parent.name)) + "\n"; + +# include "kernel/data_template.h" + + /* Opt in to all of available specializations. This can be made more granular for the + * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests, + * but the overhead should be negligible as these are very quick to (re)build and aren't + * serialized to disk via MTLBinaryArchives. + */ + global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n"; + + metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0); } - string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n"; - source = path_source_replace_includes(source, path_get("source")); - - metal_printf("Global defines:\n%s\n", globalDefines.c_str()); + source = global_defines + source; + metal_printf("================\n%s================\n\%s================\n", + global_defines.c_str(), + baked_constants.c_str()); - return source; + /* Generate an MD5 from the source and include any baked constants. This is used when caching + * PSOs. */ + MD5Hash md5; + md5.append(baked_constants); + md5.append(source); + source_md5[pso_type] = md5.get_hex(); } bool MetalDevice::load_kernels(const uint _kernel_features) @@ -279,24 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. */ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - source[PSO_GENERIC] = get_source(kernel_features); - mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - - MD5Hash md5; - md5.append(source[PSO_GENERIC]); - source_md5[PSO_GENERIC] = md5.get_hex(); - - metal_printf("Front-end compilation finished (generic)\n"); - - bool result = MetalDeviceKernels::load(this, false); + bool result = compile_and_load(PSO_GENERIC); reserve_local_memory(kernel_features); - return result; } -id<MTLLibrary> MetalDevice::compile(string const &source) +bool MetalDevice::compile_and_load(MetalPipelineType pso_type) { + make_source(pso_type, kernel_features); + + if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) { + /* We already have a full set of matching pipelines which are cached or queued. */ + metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type)); + return true; + } + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; @@ -304,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source) options.languageVersion = MTLLanguageVersion2_4; } + if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) { + path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))), + source[pso_type]); + } + + const double starttime = time_dt(); + NSError *error = NULL; - id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str()) - options:options - error:&error]; + mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str()) + options:options + error:&error]; - if (!mtlLibrary) { + if (!mtlLibrary[pso_type]) { NSString *err = [error localizedDescription]; set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); } + metal_printf("Front-end compilation finished in %.1f seconds (%s)\n", + time_dt() - starttime, + kernel_type_as_string(pso_type)); + [options release]; - return mtlLibrary; + return MetalDeviceKernels::load(this, pso_type); } void MetalDevice::reserve_local_memory(const uint kernel_features) @@ -411,9 +458,9 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem) } if (mem.name) { - VLOG(2) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; } mem.device_size = metal_buffer.allocatedSize; @@ -623,11 +670,63 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz return 0; } +void MetalDevice::optimize_for_scene(Scene *scene) +{ + MetalPipelineType specialization_level = kernel_specialization_level; + + if (specialization_level < PSO_SPECIALIZED_INTERSECT) { + return; + } + + /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them + * synchronously. */ + compile_and_load(PSO_SPECIALIZED_INTERSECT); + + if (specialization_level < PSO_SPECIALIZED_SHADE) { + return; + } + if (!scene->params.background) { + /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to + * build. */ + return; + } + + /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and + * only if there isn't an existing load in flight. + */ + auto specialize_shade_fn = ^() { + compile_and_load(PSO_SPECIALIZED_SHADE); + async_compile_and_load = false; + }; + + bool async_specialize_shade = true; + + /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). */ + if (getenv("CYCLES_METAL_PROFILING") != nullptr) { + async_specialize_shade = false; + } + + if (async_specialize_shade) { + if (!async_compile_and_load) { + async_compile_and_load = true; + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), + specialize_shade_fn); + } + else { + metal_printf( + "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n"); + } + } + else { + specialize_shade_fn(); + } +} + void MetalDevice::const_copy_to(const char *name, void *host, size_t size) { - if (strcmp(name, "__data") == 0) { + if (strcmp(name, "data") == 0) { assert(size == sizeof(KernelData)); - memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size); + memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData)); return; } @@ -646,19 +745,19 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size) }; /* Update data storage pointers in launch parameters. */ - if (strcmp(name, "__integrator_state") == 0) { + if (strcmp(name, "integrator_state") == 0) { /* IntegratorStateGPU is contiguous pointers */ - const size_t pointer_block_size = sizeof(IntegratorStateGPU); + const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor); update_launch_pointers( - offsetof(KernelParamsMetal, __integrator_state), host, size, pointer_block_size); + offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size); } -# define KERNEL_TEX(data_type, tex_name) \ +# define KERNEL_DATA_ARRAY(data_type, tex_name) \ else if (strcmp(name, #tex_name) == 0) \ { \ update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \ } -# include "kernel/textures.h" -# undef KERNEL_TEX +# include "kernel/data_arrays.h" +# undef KERNEL_DATA_ARRAY } void MetalDevice::global_alloc(device_memory &mem) @@ -800,9 +899,9 @@ void MetalDevice::tex_alloc(device_texture &mem) desc.textureType = MTLTextureType3D; desc.depth = mem.data_depth; - VLOG(2) << "Texture 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Texture 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; assert(mtlTexture); @@ -834,9 +933,9 @@ void MetalDevice::tex_alloc(device_texture &mem) desc.storageMode = storage_mode; desc.usage = MTLTextureUsageShaderRead; - VLOG(2) << "Texture 2D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; + VLOG_WORK << "Texture 2D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; mtlTexture = [mtlDevice newTextureWithDescriptor:desc]; assert(mtlTexture); diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 69b2a686ecc..11393f8b7e1 100644 --- a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -31,7 +31,7 @@ enum { enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM }; /* Pipeline State Object types */ -enum { +enum MetalPipelineType { /* A kernel that can be used with all scenes, supporting all features. * It is slow to compile, but only needs to be compiled once and is then * cached for future render sessions. This allows a render to get underway @@ -39,28 +39,33 @@ enum { */ PSO_GENERIC, - /* A kernel that is relatively quick to compile, but is specialized for the - * scene being rendered. It only contains the functionality and even baked in - * constants for values that means it needs to be recompiled whenever a - * dependent setting is changed. The render performance of this kernel is - * significantly faster though, and justifies the extra compile time. + /* A intersection kernel that is very quick to specialize and results in faster intersection + * kernel performance. It uses Metal function constants to replace several KernelData variables + * with fixed constants. + */ + PSO_SPECIALIZED_INTERSECT, + + /* A shading kernel that is slow to specialize, but results in faster shading kernel performance + * rendered. It uses Metal function constants to replace several KernelData variables with fixed + * constants and short-circuit all unused SVM node case handlers. */ - /* METAL_WIP: This isn't used and will require more changes to enable. */ - PSO_SPECIALISED, + PSO_SPECIALIZED_SHADE, PSO_NUM }; -const char *kernel_type_as_string(int kernel_type); +const char *kernel_type_as_string(MetalPipelineType pso_type); struct MetalKernelPipeline { void compile(); id<MTLLibrary> mtlLibrary = nil; - bool scene_specialized; + MetalPipelineType pso_type; string source_md5; + size_t usage_count = 0; + KernelData kernel_data_; bool use_metalrt; bool metalrt_hair; bool metalrt_hair_thick; @@ -75,6 +80,8 @@ struct MetalKernelPipeline { id<MTLComputePipelineState> pipeline = nil; int num_threads_per_block = 0; + bool should_use_binary_archive() const; + string error_str; API_AVAILABLE(macos(11.0)) @@ -85,7 +92,8 @@ struct MetalKernelPipeline { /* Cache of Metal kernels for each DeviceKernel. */ namespace MetalDeviceKernels { -bool load(MetalDevice *device, bool scene_specialized); +bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type); +bool load(MetalDevice *device, MetalPipelineType pso_type); const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel); } /* namespace MetalDeviceKernels */ diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index fec4cd80466..385cb412b06 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -5,6 +5,7 @@ # include "device/metal/kernel.h" # include "device/metal/device_impl.h" +# include "kernel/device/metal/function_constants.h" # include "util/md5.h" # include "util/path.h" # include "util/tbb.h" @@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN /* limit to 2 MTLCompiler instances */ int max_mtlcompiler_threads = 2; -const char *kernel_type_as_string(int kernel_type) +const char *kernel_type_as_string(MetalPipelineType pso_type) { - switch (kernel_type) { + switch (pso_type) { case PSO_GENERIC: return "PSO_GENERIC"; - case PSO_SPECIALISED: - return "PSO_SPECIALISED"; + case PSO_SPECIALIZED_INTERSECT: + return "PSO_SPECIALIZED_INTERSECT"; + case PSO_SPECIALIZED_SHADE: + return "PSO_SPECIALIZED_SHADE"; default: assert(0); } @@ -50,7 +53,11 @@ struct ShaderCache { /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by * device. */ - void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized); + void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type); + + bool should_load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type); void wait_for_all(); @@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index) } } -void ShaderCache::load_kernel(DeviceKernel device_kernel, - MetalDevice *device, - bool scene_specialized) +bool ShaderCache::should_load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type) { - { - /* create compiler threads on first run */ - thread_scoped_lock lock(cache_mutex); - if (compile_threads.empty()) { - running = true; - for (int i = 0; i < max_mtlcompiler_threads; i++) { - compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); - } - } + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + /* Skip megakernel. */ + return false; } - if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - /* skip megakernel */ - return; + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) { + /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */ + return false; + } } - if (scene_specialized) { + if (pso_type != PSO_GENERIC) { /* Only specialize kernels where it can make an impact. */ if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - return; + return false; + } + + /* Only specialize shading / intersection kernels as requested. */ + bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND); + bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE); + if (is_shade_pso != is_shade_kernel) { + return false; } } @@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, /* check whether the kernel has already been requested / cached */ thread_scoped_lock lock(cache_mutex); for (auto &pipeline : pipelines[device_kernel]) { - if (scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { - /* we already requested a pipeline that is specialized for this kernel data */ - metal_printf("Specialized kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } + if (pipeline->source_md5 == device->source_md5[pso_type]) { + return false; } - else { - if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) { - /* we already requested a generic pipeline for this kernel */ - metal_printf("Generic kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } + } + } + + return true; +} + +void ShaderCache::load_kernel(DeviceKernel device_kernel, + MetalDevice *device, + MetalPipelineType pso_type) +{ + { + /* create compiler threads on first run */ + thread_scoped_lock lock(cache_mutex); + if (compile_threads.empty()) { + running = true; + for (int i = 0; i < max_mtlcompiler_threads; i++) { + compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); } } } + if (!should_load_kernel(device_kernel, device, pso_type)) { + return; + } + incomplete_requests++; PipelineRequest request; request.pipeline = new MetalKernelPipeline; - request.pipeline->scene_specialized = scene_specialized; + memcpy(&request.pipeline->kernel_data_, + &device->launch_params.data, + sizeof(request.pipeline->kernel_data_)); + request.pipeline->pso_type = pso_type; request.pipeline->mtlDevice = mtlDevice; - request.pipeline->source_md5 = - device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; - request.pipeline->mtlLibrary = - device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; + request.pipeline->source_md5 = device->source_md5[pso_type]; + request.pipeline->mtlLibrary = device->mtlLibrary[pso_type]; request.pipeline->device_kernel = device_kernel; request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; @@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel, { thread_scoped_lock lock(cache_mutex); - pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline)); + auto &collection = pipelines[device_kernel]; + + /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */ + int max_entries_of_same_pso_type = 3; + for (int i = (int)collection.size() - 1; i >= 0; i--) { + if (collection[i]->pso_type == pso_type) { + max_entries_of_same_pso_type -= 1; + if (max_entries_of_same_pso_type == 0) { + metal_printf("Purging oldest %s:%s kernel from ShaderCache\n", + kernel_type_as_string(pso_type), + device_kernel_as_string(device_kernel)); + collection.erase(collection.begin() + i); + break; + } + } + } + + collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline)); request_queue.push_back(request); } cond_var.notify_one(); @@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M continue; } - if (pipeline->scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { + if (pipeline->pso_type != PSO_GENERIC) { + if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] || + pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) { best_pipeline = pipeline.get(); } } @@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M } } + if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) { + metal_printf("Swapping in %s version of %s\n", + kernel_type_as_string(best_pipeline->pso_type), + device_kernel_as_string(kernel)); + } + best_pipeline->usage_count += 1; + return best_pipeline; } -void MetalKernelPipeline::compile() +bool MetalKernelPipeline::should_use_binary_archive() const { - int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC; + if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + if (atoi(str) != 0) { + /* Don't archive if we have opted out by env var. */ + return false; + } + } + + if (pso_type == PSO_GENERIC) { + /* Archive the generic kernels. */ + return true; + } + + if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND && + device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) { + /* Archive all shade kernels - they take a long time to compile. */ + return true; + } + + /* The remaining kernels are all fast to compile. They may get cached by the system shader cache, + * but will be quick to regenerate if not. */ + return false; +} + +static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr) +{ + MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new]; + + MTLDataType MTLDataType_int = MTLDataTypeInt; + MTLDataType MTLDataType_float = MTLDataTypeFloat; + MTLDataType MTLDataType_float4 = MTLDataTypeFloat4; + KernelData zero_data = {0}; + if (!data) { + data = &zero_data; + } +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + [constant_values setConstantValue:&data->parent.name \ + type:MTLDataType_##_type \ + atIndex:KernelData_##parent##_##name]; + +# include "kernel/data_template.h" + + return constant_values; +} + +void MetalKernelPipeline::compile() +{ const std::string function_name = std::string("cycles_metal_") + device_kernel_as_string(device_kernel); @@ -281,6 +371,17 @@ void MetalKernelPipeline::compile() if (@available(macOS 11.0, *)) { MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; func_desc.name = entryPoint; + + if (pso_type == PSO_SPECIALIZED_SHADE) { + func_desc.constantValues = GetConstantValues(&kernel_data_); + } + else if (pso_type == PSO_SPECIALIZED_INTERSECT) { + func_desc.constantValues = GetConstantValues(&kernel_data_); + } + else { + func_desc.constantValues = GetConstantValues(); + } + function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error]; } @@ -427,10 +528,7 @@ void MetalKernelPipeline::compile() MTLPipelineOption pipelineOptions = MTLPipelineOptionNone; - bool use_binary_archive = true; - if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { - use_binary_archive = (atoi(str) == 0); - } + bool use_binary_archive = should_use_binary_archive(); id<MTLBinaryArchive> archive = nil; string metalbin_path; @@ -608,19 +706,32 @@ void MetalKernelPipeline::compile() } } -bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized) +bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type) { + const double starttime = time_dt(); auto shader_cache = get_shader_cache(device->mtlDevice); for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { - shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized); + shader_cache->load_kernel((DeviceKernel)i, device, pso_type); } - if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) { - shader_cache->wait_for_all(); - } + shader_cache->wait_for_all(); + metal_printf("Back-end compilation finished in %.1f seconds (%s)\n", + time_dt() - starttime, + kernel_type_as_string(pso_type)); return true; } +bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type) +{ + auto shader_cache = get_shader_cache(device->mtlDevice); + for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { + if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) { + return true; + } + } + return false; +} + const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device, DeviceKernel kernel) { diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h index b0bd487c86d..fc32740f3e1 100644 --- a/intern/cycles/device/metal/queue.h +++ b/intern/cycles/device/metal/queue.h @@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue { virtual int num_concurrent_states(const size_t) const override; virtual int num_concurrent_busy_states() const override; + virtual int num_sort_partition_elements() const override; virtual void init_execution() override; diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 0e260886abb..5ac63a16c61 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -293,6 +293,11 @@ int MetalDeviceQueue::num_concurrent_busy_states() const return result; } +int MetalDeviceQueue::num_sort_partition_elements() const +{ + return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice); +} + void MetalDeviceQueue::init_execution() { /* Synchronize all textures and memory copies before executing task. */ @@ -311,8 +316,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, return false; } - VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " - << work_size; + VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " + << work_size; id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); @@ -358,8 +363,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */ /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */ - size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) + - sizeof(IntegratorStateGPU); + size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) + + offsetof(IntegratorStateGPU, sort_partition_divisor); size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset; memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset, (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset, @@ -415,8 +420,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */ - const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) + - sizeof(IntegratorStateGPU); + const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) + + offsetof(IntegratorStateGPU, sort_partition_divisor); for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) { int pointer_index = int(offset / sizeof(device_ptr)); MetalDevice::MetalMem *mmem = *( @@ -550,7 +555,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, /* Enhanced command buffer errors are only available in 11.0+ */ if (@available(macos 11.0, *)) { if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) { - printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]); + metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]); NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo valueForKey:MTLCommandBufferEncoderInfoErrorKey]; if (encoderInfos != nil) { @@ -564,7 +569,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } } else if (command_buffer.error) { - printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]); + metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]); } } }]; diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h index f728967835d..a988d01d361 100644 --- a/intern/cycles/device/metal/util.h +++ b/intern/cycles/device/metal/util.h @@ -25,10 +25,20 @@ enum MetalGPUVendor { METAL_GPU_INTEL = 3, }; +enum AppleGPUArchitecture { + APPLE_UNKNOWN, + APPLE_M1, + APPLE_M2, +}; + /* Contains static Metal helper functions. */ struct MetalInfo { static vector<id<MTLDevice>> const &get_usable_devices(); - static MetalGPUVendor get_vendor_from_device_name(string const &device_name); + static int get_apple_gpu_core_count(id<MTLDevice> device); + static MetalGPUVendor get_device_vendor(id<MTLDevice> device); + static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device); + static int optimal_sort_partition_elements(id<MTLDevice> device); + static string get_device_name(id<MTLDevice> device); }; /* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */ diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm index a6bd593bcb6..65c67c400fe 100644 --- a/intern/cycles/device/metal/util.mm +++ b/intern/cycles/device/metal/util.mm @@ -10,26 +10,83 @@ # include "util/string.h" # include "util/time.h" +# include <IOKit/IOKitLib.h> # include <pwd.h> # include <sys/shm.h> # include <time.h> CCL_NAMESPACE_BEGIN -MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name) +string MetalInfo::get_device_name(id<MTLDevice> device) { - if (device_name.find("Intel") != string::npos) { + string device_name = [device.name UTF8String]; + if (get_device_vendor(device) == METAL_GPU_APPLE) { + /* Append the GPU core count so we can distinguish between GPU variants in benchmarks. */ + int gpu_core_count = get_apple_gpu_core_count(device); + device_name += string_printf(gpu_core_count ? " (GPU - %d cores)" : " (GPU)", gpu_core_count); + } + return device_name; +} + +int MetalInfo::get_apple_gpu_core_count(id<MTLDevice> device) +{ + int core_count = 0; + if (@available(macos 12.0, *)) { + io_service_t gpu_service = IOServiceGetMatchingService( + kIOMainPortDefault, IORegistryEntryIDMatching(device.registryID)); + if (CFNumberRef numberRef = (CFNumberRef)IORegistryEntryCreateCFProperty( + gpu_service, CFSTR("gpu-core-count"), 0, 0)) { + if (CFGetTypeID(numberRef) == CFNumberGetTypeID()) { + CFNumberGetValue(numberRef, kCFNumberSInt32Type, &core_count); + } + CFRelease(numberRef); + } + } + return core_count; +} + +AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device) +{ + const char *device_name = [device.name UTF8String]; + if (strstr(device_name, "M1")) { + return APPLE_M1; + } + else if (strstr(device_name, "M2")) { + return APPLE_M2; + } + return APPLE_UNKNOWN; +} + +MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device) +{ + const char *device_name = [device.name UTF8String]; + if (strstr(device_name, "Intel")) { return METAL_GPU_INTEL; } - else if (device_name.find("AMD") != string::npos) { + else if (strstr(device_name, "AMD")) { return METAL_GPU_AMD; } - else if (device_name.find("Apple") != string::npos) { + else if (strstr(device_name, "Apple")) { return METAL_GPU_APPLE; } return METAL_GPU_UNKNOWN; } +int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device) +{ + if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) { + return atoi(str); + } + + /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before + * sorting each partition by material. Partitioning into chunks of 65536 elements results in an + * overall render time speedup of up to 15%. */ + if (get_device_vendor(device) == METAL_GPU_APPLE) { + return 65536; + } + return 0; +} + vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() { static vector<id<MTLDevice>> usable_devices; @@ -41,9 +98,8 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() metal_printf("Usable Metal devices:\n"); for (id<MTLDevice> device in MTLCopyAllDevices()) { - const char *device_name = [device.name UTF8String]; - - MetalGPUVendor vendor = get_vendor_from_device_name(device_name); + string device_name = get_device_name(device); + MetalGPUVendor vendor = get_device_vendor(device); bool usable = false; if (@available(macos 12.2, *)) { @@ -55,12 +111,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices() } if (usable) { - metal_printf("- %s\n", device_name); + metal_printf("- %s\n", device_name.c_str()); [device retain]; usable_devices.push_back(device); } else { - metal_printf(" (skipping \"%s\")\n", device_name); + metal_printf(" (skipping \"%s\")\n", device_name.c_str()); } } if (usable_devices.empty()) { |