diff options
Diffstat (limited to 'intern/cycles/device/metal/device_impl.mm')
-rw-r--r-- | intern/cycles/device/metal/device_impl.mm | 213 |
1 files changed, 156 insertions, 57 deletions
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index a0ac677beda..d1250b83d22 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -6,9 +6,12 @@ # include "device/metal/device_impl.h" # include "device/metal/device.h" +# include "scene/scene.h" + # include "util/debug.h" # include "util/md5.h" # include "util/path.h" +# include "util/time.h" CCL_NAMESPACE_BEGIN @@ -35,7 +38,7 @@ void MetalDevice::set_error(const string &error) } MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) + : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) { mtlDevId = info.num; @@ -43,10 +46,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile auto usable_devices = MetalInfo::get_usable_devices(); assert(mtlDevId < usable_devices.size()); mtlDevice = usable_devices[mtlDevId]; - device_name = [mtlDevice.name UTF8String]; - device_vendor = MetalInfo::get_vendor_from_device_name(device_name); + device_vendor = MetalInfo::get_device_vendor(mtlDevice); assert(device_vendor != METAL_GPU_UNKNOWN); - metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str()); + metal_printf("Creating new Cycles device for Metal: %s\n", info.description.c_str()); /* determine default storage mode based on whether UMA is supported */ @@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile case METAL_GPU_APPLE: { max_threads_per_threadgroup = 512; use_metalrt = info.use_metalrt; + + /* Specialize the intersection kernels on Apple GPUs by default as these can be built very + * quickly. */ + kernel_specialization_level = PSO_SPECIALIZED_INTERSECT; break; } } @@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile capture_enabled = true; } + if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) { + kernel_specialization_level = (MetalPipelineType)atoi(envstr); + } + metal_printf("kernel_specialization_level = %s\n", + kernel_type_as_string( + (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1))); + MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init]; arg_desc_params.dataType = MTLDataTypePointer; arg_desc_params.access = MTLArgumentAccessReadOnly; @@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation() return DebugFlags().metal.adaptive_compile; } -string MetalDevice::get_source(const uint kernel_features) +void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features) { - string build_options; - + string global_defines; if (use_adaptive_compilation()) { - build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n"; } if (use_metalrt) { - build_options += "-D__METALRT__ "; + global_defines += "#define __METALRT__\n"; if (motion_blur) { - build_options += "-D__METALRT_MOTION__ "; + global_defines += "#define __METALRT_MOTION__\n"; } } # ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; + global_defines += "#define __KERNEL_DEBUG__\n"; # endif switch (device_vendor) { default: break; case METAL_GPU_INTEL: - build_options += "-D__KERNEL_METAL_INTEL__ "; + global_defines += "#define __KERNEL_METAL_INTEL__\n"; break; case METAL_GPU_AMD: - build_options += "-D__KERNEL_METAL_AMD__ "; + global_defines += "#define __KERNEL_METAL_AMD__\n"; break; case METAL_GPU_APPLE: - build_options += "-D__KERNEL_METAL_APPLE__ "; + global_defines += "#define __KERNEL_METAL_APPLE__\n"; break; } - /* reformat -D defines list into compilable form */ - vector<string> components; - string_replace(build_options, "-D", ""); - string_split(components, build_options, " "); + string &source = this->source[pso_type]; + source = "\n#include \"kernel/device/metal/kernel.metal\"\n"; + source = path_source_replace_includes(source, path_get("source")); - string globalDefines; - for (const string &component : components) { - vector<string> assignments; - string_split(assignments, component, "="); - if (assignments.size() == 2) - globalDefines += string_printf( - "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str()); - else - globalDefines += string_printf("#define %s\n", assignments[0].c_str()); + /* Perform any required specialization on the source. + * With Metal function constants we can generate a single variant of the kernel source which can + * be repeatedly respecialized. + */ + string baked_constants; + + /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of + * the same character length. Build a string of all active constant values which is then hashed + * in order to identify the PSO. + */ + if (pso_type != PSO_GENERIC) { + const double starttime = time_dt(); + +# define KERNEL_STRUCT_BEGIN(name, parent) \ + string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_"); + + /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */ +# define KERNEL_STRUCT_MEMBER(parent, _type, name) \ + baked_constants += string(#parent "." #name "=") + \ + to_string(_type(launch_params.data.parent.name)) + "\n"; + +# include "kernel/data_template.h" + + /* Opt in to all of available specializations. This can be made more granular for the + * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests, + * but the overhead should be negligible as these are very quick to (re)build and aren't + * serialized to disk via MTLBinaryArchives. + */ + global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n"; + + metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0); } - string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n"; - source = path_source_replace_includes(source, path_get("source")); - - metal_printf("Global defines:\n%s\n", globalDefines.c_str()); + source = global_defines + source; + metal_printf("================\n%s================\n\%s================\n", + global_defines.c_str(), + baked_constants.c_str()); - return source; + /* Generate an MD5 from the source and include any baked constants. This is used when caching + * PSOs. */ + MD5Hash md5; + md5.append(baked_constants); + md5.append(source); + source_md5[pso_type] = md5.get_hex(); } bool MetalDevice::load_kernels(const uint _kernel_features) @@ -279,24 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. */ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - source[PSO_GENERIC] = get_source(kernel_features); - mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - - MD5Hash md5; - md5.append(source[PSO_GENERIC]); - source_md5[PSO_GENERIC] = md5.get_hex(); - - metal_printf("Front-end compilation finished (generic)\n"); - - bool result = MetalDeviceKernels::load(this, false); + bool result = compile_and_load(PSO_GENERIC); reserve_local_memory(kernel_features); - return result; } -id<MTLLibrary> MetalDevice::compile(string const &source) +bool MetalDevice::compile_and_load(MetalPipelineType pso_type) { + make_source(pso_type, kernel_features); + + if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) { + /* We already have a full set of matching pipelines which are cached or queued. */ + metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type)); + return true; + } + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; @@ -304,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source) options.languageVersion = MTLLanguageVersion2_4; } + if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) { + path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))), + source[pso_type]); + } + + const double starttime = time_dt(); + NSError *error = NULL; - id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str()) - options:options - error:&error]; + mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str()) + options:options + error:&error]; - if (!mtlLibrary) { + if (!mtlLibrary[pso_type]) { NSString *err = [error localizedDescription]; set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); } + metal_printf("Front-end compilation finished in %.1f seconds (%s)\n", + time_dt() - starttime, + kernel_type_as_string(pso_type)); + [options release]; - return mtlLibrary; + return MetalDeviceKernels::load(this, pso_type); } void MetalDevice::reserve_local_memory(const uint kernel_features) @@ -623,11 +670,63 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz return 0; } +void MetalDevice::optimize_for_scene(Scene *scene) +{ + MetalPipelineType specialization_level = kernel_specialization_level; + + if (specialization_level < PSO_SPECIALIZED_INTERSECT) { + return; + } + + /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them + * synchronously. */ + compile_and_load(PSO_SPECIALIZED_INTERSECT); + + if (specialization_level < PSO_SPECIALIZED_SHADE) { + return; + } + if (!scene->params.background) { + /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to + * build. */ + return; + } + + /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and + * only if there isn't an existing load in flight. + */ + auto specialize_shade_fn = ^() { + compile_and_load(PSO_SPECIALIZED_SHADE); + async_compile_and_load = false; + }; + + bool async_specialize_shade = true; + + /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). */ + if (getenv("CYCLES_METAL_PROFILING") != nullptr) { + async_specialize_shade = false; + } + + if (async_specialize_shade) { + if (!async_compile_and_load) { + async_compile_and_load = true; + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), + specialize_shade_fn); + } + else { + metal_printf( + "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n"); + } + } + else { + specialize_shade_fn(); + } +} + void MetalDevice::const_copy_to(const char *name, void *host, size_t size) { - if (strcmp(name, "__data") == 0) { + if (strcmp(name, "data") == 0) { assert(size == sizeof(KernelData)); - memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size); + memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData)); return; } @@ -646,19 +745,19 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size) }; /* Update data storage pointers in launch parameters. */ - if (strcmp(name, "__integrator_state") == 0) { + if (strcmp(name, "integrator_state") == 0) { /* IntegratorStateGPU is contiguous pointers */ - const size_t pointer_block_size = sizeof(IntegratorStateGPU); + const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor); update_launch_pointers( - offsetof(KernelParamsMetal, __integrator_state), host, size, pointer_block_size); + offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size); } -# define KERNEL_TEX(data_type, tex_name) \ +# define KERNEL_DATA_ARRAY(data_type, tex_name) \ else if (strcmp(name, #tex_name) == 0) \ { \ update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \ } -# include "kernel/textures.h" -# undef KERNEL_TEX +# include "kernel/data_arrays.h" +# undef KERNEL_DATA_ARRAY } void MetalDevice::global_alloc(device_memory &mem) |