diff options
author | Brecht Van Lommel <brecht@blender.org> | 2022-04-28 01:46:14 +0300 |
---|---|---|
committer | Brecht Van Lommel <brecht@blender.org> | 2022-04-28 01:46:43 +0300 |
commit | 52a5f68562680c0ccd6d4e525098bb5e2af7d0bd (patch) | |
tree | e370e4a7fca575aad8ff8e9f1b975b4fd98a9c04 | |
parent | 3558f565f1e8a8e5dc49067cc0500cbf993af69e (diff) |
Revert "Cycles: Enable inlining on Apple Silicon for 1.1x speedup"
This reverts commit b82de02e7ce857e20b842a074c0068b146a9fd79. It is causing
crashes in various regression tests.
Ref D14763
-rw-r--r-- | intern/cycles/device/metal/device_impl.h | 9 | ||||
-rw-r--r-- | intern/cycles/device/metal/device_impl.mm | 101 | ||||
-rw-r--r-- | intern/cycles/device/metal/kernel.h | 103 | ||||
-rw-r--r-- | intern/cycles/device/metal/kernel.mm | 849 | ||||
-rw-r--r-- | intern/cycles/device/metal/queue.mm | 9 | ||||
-rw-r--r-- | intern/cycles/kernel/device/metal/compat.h | 24 |
6 files changed, 539 insertions, 556 deletions
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h index d7311ee985f..27c58ce6d2f 100644 --- a/intern/cycles/device/metal/device_impl.h +++ b/intern/cycles/device/metal/device_impl.h @@ -28,8 +28,7 @@ class MetalDevice : public Device { id<MTLCommandQueue> mtlGeneralCommandQueue = nil; id<MTLArgumentEncoder> mtlAncillaryArgEncoder = nil; /* encoder used for fetching device pointers from MTLBuffers */ - string source[PSO_NUM]; - string source_md5[PSO_NUM]; + string source_used_for_compile[PSO_NUM]; KernelParamsMetal launch_params = {0}; @@ -111,12 +110,6 @@ class MetalDevice : public Device { virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override; - id<MTLLibrary> compile(string const &source); - - const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const; - - bool kernel_available(DeviceKernel kernel) const; - /* ------------------------------------------------------------------ */ /* low-level memory management */ diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm index 7d1212cb37c..c01f51fb506 100644 --- a/intern/cycles/device/metal/device_impl.mm +++ b/intern/cycles/device/metal/device_impl.mm @@ -275,44 +275,96 @@ bool MetalDevice::load_kernels(const uint _kernel_features) * active, but may still need to be rendered without motion blur if that isn't active as well. */ motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION; - source[PSO_GENERIC] = get_source(kernel_features); - mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]); - - MD5Hash md5; - md5.append(source[PSO_GENERIC]); - source_md5[PSO_GENERIC] = md5.get_hex(); - - metal_printf("Front-end compilation finished (generic)\n"); - - bool result = kernels.load(this, false); - - reserve_local_memory(kernel_features); + NSError *error = NULL; - return result; -} + for (int i = 0; i < PSO_NUM; i++) { + if (mtlLibrary[i]) { + [mtlLibrary[i] release]; + mtlLibrary[i] = nil; + } + } -id<MTLLibrary> MetalDevice::compile(string const &source) -{ MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; options.fastMathEnabled = YES; if (@available(macOS 12.0, *)) { options.languageVersion = MTLLanguageVersion2_4; } + else { + return false; + } - NSError *error = NULL; - id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str()) + string metalsrc; + + /* local helper: dump source to disk and return filepath */ + auto dump_source = [&](int kernel_type) -> string { + string &source = source_used_for_compile[kernel_type]; + string metalsrc = path_cache_get(path_join("kernels", + string_printf("%s.%s.metal", + kernel_type_as_string(kernel_type), + util_md5_string(source).c_str()))); + path_write_text(metalsrc, source); + return metalsrc; + }; + + /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor, + * then compile it into a MTLLibrary */ + auto fetch_and_compile_source = [&](int kernel_type) { + /* Record the source used to compile this library, for hash building later. */ + string &source = source_used_for_compile[kernel_type]; + + switch (kernel_type) { + case PSO_GENERIC: { + source = get_source(kernel_features); + break; + } + case PSO_SPECIALISED: { + /* PSO_SPECIALISED derives from PSO_GENERIC */ + string &generic_source = source_used_for_compile[PSO_GENERIC]; + if (generic_source.empty()) { + generic_source = get_source(kernel_features); + } + source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source; + break; + } + default: + assert(0); + } + + /* create MTLLibrary (front-end compilation) */ + mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str()) options:options error:&error]; - if (!mtlLibrary) { - NSString *err = [error localizedDescription]; - set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); + bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr); + + if (!mtlLibrary[kernel_type] || do_source_dump) { + string metalsrc = dump_source(kernel_type); + + if (!mtlLibrary[kernel_type]) { + NSString *err = [error localizedDescription]; + set_error(string_printf("Failed to compile library:\n%s", [err UTF8String])); + + return false; + } + } + return true; + }; + + fetch_and_compile_source(PSO_GENERIC); + + if (use_function_specialisation) { + fetch_and_compile_source(PSO_SPECIALISED); } + metal_printf("Front-end compilation finished\n"); + + bool result = kernels.load(this, PSO_GENERIC); + [options release]; + reserve_local_memory(kernel_features); - return mtlLibrary; + return result; } void MetalDevice::reserve_local_memory(const uint kernel_features) @@ -619,11 +671,6 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz return 0; } -const MetalKernelPipeline &MetalDevice::get_best_pipeline(DeviceKernel kernel) const -{ - return kernels.get_best_pipeline(this, kernel); -} - void MetalDevice::const_copy_to(const char *name, void *host, size_t size) { if (strcmp(name, "__data") == 0) { diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h index 7e398d1cf41..b12491d820d 100644 --- a/intern/cycles/device/metal/kernel.h +++ b/intern/cycles/device/metal/kernel.h @@ -54,41 +54,98 @@ enum { const char *kernel_type_as_string(int kernel_type); struct MetalKernelPipeline { + void release() + { + if (pipeline) { + [pipeline release]; + pipeline = nil; + if (@available(macOS 11.0, *)) { + for (int i = 0; i < METALRT_TABLE_NUM; i++) { + if (intersection_func_table[i]) { + [intersection_func_table[i] release]; + intersection_func_table[i] = nil; + } + } + } + } + if (function) { + [function release]; + function = nil; + } + if (@available(macOS 11.0, *)) { + for (int i = 0; i < METALRT_TABLE_NUM; i++) { + if (intersection_func_table[i]) { + [intersection_func_table[i] release]; + } + } + } + } - void compile(); - - id<MTLLibrary> mtlLibrary = nil; - bool scene_specialized; - string source_md5; - - bool use_metalrt; - bool metalrt_hair; - bool metalrt_hair_thick; - bool metalrt_pointcloud; - - int threads_per_threadgroup; - - DeviceKernel device_kernel; bool loaded = false; - id<MTLDevice> mtlDevice = nil; id<MTLFunction> function = nil; id<MTLComputePipelineState> pipeline = nil; - int num_threads_per_block = 0; - - string error_str; API_AVAILABLE(macos(11.0)) id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil}; - id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil}; +}; + +struct MetalKernelLoadDesc { + int pso_index = 0; + const char *function_name = nullptr; + int kernel_index = 0; + int threads_per_threadgroup = 0; + MTLFunctionConstantValues *constant_values = nullptr; + NSArray *linked_functions = nullptr; + + struct IntersectorFunctions { + NSArray *defaults; + NSArray *shadow; + NSArray *local; + NSArray *operator[](int index) const + { + if (index == METALRT_TABLE_DEFAULT) + return defaults; + if (index == METALRT_TABLE_SHADOW) + return shadow; + return local; + } + } intersector_functions = {nullptr}; +}; + +/* Metal kernel and associate occupancy information. */ +class MetalDeviceKernel { + public: + ~MetalDeviceKernel(); + + bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5); + + void mark_loaded(int pso_index) + { + pso[pso_index].loaded = true; + } + + int get_num_threads_per_block() const + { + return num_threads_per_block; + } + const MetalKernelPipeline &get_pso() const; + + double load_duration = 0.0; + + private: + MetalKernelPipeline pso[PSO_NUM]; + + int num_threads_per_block = 0; }; /* Cache of Metal kernels for each DeviceKernel. */ class MetalDeviceKernels { public: - bool load(MetalDevice *device, bool scene_specialized); - bool available(const MetalDevice *device, DeviceKernel kernel) const; - const MetalKernelPipeline &get_best_pipeline(const MetalDevice *device, - DeviceKernel kernel) const; + bool load(MetalDevice *device, int kernel_type); + bool available(DeviceKernel kernel) const; + const MetalDeviceKernel &get(DeviceKernel kernel) const; + + MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM]; id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}}; diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm index 44a5e23d00f..9555ca03c8e 100644 --- a/intern/cycles/device/metal/kernel.mm +++ b/intern/cycles/device/metal/kernel.mm @@ -9,7 +9,6 @@ # include "util/path.h" # include "util/tbb.h" # include "util/time.h" -# include "util/unique_ptr.h" CCL_NAMESPACE_BEGIN @@ -29,370 +28,82 @@ const char *kernel_type_as_string(int kernel_type) return ""; } -bool kernel_has_intersection(DeviceKernel device_kernel) +MetalDeviceKernel::~MetalDeviceKernel() { - return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || - device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || - device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || - device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE); -} - -struct ShaderCache { - ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice) - { - } - ~ShaderCache(); - - /* Get the fastest available pipeline for the specified kernel. */ - MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device); - - /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by - * device. */ - void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized); - - void wait_for_all(); - - private: - friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice); - - void compile_thread_func(int thread_index); - - using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>; - - struct PipelineRequest { - MetalKernelPipeline *pipeline = nullptr; - std::function<void(MetalKernelPipeline *)> completionHandler; - }; - - std::mutex cache_mutex; - - PipelineCollection pipelines[DEVICE_KERNEL_NUM]; - id<MTLDevice> mtlDevice; - - bool running = false; - std::condition_variable cond_var; - std::deque<PipelineRequest> request_queue; - std::vector<std::thread> compile_threads; - std::atomic_int incomplete_requests = 0; -}; - -std::mutex g_shaderCacheMutex; -std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache; - -ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice) -{ - thread_scoped_lock lock(g_shaderCacheMutex); - auto it = g_shaderCache.find(mtlDevice); - if (it != g_shaderCache.end()) { - return it->second.get(); - } - - g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice); - return g_shaderCache[mtlDevice].get(); -} - -ShaderCache::~ShaderCache() -{ - running = false; - cond_var.notify_all(); - for (auto &thread : compile_threads) { - thread.join(); - } -} - -void ShaderCache::wait_for_all() -{ - while (incomplete_requests > 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + for (int i = 0; i < PSO_NUM; i++) { + pso[i].release(); } } -void ShaderCache::compile_thread_func(int thread_index) +bool MetalDeviceKernel::load(MetalDevice *device, + MetalKernelLoadDesc const &desc_in, + MD5Hash const &md5) { - while (1) { - - /* wait for / acquire next request */ - PipelineRequest request; - { - thread_scoped_lock lock(cache_mutex); - cond_var.wait(lock, [&] { return !running || !request_queue.empty(); }); - if (!running) { - break; - } - - if (!request_queue.empty()) { - request = request_queue.front(); - request_queue.pop_front(); - } - } - - /* service request */ - if (request.pipeline) { - request.pipeline->compile(); - incomplete_requests--; - } - } -} - -void ShaderCache::load_kernel(DeviceKernel device_kernel, - MetalDevice *device, - bool scene_specialized) -{ - { - /* create compiler threads on first run */ - thread_scoped_lock lock(cache_mutex); - if (compile_threads.empty()) { - running = true; - for (int i = 0; i < max_mtlcompiler_threads; i++) { - compile_threads.push_back(std::thread([&] { compile_thread_func(i); })); - } - } - } - - if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + __block MetalKernelLoadDesc const desc(desc_in); + if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { /* skip megakernel */ - return; + return true; } - if (scene_specialized) { - /* Only specialize kernels where it can make an impact. */ - if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || - device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { - return; - } - } - - { - /* check whether the kernel has already been requested / cached */ - thread_scoped_lock lock(cache_mutex); - for (auto &pipeline : pipelines[device_kernel]) { - if (scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { - /* we already requested a pipeline that is specialised for this kernel data */ - metal_printf("Specialized kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } - } - else { - if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) { - /* we already requested a generic pipeline for this kernel */ - metal_printf("Generic kernel already requested (%s)\n", - device_kernel_as_string(device_kernel)); - return; - } - } - } - } - - incomplete_requests++; - - PipelineRequest request; - request.pipeline = new MetalKernelPipeline; - request.pipeline->scene_specialized = scene_specialized; - request.pipeline->mtlDevice = mtlDevice; - request.pipeline->source_md5 = - device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; - request.pipeline->mtlLibrary = - device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC]; - request.pipeline->device_kernel = device_kernel; - request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup; - - /* metalrt options */ - request.pipeline->use_metalrt = device->use_metalrt; - request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; - - { - thread_scoped_lock lock(cache_mutex); - pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline)); - request_queue.push_back(request); + bool use_binary_archive = true; + if (device->device_vendor == METAL_GPU_APPLE) { + /* Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render */ + use_binary_archive = false; } - cond_var.notify_one(); -} -MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device) -{ - thread_scoped_lock lock(cache_mutex); - auto &collection = pipelines[kernel]; - if (collection.empty()) { - return nullptr; + if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { + use_binary_archive = (atoi(str) == 0); } - /* metalrt options */ - bool use_metalrt = device->use_metalrt; - bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR; - bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK; - bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD; - - MetalKernelPipeline *best_pipeline = nullptr; - for (auto &pipeline : collection) { - if (!pipeline->loaded) { - /* still loading - ignore */ - continue; - } - - if (pipeline->use_metalrt != use_metalrt || pipeline->metalrt_hair != metalrt_hair || - pipeline->metalrt_hair_thick != metalrt_hair_thick || - pipeline->metalrt_pointcloud != metalrt_pointcloud) { - /* wrong combination of metalrt options */ - continue; - } + id<MTLBinaryArchive> archive = nil; + string metalbin_path; + if (use_binary_archive) { + NSProcessInfo *processInfo = [NSProcessInfo processInfo]; + string osVersion = [[processInfo operatingSystemVersionString] UTF8String]; + MD5Hash local_md5(md5); + local_md5.append(osVersion); + string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() + + to_string(desc.pso_index) + ".bin"; + metalbin_path = path_cache_get(path_join("kernels", metalbin_name)); + path_create_directories(metalbin_path); - if (pipeline->scene_specialized) { - if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) { - best_pipeline = pipeline.get(); + if (path_exists(metalbin_path) && use_binary_archive) { + if (@available(macOS 11.0, *)) { + MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; + archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())]; + archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; + [archiveDesc release]; } } - else if (!best_pipeline) { - best_pipeline = pipeline.get(); - } - } - - return best_pipeline; -} - -void MetalKernelPipeline::compile() -{ - int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC; - - const std::string function_name = std::string("cycles_metal_") + - device_kernel_as_string(device_kernel); - - int threads_per_threadgroup = this->threads_per_threadgroup; - if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && - device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) { - /* Always use 512 for the sorting kernels */ - threads_per_threadgroup = 512; } - NSString *entryPoint = [@(function_name.c_str()) copy]; + NSString *entryPoint = [@(desc.function_name) copy]; NSError *error = NULL; if (@available(macOS 11.0, *)) { MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; func_desc.name = entryPoint; - function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error]; + if (desc.constant_values) { + func_desc.constantValues = desc.constant_values; + } + pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index] + newFunctionWithDescriptor:func_desc + error:&error]; } - [entryPoint release]; - if (function == nil) { + if (pso[desc.pso_index].function == nil) { NSString *err = [error localizedDescription]; string errors = [err UTF8String]; - metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str()); - return; - } - function.label = [entryPoint copy]; - - if (use_metalrt) { - if (@available(macOS 11.0, *)) { - /* create the id<MTLFunction> for each intersection function */ - const char *function_names[] = { - "__anyhit__cycles_metalrt_visibility_test_tri", - "__anyhit__cycles_metalrt_visibility_test_box", - "__anyhit__cycles_metalrt_shadow_all_hit_tri", - "__anyhit__cycles_metalrt_shadow_all_hit_box", - "__anyhit__cycles_metalrt_local_hit_tri", - "__anyhit__cycles_metalrt_local_hit_box", - "__intersection__curve_ribbon", - "__intersection__curve_ribbon_shadow", - "__intersection__curve_all", - "__intersection__curve_all_shadow", - "__intersection__point", - "__intersection__point_shadow", - }; - assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM); - - MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; - for (int i = 0; i < METALRT_FUNC_NUM; i++) { - const char *function_name = function_names[i]; - desc.name = [@(function_name) copy]; - - NSError *error = NULL; - rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc error:&error]; - - if (rt_intersection_function[i] == nil) { - NSString *err = [error localizedDescription]; - string errors = [err UTF8String]; - - error_str = string_printf( - "Error getting intersection function \"%s\": %s", function_name, errors.c_str()); - break; - } - - rt_intersection_function[i].label = [@(function_name) copy]; - } - } + device->set_error( + string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str())); + return false; } - NSArray *table_functions[METALRT_TABLE_NUM] = {nil}; - NSArray *linked_functions = nil; - - if (use_metalrt) { - id<MTLFunction> curve_intersect_default = nil; - id<MTLFunction> curve_intersect_shadow = nil; - id<MTLFunction> point_intersect_default = nil; - id<MTLFunction> point_intersect_shadow = nil; - if (metalrt_hair) { - /* Add curve intersection programs. */ - if (metalrt_hair_thick) { - /* Slower programs for thick hair since that also slows down ribbons. - * Ideally this should not be needed. */ - curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL]; - curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW]; - } - else { - curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON]; - curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW]; - } - } - if (metalrt_pointcloud) { - point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT]; - point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW]; - } - table_functions[METALRT_TABLE_DEFAULT] = [NSArray - arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI], - curve_intersect_default ? - curve_intersect_default : - rt_intersection_function[METALRT_FUNC_DEFAULT_BOX], - point_intersect_default ? - point_intersect_default : - rt_intersection_function[METALRT_FUNC_DEFAULT_BOX], - nil]; - table_functions[METALRT_TABLE_SHADOW] = [NSArray - arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI], - curve_intersect_shadow ? - curve_intersect_shadow : - rt_intersection_function[METALRT_FUNC_SHADOW_BOX], - point_intersect_shadow ? - point_intersect_shadow : - rt_intersection_function[METALRT_FUNC_SHADOW_BOX], - nil]; - table_functions[METALRT_TABLE_LOCAL] = [NSArray - arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI], - rt_intersection_function[METALRT_FUNC_LOCAL_BOX], - rt_intersection_function[METALRT_FUNC_LOCAL_BOX], - nil]; - - NSMutableSet *unique_functions = [NSMutableSet - setWithArray:table_functions[METALRT_TABLE_DEFAULT]]; - [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]]; - [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]]; - - if (kernel_has_intersection(device_kernel)) { - linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]] - sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) { - return [f1.label compare:f2.label]; - }]; - } - unique_functions = nil; - } + pso[desc.pso_index].function.label = [@(desc.function_name) copy]; - MTLComputePipelineDescriptor *computePipelineStateDescriptor = + __block MTLComputePipelineDescriptor *computePipelineStateDescriptor = [[MTLComputePipelineDescriptor alloc] init]; computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable; @@ -400,86 +111,52 @@ void MetalKernelPipeline::compile() computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable; if (@available(macos 10.14, *)) { - computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup; + computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup; } computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true; - computePipelineStateDescriptor.computeFunction = function; - + computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function; if (@available(macOS 11.0, *)) { /* Attach the additional functions to an MTLLinkedFunctions object */ - if (linked_functions) { + if (desc.linked_functions) { computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init]; - computePipelineStateDescriptor.linkedFunctions.functions = linked_functions; + computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions; } + computePipelineStateDescriptor.maxCallStackDepth = 1; - if (use_metalrt) { - computePipelineStateDescriptor.maxCallStackDepth = 8; - } } + /* Create a new Compute pipeline state object */ MTLPipelineOption pipelineOptions = MTLPipelineOptionNone; - bool use_binary_archive = true; - if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) { - use_binary_archive = (atoi(str) == 0); - } - - id<MTLBinaryArchive> archive = nil; - string metalbin_path; - string metalbin_name; - if (use_binary_archive) { - NSProcessInfo *processInfo = [NSProcessInfo processInfo]; - string osVersion = [[processInfo operatingSystemVersionString] UTF8String]; - MD5Hash local_md5; - local_md5.append(source_md5); - local_md5.append(osVersion); - local_md5.append((uint8_t *)&this->threads_per_threadgroup, - sizeof(this->threads_per_threadgroup)); - - string options; - if (use_metalrt && kernel_has_intersection(device_kernel)) { - /* incorporate any MetalRT specialisations into the archive name */ - options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d", - metalrt_hair ? 1 : 0, - metalrt_hair_thick ? 1 : 0, - metalrt_pointcloud ? 1 : 0); - } - - /* Replace non-alphanumerical characters with underscores. */ - string device_name = [mtlDevice.name UTF8String]; - for (char &c : device_name) { - if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) { - c = '_'; - } - } - - metalbin_name = device_name; - metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel)); - metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type)); - metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin"); - - metalbin_path = path_cache_get(path_join("kernels", metalbin_name)); - path_create_directories(metalbin_path); - - if (path_exists(metalbin_path) && use_binary_archive) { - if (@available(macOS 11.0, *)) { - MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; - archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())]; - archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; - [archiveDesc release]; - } - } - } - - __block bool creating_new_archive = false; + bool creating_new_archive = false; if (@available(macOS 11.0, *)) { if (use_binary_archive) { if (!archive) { MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init]; archiveDesc.url = nil; - archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; + archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil]; creating_new_archive = true; + + double starttime = time_dt(); + + if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor + error:&error]) { + NSString *errStr = [error localizedDescription]; + metal_printf("Failed to add PSO to archive:\n%s\n", + errStr ? [errStr UTF8String] : "nil"); + } + else { + double duration = time_dt() - starttime; + metal_printf("%2d | %-55s | %7.2fs\n", + desc.kernel_index, + device_kernel_as_string((DeviceKernel)desc.kernel_index), + duration); + + if (desc.pso_index == PSO_GENERIC) { + this->load_duration = duration; + } + } } computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil]; pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss; @@ -493,14 +170,17 @@ void MetalKernelPipeline::compile() MTLComputePipelineReflection *reflection, NSError *error) { bool recreate_archive = false; - if (computePipelineState == nil && archive) { + if (computePipelineState == nil && archive && !creating_new_archive) { + + assert(0); + NSString *errStr = [error localizedDescription]; metal_printf( "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... " "(error: %s)\n", - device_kernel_as_string((DeviceKernel)device_kernel), + device_kernel_as_string((DeviceKernel)desc.kernel_index), errStr ? [errStr UTF8String] : "nil"); - computePipelineState = [mtlDevice + computePipelineState = [device->mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor options:MTLPipelineOptionNone reflection:nullptr @@ -512,23 +192,32 @@ void MetalKernelPipeline::compile() if (computePipelineState == nil) { NSString *errStr = [error localizedDescription]; - error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n", - device_kernel_as_string((DeviceKernel)device_kernel)); - error_str += (errStr ? [errStr UTF8String] : "nil"); - metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n", - kernel_type_as_string(pso_type), - device_kernel, - device_kernel_as_string((DeviceKernel)device_kernel), + device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n", + device_kernel_as_string((DeviceKernel)desc.kernel_index)) + + (errStr ? [errStr UTF8String] : "nil")); + metal_printf("%2d | %-55s | %7.2fs | FAILED!\n", + desc.kernel_index, + device_kernel_as_string((DeviceKernel)desc.kernel_index), duration); return; } - int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, - computePipelineState.threadExecutionWidth); + pso[desc.pso_index].pipeline = computePipelineState; + num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup, + computePipelineState.threadExecutionWidth); num_threads_per_block = std::max(num_threads_per_block, (int)computePipelineState.threadExecutionWidth); - this->pipeline = computePipelineState; - this->num_threads_per_block = num_threads_per_block; + + if (!use_binary_archive) { + metal_printf("%2d | %-55s | %7.2fs\n", + desc.kernel_index, + device_kernel_as_string((DeviceKernel)desc.kernel_index), + duration); + + if (desc.pso_index == PSO_GENERIC) { + this->load_duration = duration; + } + } if (@available(macOS 11.0, *)) { if (creating_new_archive || recreate_archive) { @@ -539,90 +228,304 @@ void MetalKernelPipeline::compile() } } } - }; - /* Block on load to ensure we continue with a valid kernel function */ - if (creating_new_archive) { - starttime = time_dt(); - NSError *error; - if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor - error:&error]) { - NSString *errStr = [error localizedDescription]; - metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil"); - } - } - id<MTLComputePipelineState> pipeline = [mtlDevice - newComputePipelineStateWithDescriptor:computePipelineStateDescriptor - options:pipelineOptions - reflection:nullptr - error:&error]; - completionHandler(pipeline, nullptr, error); - - this->loaded = true; - [computePipelineStateDescriptor release]; - computePipelineStateDescriptor = nil; - - if (use_metalrt && linked_functions) { - for (int table = 0; table < METALRT_TABLE_NUM; table++) { - if (@available(macOS 11.0, *)) { - MTLIntersectionFunctionTableDescriptor *ift_desc = - [[MTLIntersectionFunctionTableDescriptor alloc] init]; - ift_desc.functionCount = table_functions[table].count; - intersection_func_table[table] = [this->pipeline - newIntersectionFunctionTableWithDescriptor:ift_desc]; - - /* Finally write the function handles into this pipeline's table */ - for (int i = 0; i < 2; i++) { - id<MTLFunctionHandle> handle = [pipeline - functionHandleWithFunction:table_functions[table][i]]; - [intersection_func_table[table] setFunction:handle atIndex:i]; + [computePipelineStateDescriptor release]; + computePipelineStateDescriptor = nil; + + if (device->use_metalrt && desc.linked_functions) { + for (int table = 0; table < METALRT_TABLE_NUM; table++) { + if (@available(macOS 11.0, *)) { + MTLIntersectionFunctionTableDescriptor *ift_desc = + [[MTLIntersectionFunctionTableDescriptor alloc] init]; + ift_desc.functionCount = desc.intersector_functions[table].count; + + pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline + newIntersectionFunctionTableWithDescriptor:ift_desc]; + + /* Finally write the function handles into this pipeline's table */ + for (int i = 0; i < 2; i++) { + id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline + functionHandleWithFunction:desc.intersector_functions[table][i]]; + [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i]; + } } } } - } - double duration = time_dt() - starttime; + mark_loaded(desc.pso_index); + }; - if (!use_binary_archive) { - metal_printf("%16s | %2d | %-55s | %7.2fs\n", - kernel_type_as_string(pso_type), - int(device_kernel), - device_kernel_as_string(device_kernel), - duration); + if (desc.pso_index == PSO_SPECIALISED) { + /* Asynchronous load */ + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ + NSError *error; + id<MTLComputePipelineState> pipeline = [device->mtlDevice + newComputePipelineStateWithDescriptor:computePipelineStateDescriptor + options:pipelineOptions + reflection:nullptr + error:&error]; + completionHandler(pipeline, nullptr, error); + }); } else { - metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n", - kernel_type_as_string(pso_type), - device_kernel, - device_kernel_as_string((DeviceKernel)device_kernel), - duration, - creating_new_archive ? " new" : "load", - metalbin_name.c_str()); + /* Block on load to ensure we continue with a valid kernel function */ + id<MTLComputePipelineState> pipeline = [device->mtlDevice + newComputePipelineStateWithDescriptor:computePipelineStateDescriptor + options:pipelineOptions + reflection:nullptr + error:&error]; + completionHandler(pipeline, nullptr, error); } + + return true; } -bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized) +const MetalKernelPipeline &MetalDeviceKernel::get_pso() const { - auto shader_cache = get_shader_cache(device->mtlDevice); - for (int i = 0; i < DEVICE_KERNEL_NUM; i++) { - shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized); + if (pso[PSO_SPECIALISED].loaded) { + return pso[PSO_SPECIALISED]; } - if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) { - shader_cache->wait_for_all(); + assert(pso[PSO_GENERIC].loaded); + return pso[PSO_GENERIC]; +} + +bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type) +{ + bool any_error = false; + + MD5Hash md5; + + /* Build the function constant table */ + MTLFunctionConstantValues *constant_values = nullptr; + if (kernel_type == PSO_SPECIALISED) { + constant_values = [MTLFunctionConstantValues new]; + +# define KERNEL_FILM(_type, name) \ + [constant_values setConstantValue:&data.film.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_film_##name]; \ + md5.append((uint8_t *)&data.film.name, sizeof(data.film.name)); + +# define KERNEL_BACKGROUND(_type, name) \ + [constant_values setConstantValue:&data.background.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_background_##name]; \ + md5.append((uint8_t *)&data.background.name, sizeof(data.background.name)); + +# define KERNEL_INTEGRATOR(_type, name) \ + [constant_values setConstantValue:&data.integrator.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_integrator_##name]; \ + md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name)); + +# define KERNEL_BVH(_type, name) \ + [constant_values setConstantValue:&data.bvh.name \ + type:get_MTLDataType_##_type() \ + atIndex:KernelData_bvh_##name]; \ + md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name)); + + /* METAL_WIP: populate constant_values based on KernelData */ + assert(0); + /* + const KernelData &data = device->launch_params.data; + # include "kernel/types/background.h" + # include "kernel/types/bvh.h" + # include "kernel/types/film.h" + # include "kernel/types/integrator.h" + */ } - return true; + + if (device->use_metalrt) { + if (@available(macOS 11.0, *)) { + /* create the id<MTLFunction> for each intersection function */ + const char *function_names[] = { + "__anyhit__cycles_metalrt_visibility_test_tri", + "__anyhit__cycles_metalrt_visibility_test_box", + "__anyhit__cycles_metalrt_shadow_all_hit_tri", + "__anyhit__cycles_metalrt_shadow_all_hit_box", + "__anyhit__cycles_metalrt_local_hit_tri", + "__anyhit__cycles_metalrt_local_hit_box", + "__intersection__curve_ribbon", + "__intersection__curve_ribbon_shadow", + "__intersection__curve_all", + "__intersection__curve_all_shadow", + "__intersection__point", + "__intersection__point_shadow", + }; + assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM); + + MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor]; + if (kernel_type == PSO_SPECIALISED) { + desc.constantValues = constant_values; + } + for (int i = 0; i < METALRT_FUNC_NUM; i++) { + const char *function_name = function_names[i]; + desc.name = [@(function_name) copy]; + + NSError *error = NULL; + rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type] + newFunctionWithDescriptor:desc + error:&error]; + + if (rt_intersection_funcs[kernel_type][i] == nil) { + NSString *err = [error localizedDescription]; + string errors = [err UTF8String]; + + device->set_error(string_printf( + "Error getting intersection function \"%s\": %s", function_name, errors.c_str())); + any_error = true; + break; + } + + rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy]; + } + } + } + md5.append(device->source_used_for_compile[kernel_type]); + + string hash = md5.get_hex(); + if (loaded_md5[kernel_type] == hash) { + return true; + } + + if (!any_error) { + NSArray *table_functions[METALRT_TABLE_NUM] = {nil}; + NSArray *function_list = nil; + + if (device->use_metalrt) { + id<MTLFunction> curve_intersect_default = nil; + id<MTLFunction> curve_intersect_shadow = nil; + id<MTLFunction> point_intersect_default = nil; + id<MTLFunction> point_intersect_shadow = nil; + if (device->kernel_features & KERNEL_FEATURE_HAIR) { + /* Add curve intersection programs. */ + if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) { + /* Slower programs for thick hair since that also slows down ribbons. + * Ideally this should not be needed. */ + curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL]; + curve_intersect_shadow = + rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW]; + } + else { + curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON]; + curve_intersect_shadow = + rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW]; + } + } + if (device->kernel_features & KERNEL_FEATURE_POINTCLOUD) { + point_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT]; + point_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT_SHADOW]; + } + table_functions[METALRT_TABLE_DEFAULT] = [NSArray + arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI], + curve_intersect_default ? + curve_intersect_default : + rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX], + point_intersect_default ? + point_intersect_default : + rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX], + nil]; + table_functions[METALRT_TABLE_SHADOW] = [NSArray + arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI], + curve_intersect_shadow ? + curve_intersect_shadow : + rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX], + point_intersect_shadow ? + point_intersect_shadow : + rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX], + nil]; + table_functions[METALRT_TABLE_LOCAL] = [NSArray + arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI], + rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX], + rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX], + nil]; + + NSMutableSet *unique_functions = [NSMutableSet + setWithArray:table_functions[METALRT_TABLE_DEFAULT]]; + [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]]; + [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]]; + + function_list = [[NSArray arrayWithArray:[unique_functions allObjects]] + sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) { + return [f1.label compare:f2.label]; + }]; + + unique_functions = nil; + } + + metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n", + kernel_type_as_string(kernel_type)); + + tbb::task_arena local_arena(max_mtlcompiler_threads); + local_arena.execute([&]() { + parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) { + /* skip megakernel */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + return; + } + + /* Only specialize kernels where it can make an impact. */ + if (kernel_type == PSO_SPECIALISED) { + if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + return; + } + } + + MetalDeviceKernel &kernel = kernels_[i]; + + const std::string function_name = std::string("cycles_metal_") + + device_kernel_as_string((DeviceKernel)i); + int threads_per_threadgroup = device->max_threads_per_threadgroup; + if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) { + /* Always use 512 for the sorting kernels */ + threads_per_threadgroup = 512; + } + + NSArray *kernel_function_list = nil; + + if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || + i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW || + i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE || + i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK || + i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) { + kernel_function_list = function_list; + } + + MetalKernelLoadDesc desc; + desc.pso_index = kernel_type; + desc.kernel_index = i; + desc.linked_functions = kernel_function_list; + desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT]; + desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW]; + desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL]; + desc.constant_values = constant_values; + desc.threads_per_threadgroup = threads_per_threadgroup; + desc.function_name = function_name.c_str(); + + bool success = kernel.load(device, desc, md5); + + any_error |= !success; + }); + }); + } + + bool loaded = !any_error; + if (loaded) { + loaded_md5[kernel_type] = hash; + } + return loaded; } -const MetalKernelPipeline &MetalDeviceKernels::get_best_pipeline(const MetalDevice *device, - DeviceKernel kernel) const +const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const { - return *get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device); + return kernels_[(int)kernel]; } -bool MetalDeviceKernels::available(const MetalDevice *device, DeviceKernel kernel) const +bool MetalDeviceKernels::available(DeviceKernel kernel) const { - return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device) != nullptr; + return kernels_[(int)kernel].get_pso().function != nil; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm index 2079aa65499..1686ab95ffa 100644 --- a/intern/cycles/device/metal/queue.mm +++ b/intern/cycles/device/metal/queue.mm @@ -108,6 +108,9 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size " << work_size; + const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel); + const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso(); + id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel); /* Determine size requirement for argument buffer. */ @@ -209,8 +212,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, } bytes_written = globals_offsets + sizeof(KernelParamsMetal); - const MetalKernelPipeline &metal_kernel_pso = metal_device->get_best_pipeline(kernel); - /* Encode ancillaries */ [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets]; [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d @@ -283,7 +284,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel, [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline]; /* Compute kernel launch parameters. */ - const int num_threads_per_block = metal_kernel_pso.num_threads_per_block; + const int num_threads_per_block = metal_kernel.get_num_threads_per_block(); int shared_mem_bytes = 0; @@ -546,8 +547,6 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial]; - [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))]; - /* declare usage of MTLBuffers etc */ prepare_resources(kernel); } diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 0ed52074a90..4e309f16c08 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -29,26 +29,10 @@ using namespace metal::raytracing; /* Qualifiers */ -#if defined(__KERNEL_METAL_APPLE__) - -/* Inline everything for Apple GPUs. - * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface - * at the cost of longer compile times (~4.5 minutes on M1 Max). */ - -# define ccl_device __attribute__((always_inline)) -# define ccl_device_inline __attribute__((always_inline)) -# define ccl_device_forceinline __attribute__((always_inline)) -# define ccl_device_noinline __attribute__((always_inline)) - -#else - -# define ccl_device -# define ccl_device_inline ccl_device -# define ccl_device_forceinline ccl_device -# define ccl_device_noinline ccl_device __attribute__((noinline)) - -#endif - +#define ccl_device +#define ccl_device_inline ccl_device +#define ccl_device_forceinline ccl_device +#define ccl_device_noinline ccl_device __attribute__((noinline)) #define ccl_device_noinline_cpu ccl_device #define ccl_device_inline_method ccl_device #define ccl_global device |