Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorBrecht Van Lommel <brecht@blender.org>2022-04-28 01:46:14 +0300
committerBrecht Van Lommel <brecht@blender.org>2022-04-28 01:46:43 +0300
commit52a5f68562680c0ccd6d4e525098bb5e2af7d0bd (patch)
treee370e4a7fca575aad8ff8e9f1b975b4fd98a9c04 /intern
parent3558f565f1e8a8e5dc49067cc0500cbf993af69e (diff)
Revert "Cycles: Enable inlining on Apple Silicon for 1.1x speedup"
This reverts commit b82de02e7ce857e20b842a074c0068b146a9fd79. It is causing crashes in various regression tests. Ref D14763
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/device/metal/device_impl.h9
-rw-r--r--intern/cycles/device/metal/device_impl.mm101
-rw-r--r--intern/cycles/device/metal/kernel.h103
-rw-r--r--intern/cycles/device/metal/kernel.mm849
-rw-r--r--intern/cycles/device/metal/queue.mm9
-rw-r--r--intern/cycles/kernel/device/metal/compat.h24
6 files changed, 539 insertions, 556 deletions
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index d7311ee985f..27c58ce6d2f 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -28,8 +28,7 @@ class MetalDevice : public Device {
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
nil; /* encoder used for fetching device pointers from MTLBuffers */
- string source[PSO_NUM];
- string source_md5[PSO_NUM];
+ string source_used_for_compile[PSO_NUM];
KernelParamsMetal launch_params = {0};
@@ -111,12 +110,6 @@ class MetalDevice : public Device {
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
- id<MTLLibrary> compile(string const &source);
-
- const MetalKernelPipeline &get_best_pipeline(DeviceKernel kernel) const;
-
- bool kernel_available(DeviceKernel kernel) const;
-
/* ------------------------------------------------------------------ */
/* low-level memory management */
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 7d1212cb37c..c01f51fb506 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -275,44 +275,96 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
- source[PSO_GENERIC] = get_source(kernel_features);
- mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
-
- MD5Hash md5;
- md5.append(source[PSO_GENERIC]);
- source_md5[PSO_GENERIC] = md5.get_hex();
-
- metal_printf("Front-end compilation finished (generic)\n");
-
- bool result = kernels.load(this, false);
-
- reserve_local_memory(kernel_features);
+ NSError *error = NULL;
- return result;
-}
+ for (int i = 0; i < PSO_NUM; i++) {
+ if (mtlLibrary[i]) {
+ [mtlLibrary[i] release];
+ mtlLibrary[i] = nil;
+ }
+ }
-id<MTLLibrary> MetalDevice::compile(string const &source)
-{
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.fastMathEnabled = YES;
if (@available(macOS 12.0, *)) {
options.languageVersion = MTLLanguageVersion2_4;
}
+ else {
+ return false;
+ }
- NSError *error = NULL;
- id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
+ string metalsrc;
+
+ /* local helper: dump source to disk and return filepath */
+ auto dump_source = [&](int kernel_type) -> string {
+ string &source = source_used_for_compile[kernel_type];
+ string metalsrc = path_cache_get(path_join("kernels",
+ string_printf("%s.%s.metal",
+ kernel_type_as_string(kernel_type),
+ util_md5_string(source).c_str())));
+ path_write_text(metalsrc, source);
+ return metalsrc;
+ };
+
+ /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
+ * then compile it into a MTLLibrary */
+ auto fetch_and_compile_source = [&](int kernel_type) {
+ /* Record the source used to compile this library, for hash building later. */
+ string &source = source_used_for_compile[kernel_type];
+
+ switch (kernel_type) {
+ case PSO_GENERIC: {
+ source = get_source(kernel_features);
+ break;
+ }
+ case PSO_SPECIALISED: {
+ /* PSO_SPECIALISED derives from PSO_GENERIC */
+ string &generic_source = source_used_for_compile[PSO_GENERIC];
+ if (generic_source.empty()) {
+ generic_source = get_source(kernel_features);
+ }
+ source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
+ break;
+ }
+ default:
+ assert(0);
+ }
+
+ /* create MTLLibrary (front-end compilation) */
+ mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
options:options
error:&error];
- if (!mtlLibrary) {
- NSString *err = [error localizedDescription];
- set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
+ bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
+
+ if (!mtlLibrary[kernel_type] || do_source_dump) {
+ string metalsrc = dump_source(kernel_type);
+
+ if (!mtlLibrary[kernel_type]) {
+ NSString *err = [error localizedDescription];
+ set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
+
+ return false;
+ }
+ }
+ return true;
+ };
+
+ fetch_and_compile_source(PSO_GENERIC);
+
+ if (use_function_specialisation) {
+ fetch_and_compile_source(PSO_SPECIALISED);
}
+ metal_printf("Front-end compilation finished\n");
+
+ bool result = kernels.load(this, PSO_GENERIC);
+
[options release];
+ reserve_local_memory(kernel_features);
- return mtlLibrary;
+ return result;
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -619,11 +671,6 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
-const MetalKernelPipeline &MetalDevice::get_best_pipeline(DeviceKernel kernel) const
-{
- return kernels.get_best_pipeline(this, kernel);
-}
-
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
{
if (strcmp(name, "__data") == 0) {
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index 7e398d1cf41..b12491d820d 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -54,41 +54,98 @@ enum {
const char *kernel_type_as_string(int kernel_type);
struct MetalKernelPipeline {
+ void release()
+ {
+ if (pipeline) {
+ [pipeline release];
+ pipeline = nil;
+ if (@available(macOS 11.0, *)) {
+ for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+ if (intersection_func_table[i]) {
+ [intersection_func_table[i] release];
+ intersection_func_table[i] = nil;
+ }
+ }
+ }
+ }
+ if (function) {
+ [function release];
+ function = nil;
+ }
+ if (@available(macOS 11.0, *)) {
+ for (int i = 0; i < METALRT_TABLE_NUM; i++) {
+ if (intersection_func_table[i]) {
+ [intersection_func_table[i] release];
+ }
+ }
+ }
+ }
- void compile();
-
- id<MTLLibrary> mtlLibrary = nil;
- bool scene_specialized;
- string source_md5;
-
- bool use_metalrt;
- bool metalrt_hair;
- bool metalrt_hair_thick;
- bool metalrt_pointcloud;
-
- int threads_per_threadgroup;
-
- DeviceKernel device_kernel;
bool loaded = false;
- id<MTLDevice> mtlDevice = nil;
id<MTLFunction> function = nil;
id<MTLComputePipelineState> pipeline = nil;
- int num_threads_per_block = 0;
-
- string error_str;
API_AVAILABLE(macos(11.0))
id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
- id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
+};
+
+struct MetalKernelLoadDesc {
+ int pso_index = 0;
+ const char *function_name = nullptr;
+ int kernel_index = 0;
+ int threads_per_threadgroup = 0;
+ MTLFunctionConstantValues *constant_values = nullptr;
+ NSArray *linked_functions = nullptr;
+
+ struct IntersectorFunctions {
+ NSArray *defaults;
+ NSArray *shadow;
+ NSArray *local;
+ NSArray *operator[](int index) const
+ {
+ if (index == METALRT_TABLE_DEFAULT)
+ return defaults;
+ if (index == METALRT_TABLE_SHADOW)
+ return shadow;
+ return local;
+ }
+ } intersector_functions = {nullptr};
+};
+
+/* Metal kernel and associate occupancy information. */
+class MetalDeviceKernel {
+ public:
+ ~MetalDeviceKernel();
+
+ bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
+
+ void mark_loaded(int pso_index)
+ {
+ pso[pso_index].loaded = true;
+ }
+
+ int get_num_threads_per_block() const
+ {
+ return num_threads_per_block;
+ }
+ const MetalKernelPipeline &get_pso() const;
+
+ double load_duration = 0.0;
+
+ private:
+ MetalKernelPipeline pso[PSO_NUM];
+
+ int num_threads_per_block = 0;
};
/* Cache of Metal kernels for each DeviceKernel. */
class MetalDeviceKernels {
public:
- bool load(MetalDevice *device, bool scene_specialized);
- bool available(const MetalDevice *device, DeviceKernel kernel) const;
- const MetalKernelPipeline &get_best_pipeline(const MetalDevice *device,
- DeviceKernel kernel) const;
+ bool load(MetalDevice *device, int kernel_type);
+ bool available(DeviceKernel kernel) const;
+ const MetalDeviceKernel &get(DeviceKernel kernel) const;
+
+ MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 44a5e23d00f..9555ca03c8e 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -9,7 +9,6 @@
# include "util/path.h"
# include "util/tbb.h"
# include "util/time.h"
-# include "util/unique_ptr.h"
CCL_NAMESPACE_BEGIN
@@ -29,370 +28,82 @@ const char *kernel_type_as_string(int kernel_type)
return "";
}
-bool kernel_has_intersection(DeviceKernel device_kernel)
+MetalDeviceKernel::~MetalDeviceKernel()
{
- return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
- device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
- device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
- device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
- device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
-}
-
-struct ShaderCache {
- ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
- {
- }
- ~ShaderCache();
-
- /* Get the fastest available pipeline for the specified kernel. */
- MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);
-
- /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
- * device. */
- void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
-
- void wait_for_all();
-
- private:
- friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
-
- void compile_thread_func(int thread_index);
-
- using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
-
- struct PipelineRequest {
- MetalKernelPipeline *pipeline = nullptr;
- std::function<void(MetalKernelPipeline *)> completionHandler;
- };
-
- std::mutex cache_mutex;
-
- PipelineCollection pipelines[DEVICE_KERNEL_NUM];
- id<MTLDevice> mtlDevice;
-
- bool running = false;
- std::condition_variable cond_var;
- std::deque<PipelineRequest> request_queue;
- std::vector<std::thread> compile_threads;
- std::atomic_int incomplete_requests = 0;
-};
-
-std::mutex g_shaderCacheMutex;
-std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
-
-ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
-{
- thread_scoped_lock lock(g_shaderCacheMutex);
- auto it = g_shaderCache.find(mtlDevice);
- if (it != g_shaderCache.end()) {
- return it->second.get();
- }
-
- g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
- return g_shaderCache[mtlDevice].get();
-}
-
-ShaderCache::~ShaderCache()
-{
- running = false;
- cond_var.notify_all();
- for (auto &thread : compile_threads) {
- thread.join();
- }
-}
-
-void ShaderCache::wait_for_all()
-{
- while (incomplete_requests > 0) {
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ for (int i = 0; i < PSO_NUM; i++) {
+ pso[i].release();
}
}
-void ShaderCache::compile_thread_func(int thread_index)
+bool MetalDeviceKernel::load(MetalDevice *device,
+ MetalKernelLoadDesc const &desc_in,
+ MD5Hash const &md5)
{
- while (1) {
-
- /* wait for / acquire next request */
- PipelineRequest request;
- {
- thread_scoped_lock lock(cache_mutex);
- cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
- if (!running) {
- break;
- }
-
- if (!request_queue.empty()) {
- request = request_queue.front();
- request_queue.pop_front();
- }
- }
-
- /* service request */
- if (request.pipeline) {
- request.pipeline->compile();
- incomplete_requests--;
- }
- }
-}
-
-void ShaderCache::load_kernel(DeviceKernel device_kernel,
- MetalDevice *device,
- bool scene_specialized)
-{
- {
- /* create compiler threads on first run */
- thread_scoped_lock lock(cache_mutex);
- if (compile_threads.empty()) {
- running = true;
- for (int i = 0; i < max_mtlcompiler_threads; i++) {
- compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
- }
- }
- }
-
- if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ __block MetalKernelLoadDesc const desc(desc_in);
+ if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
/* skip megakernel */
- return;
+ return true;
}
- if (scene_specialized) {
- /* Only specialize kernels where it can make an impact. */
- if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
- device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- return;
- }
- }
-
- {
- /* check whether the kernel has already been requested / cached */
- thread_scoped_lock lock(cache_mutex);
- for (auto &pipeline : pipelines[device_kernel]) {
- if (scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
- /* we already requested a pipeline that is specialised for this kernel data */
- metal_printf("Specialized kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
- }
- else {
- if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
- /* we already requested a generic pipeline for this kernel */
- metal_printf("Generic kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
- }
- }
- }
-
- incomplete_requests++;
-
- PipelineRequest request;
- request.pipeline = new MetalKernelPipeline;
- request.pipeline->scene_specialized = scene_specialized;
- request.pipeline->mtlDevice = mtlDevice;
- request.pipeline->source_md5 =
- device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
- request.pipeline->mtlLibrary =
- device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
- request.pipeline->device_kernel = device_kernel;
- request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
-
- /* metalrt options */
- request.pipeline->use_metalrt = device->use_metalrt;
- request.pipeline->metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
- request.pipeline->metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
- request.pipeline->metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
-
- {
- thread_scoped_lock lock(cache_mutex);
- pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
- request_queue.push_back(request);
+ bool use_binary_archive = true;
+ if (device->device_vendor == METAL_GPU_APPLE) {
+ /* Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render */
+ use_binary_archive = false;
}
- cond_var.notify_one();
-}
-MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
-{
- thread_scoped_lock lock(cache_mutex);
- auto &collection = pipelines[kernel];
- if (collection.empty()) {
- return nullptr;
+ if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+ use_binary_archive = (atoi(str) == 0);
}
- /* metalrt options */
- bool use_metalrt = device->use_metalrt;
- bool metalrt_hair = device->kernel_features & KERNEL_FEATURE_HAIR;
- bool metalrt_hair_thick = device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
- bool metalrt_pointcloud = device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
-
- MetalKernelPipeline *best_pipeline = nullptr;
- for (auto &pipeline : collection) {
- if (!pipeline->loaded) {
- /* still loading - ignore */
- continue;
- }
-
- if (pipeline->use_metalrt != use_metalrt || pipeline->metalrt_hair != metalrt_hair ||
- pipeline->metalrt_hair_thick != metalrt_hair_thick ||
- pipeline->metalrt_pointcloud != metalrt_pointcloud) {
- /* wrong combination of metalrt options */
- continue;
- }
+ id<MTLBinaryArchive> archive = nil;
+ string metalbin_path;
+ if (use_binary_archive) {
+ NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+ string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
+ MD5Hash local_md5(md5);
+ local_md5.append(osVersion);
+ string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() +
+ to_string(desc.pso_index) + ".bin";
+ metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
+ path_create_directories(metalbin_path);
- if (pipeline->scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
- best_pipeline = pipeline.get();
+ if (path_exists(metalbin_path) && use_binary_archive) {
+ if (@available(macOS 11.0, *)) {
+ MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+ archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
+ archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+ [archiveDesc release];
}
}
- else if (!best_pipeline) {
- best_pipeline = pipeline.get();
- }
- }
-
- return best_pipeline;
-}
-
-void MetalKernelPipeline::compile()
-{
- int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
-
- const std::string function_name = std::string("cycles_metal_") +
- device_kernel_as_string(device_kernel);
-
- int threads_per_threadgroup = this->threads_per_threadgroup;
- if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
- device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
- /* Always use 512 for the sorting kernels */
- threads_per_threadgroup = 512;
}
- NSString *entryPoint = [@(function_name.c_str()) copy];
+ NSString *entryPoint = [@(desc.function_name) copy];
NSError *error = NULL;
if (@available(macOS 11.0, *)) {
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
func_desc.name = entryPoint;
- function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
+ if (desc.constant_values) {
+ func_desc.constantValues = desc.constant_values;
+ }
+ pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index]
+ newFunctionWithDescriptor:func_desc
+ error:&error];
}
-
[entryPoint release];
- if (function == nil) {
+ if (pso[desc.pso_index].function == nil) {
NSString *err = [error localizedDescription];
string errors = [err UTF8String];
- metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
- return;
- }
- function.label = [entryPoint copy];
-
- if (use_metalrt) {
- if (@available(macOS 11.0, *)) {
- /* create the id<MTLFunction> for each intersection function */
- const char *function_names[] = {
- "__anyhit__cycles_metalrt_visibility_test_tri",
- "__anyhit__cycles_metalrt_visibility_test_box",
- "__anyhit__cycles_metalrt_shadow_all_hit_tri",
- "__anyhit__cycles_metalrt_shadow_all_hit_box",
- "__anyhit__cycles_metalrt_local_hit_tri",
- "__anyhit__cycles_metalrt_local_hit_box",
- "__intersection__curve_ribbon",
- "__intersection__curve_ribbon_shadow",
- "__intersection__curve_all",
- "__intersection__curve_all_shadow",
- "__intersection__point",
- "__intersection__point_shadow",
- };
- assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
-
- MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
- for (int i = 0; i < METALRT_FUNC_NUM; i++) {
- const char *function_name = function_names[i];
- desc.name = [@(function_name) copy];
-
- NSError *error = NULL;
- rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc error:&error];
-
- if (rt_intersection_function[i] == nil) {
- NSString *err = [error localizedDescription];
- string errors = [err UTF8String];
-
- error_str = string_printf(
- "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
- break;
- }
-
- rt_intersection_function[i].label = [@(function_name) copy];
- }
- }
+ device->set_error(
+ string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str()));
+ return false;
}
- NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
- NSArray *linked_functions = nil;
-
- if (use_metalrt) {
- id<MTLFunction> curve_intersect_default = nil;
- id<MTLFunction> curve_intersect_shadow = nil;
- id<MTLFunction> point_intersect_default = nil;
- id<MTLFunction> point_intersect_shadow = nil;
- if (metalrt_hair) {
- /* Add curve intersection programs. */
- if (metalrt_hair_thick) {
- /* Slower programs for thick hair since that also slows down ribbons.
- * Ideally this should not be needed. */
- curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL];
- curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW];
- }
- else {
- curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON];
- curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW];
- }
- }
- if (metalrt_pointcloud) {
- point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT];
- point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW];
- }
- table_functions[METALRT_TABLE_DEFAULT] = [NSArray
- arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI],
- curve_intersect_default ?
- curve_intersect_default :
- rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
- point_intersect_default ?
- point_intersect_default :
- rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
- nil];
- table_functions[METALRT_TABLE_SHADOW] = [NSArray
- arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI],
- curve_intersect_shadow ?
- curve_intersect_shadow :
- rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
- point_intersect_shadow ?
- point_intersect_shadow :
- rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
- nil];
- table_functions[METALRT_TABLE_LOCAL] = [NSArray
- arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI],
- rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
- rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
- nil];
-
- NSMutableSet *unique_functions = [NSMutableSet
- setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
- [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
- [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
-
- if (kernel_has_intersection(device_kernel)) {
- linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
- sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
- return [f1.label compare:f2.label];
- }];
- }
- unique_functions = nil;
- }
+ pso[desc.pso_index].function.label = [@(desc.function_name) copy];
- MTLComputePipelineDescriptor *computePipelineStateDescriptor =
+ __block MTLComputePipelineDescriptor *computePipelineStateDescriptor =
[[MTLComputePipelineDescriptor alloc] init];
computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
@@ -400,86 +111,52 @@ void MetalKernelPipeline::compile()
computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
if (@available(macos 10.14, *)) {
- computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
+ computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup;
}
computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
- computePipelineStateDescriptor.computeFunction = function;
-
+ computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function;
if (@available(macOS 11.0, *)) {
/* Attach the additional functions to an MTLLinkedFunctions object */
- if (linked_functions) {
+ if (desc.linked_functions) {
computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
- computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
+ computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions;
}
+
computePipelineStateDescriptor.maxCallStackDepth = 1;
- if (use_metalrt) {
- computePipelineStateDescriptor.maxCallStackDepth = 8;
- }
}
+ /* Create a new Compute pipeline state object */
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
- bool use_binary_archive = true;
- if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
- use_binary_archive = (atoi(str) == 0);
- }
-
- id<MTLBinaryArchive> archive = nil;
- string metalbin_path;
- string metalbin_name;
- if (use_binary_archive) {
- NSProcessInfo *processInfo = [NSProcessInfo processInfo];
- string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
- MD5Hash local_md5;
- local_md5.append(source_md5);
- local_md5.append(osVersion);
- local_md5.append((uint8_t *)&this->threads_per_threadgroup,
- sizeof(this->threads_per_threadgroup));
-
- string options;
- if (use_metalrt && kernel_has_intersection(device_kernel)) {
- /* incorporate any MetalRT specialisations into the archive name */
- options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d",
- metalrt_hair ? 1 : 0,
- metalrt_hair_thick ? 1 : 0,
- metalrt_pointcloud ? 1 : 0);
- }
-
- /* Replace non-alphanumerical characters with underscores. */
- string device_name = [mtlDevice.name UTF8String];
- for (char &c : device_name) {
- if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
- c = '_';
- }
- }
-
- metalbin_name = device_name;
- metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
- metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
- metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin");
-
- metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
- path_create_directories(metalbin_path);
-
- if (path_exists(metalbin_path) && use_binary_archive) {
- if (@available(macOS 11.0, *)) {
- MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
- archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
- archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
- [archiveDesc release];
- }
- }
- }
-
- __block bool creating_new_archive = false;
+ bool creating_new_archive = false;
if (@available(macOS 11.0, *)) {
if (use_binary_archive) {
if (!archive) {
MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
archiveDesc.url = nil;
- archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+ archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
creating_new_archive = true;
+
+ double starttime = time_dt();
+
+ if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
+ error:&error]) {
+ NSString *errStr = [error localizedDescription];
+ metal_printf("Failed to add PSO to archive:\n%s\n",
+ errStr ? [errStr UTF8String] : "nil");
+ }
+ else {
+ double duration = time_dt() - starttime;
+ metal_printf("%2d | %-55s | %7.2fs\n",
+ desc.kernel_index,
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
+ duration);
+
+ if (desc.pso_index == PSO_GENERIC) {
+ this->load_duration = duration;
+ }
+ }
}
computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
@@ -493,14 +170,17 @@ void MetalKernelPipeline::compile()
MTLComputePipelineReflection *reflection,
NSError *error) {
bool recreate_archive = false;
- if (computePipelineState == nil && archive) {
+ if (computePipelineState == nil && archive && !creating_new_archive) {
+
+ assert(0);
+
NSString *errStr = [error localizedDescription];
metal_printf(
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
"(error: %s)\n",
- device_kernel_as_string((DeviceKernel)device_kernel),
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
errStr ? [errStr UTF8String] : "nil");
- computePipelineState = [mtlDevice
+ computePipelineState = [device->mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:MTLPipelineOptionNone
reflection:nullptr
@@ -512,23 +192,32 @@ void MetalKernelPipeline::compile()
if (computePipelineState == nil) {
NSString *errStr = [error localizedDescription];
- error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
- device_kernel_as_string((DeviceKernel)device_kernel));
- error_str += (errStr ? [errStr UTF8String] : "nil");
- metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
- kernel_type_as_string(pso_type),
- device_kernel,
- device_kernel_as_string((DeviceKernel)device_kernel),
+ device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n",
+ device_kernel_as_string((DeviceKernel)desc.kernel_index)) +
+ (errStr ? [errStr UTF8String] : "nil"));
+ metal_printf("%2d | %-55s | %7.2fs | FAILED!\n",
+ desc.kernel_index,
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
duration);
return;
}
- int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
- computePipelineState.threadExecutionWidth);
+ pso[desc.pso_index].pipeline = computePipelineState;
+ num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+ computePipelineState.threadExecutionWidth);
num_threads_per_block = std::max(num_threads_per_block,
(int)computePipelineState.threadExecutionWidth);
- this->pipeline = computePipelineState;
- this->num_threads_per_block = num_threads_per_block;
+
+ if (!use_binary_archive) {
+ metal_printf("%2d | %-55s | %7.2fs\n",
+ desc.kernel_index,
+ device_kernel_as_string((DeviceKernel)desc.kernel_index),
+ duration);
+
+ if (desc.pso_index == PSO_GENERIC) {
+ this->load_duration = duration;
+ }
+ }
if (@available(macOS 11.0, *)) {
if (creating_new_archive || recreate_archive) {
@@ -539,90 +228,304 @@ void MetalKernelPipeline::compile()
}
}
}
- };
- /* Block on load to ensure we continue with a valid kernel function */
- if (creating_new_archive) {
- starttime = time_dt();
- NSError *error;
- if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
- error:&error]) {
- NSString *errStr = [error localizedDescription];
- metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
- }
- }
- id<MTLComputePipelineState> pipeline = [mtlDevice
- newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
- options:pipelineOptions
- reflection:nullptr
- error:&error];
- completionHandler(pipeline, nullptr, error);
-
- this->loaded = true;
- [computePipelineStateDescriptor release];
- computePipelineStateDescriptor = nil;
-
- if (use_metalrt && linked_functions) {
- for (int table = 0; table < METALRT_TABLE_NUM; table++) {
- if (@available(macOS 11.0, *)) {
- MTLIntersectionFunctionTableDescriptor *ift_desc =
- [[MTLIntersectionFunctionTableDescriptor alloc] init];
- ift_desc.functionCount = table_functions[table].count;
- intersection_func_table[table] = [this->pipeline
- newIntersectionFunctionTableWithDescriptor:ift_desc];
-
- /* Finally write the function handles into this pipeline's table */
- for (int i = 0; i < 2; i++) {
- id<MTLFunctionHandle> handle = [pipeline
- functionHandleWithFunction:table_functions[table][i]];
- [intersection_func_table[table] setFunction:handle atIndex:i];
+ [computePipelineStateDescriptor release];
+ computePipelineStateDescriptor = nil;
+
+ if (device->use_metalrt && desc.linked_functions) {
+ for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+ if (@available(macOS 11.0, *)) {
+ MTLIntersectionFunctionTableDescriptor *ift_desc =
+ [[MTLIntersectionFunctionTableDescriptor alloc] init];
+ ift_desc.functionCount = desc.intersector_functions[table].count;
+
+ pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline
+ newIntersectionFunctionTableWithDescriptor:ift_desc];
+
+ /* Finally write the function handles into this pipeline's table */
+ for (int i = 0; i < 2; i++) {
+ id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline
+ functionHandleWithFunction:desc.intersector_functions[table][i]];
+ [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i];
+ }
}
}
}
- }
- double duration = time_dt() - starttime;
+ mark_loaded(desc.pso_index);
+ };
- if (!use_binary_archive) {
- metal_printf("%16s | %2d | %-55s | %7.2fs\n",
- kernel_type_as_string(pso_type),
- int(device_kernel),
- device_kernel_as_string(device_kernel),
- duration);
+ if (desc.pso_index == PSO_SPECIALISED) {
+ /* Asynchronous load */
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+ NSError *error;
+ id<MTLComputePipelineState> pipeline = [device->mtlDevice
+ newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+ options:pipelineOptions
+ reflection:nullptr
+ error:&error];
+ completionHandler(pipeline, nullptr, error);
+ });
}
else {
- metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n",
- kernel_type_as_string(pso_type),
- device_kernel,
- device_kernel_as_string((DeviceKernel)device_kernel),
- duration,
- creating_new_archive ? " new" : "load",
- metalbin_name.c_str());
+ /* Block on load to ensure we continue with a valid kernel function */
+ id<MTLComputePipelineState> pipeline = [device->mtlDevice
+ newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+ options:pipelineOptions
+ reflection:nullptr
+ error:&error];
+ completionHandler(pipeline, nullptr, error);
}
+
+ return true;
}
-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+const MetalKernelPipeline &MetalDeviceKernel::get_pso() const
{
- auto shader_cache = get_shader_cache(device->mtlDevice);
- for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
- shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
+ if (pso[PSO_SPECIALISED].loaded) {
+ return pso[PSO_SPECIALISED];
}
- if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
- shader_cache->wait_for_all();
+ assert(pso[PSO_GENERIC].loaded);
+ return pso[PSO_GENERIC];
+}
+
+bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
+{
+ bool any_error = false;
+
+ MD5Hash md5;
+
+ /* Build the function constant table */
+ MTLFunctionConstantValues *constant_values = nullptr;
+ if (kernel_type == PSO_SPECIALISED) {
+ constant_values = [MTLFunctionConstantValues new];
+
+# define KERNEL_FILM(_type, name) \
+ [constant_values setConstantValue:&data.film.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_film_##name]; \
+ md5.append((uint8_t *)&data.film.name, sizeof(data.film.name));
+
+# define KERNEL_BACKGROUND(_type, name) \
+ [constant_values setConstantValue:&data.background.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_background_##name]; \
+ md5.append((uint8_t *)&data.background.name, sizeof(data.background.name));
+
+# define KERNEL_INTEGRATOR(_type, name) \
+ [constant_values setConstantValue:&data.integrator.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_integrator_##name]; \
+ md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name));
+
+# define KERNEL_BVH(_type, name) \
+ [constant_values setConstantValue:&data.bvh.name \
+ type:get_MTLDataType_##_type() \
+ atIndex:KernelData_bvh_##name]; \
+ md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name));
+
+ /* METAL_WIP: populate constant_values based on KernelData */
+ assert(0);
+ /*
+ const KernelData &data = device->launch_params.data;
+ # include "kernel/types/background.h"
+ # include "kernel/types/bvh.h"
+ # include "kernel/types/film.h"
+ # include "kernel/types/integrator.h"
+ */
}
- return true;
+
+ if (device->use_metalrt) {
+ if (@available(macOS 11.0, *)) {
+ /* create the id<MTLFunction> for each intersection function */
+ const char *function_names[] = {
+ "__anyhit__cycles_metalrt_visibility_test_tri",
+ "__anyhit__cycles_metalrt_visibility_test_box",
+ "__anyhit__cycles_metalrt_shadow_all_hit_tri",
+ "__anyhit__cycles_metalrt_shadow_all_hit_box",
+ "__anyhit__cycles_metalrt_local_hit_tri",
+ "__anyhit__cycles_metalrt_local_hit_box",
+ "__intersection__curve_ribbon",
+ "__intersection__curve_ribbon_shadow",
+ "__intersection__curve_all",
+ "__intersection__curve_all_shadow",
+ "__intersection__point",
+ "__intersection__point_shadow",
+ };
+ assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
+
+ MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
+ if (kernel_type == PSO_SPECIALISED) {
+ desc.constantValues = constant_values;
+ }
+ for (int i = 0; i < METALRT_FUNC_NUM; i++) {
+ const char *function_name = function_names[i];
+ desc.name = [@(function_name) copy];
+
+ NSError *error = NULL;
+ rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type]
+ newFunctionWithDescriptor:desc
+ error:&error];
+
+ if (rt_intersection_funcs[kernel_type][i] == nil) {
+ NSString *err = [error localizedDescription];
+ string errors = [err UTF8String];
+
+ device->set_error(string_printf(
+ "Error getting intersection function \"%s\": %s", function_name, errors.c_str()));
+ any_error = true;
+ break;
+ }
+
+ rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy];
+ }
+ }
+ }
+ md5.append(device->source_used_for_compile[kernel_type]);
+
+ string hash = md5.get_hex();
+ if (loaded_md5[kernel_type] == hash) {
+ return true;
+ }
+
+ if (!any_error) {
+ NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
+ NSArray *function_list = nil;
+
+ if (device->use_metalrt) {
+ id<MTLFunction> curve_intersect_default = nil;
+ id<MTLFunction> curve_intersect_shadow = nil;
+ id<MTLFunction> point_intersect_default = nil;
+ id<MTLFunction> point_intersect_shadow = nil;
+ if (device->kernel_features & KERNEL_FEATURE_HAIR) {
+ /* Add curve intersection programs. */
+ if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Slower programs for thick hair since that also slows down ribbons.
+ * Ideally this should not be needed. */
+ curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
+ curve_intersect_shadow =
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
+ }
+ else {
+ curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
+ curve_intersect_shadow =
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
+ }
+ }
+ if (device->kernel_features & KERNEL_FEATURE_POINTCLOUD) {
+ point_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT];
+ point_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT_SHADOW];
+ }
+ table_functions[METALRT_TABLE_DEFAULT] = [NSArray
+ arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
+ curve_intersect_default ?
+ curve_intersect_default :
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
+ point_intersect_default ?
+ point_intersect_default :
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
+ nil];
+ table_functions[METALRT_TABLE_SHADOW] = [NSArray
+ arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
+ curve_intersect_shadow ?
+ curve_intersect_shadow :
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
+ point_intersect_shadow ?
+ point_intersect_shadow :
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
+ nil];
+ table_functions[METALRT_TABLE_LOCAL] = [NSArray
+ arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
+ rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
+ nil];
+
+ NSMutableSet *unique_functions = [NSMutableSet
+ setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
+ [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
+ [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+
+ function_list = [[NSArray arrayWithArray:[unique_functions allObjects]]
+ sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
+ return [f1.label compare:f2.label];
+ }];
+
+ unique_functions = nil;
+ }
+
+ metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n",
+ kernel_type_as_string(kernel_type));
+
+ tbb::task_arena local_arena(max_mtlcompiler_threads);
+ local_arena.execute([&]() {
+ parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) {
+ /* skip megakernel */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ return;
+ }
+
+ /* Only specialize kernels where it can make an impact. */
+ if (kernel_type == PSO_SPECIALISED) {
+ if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ return;
+ }
+ }
+
+ MetalDeviceKernel &kernel = kernels_[i];
+
+ const std::string function_name = std::string("cycles_metal_") +
+ device_kernel_as_string((DeviceKernel)i);
+ int threads_per_threadgroup = device->max_threads_per_threadgroup;
+ if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) {
+ /* Always use 512 for the sorting kernels */
+ threads_per_threadgroup = 512;
+ }
+
+ NSArray *kernel_function_list = nil;
+
+ if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+ i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ kernel_function_list = function_list;
+ }
+
+ MetalKernelLoadDesc desc;
+ desc.pso_index = kernel_type;
+ desc.kernel_index = i;
+ desc.linked_functions = kernel_function_list;
+ desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT];
+ desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW];
+ desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL];
+ desc.constant_values = constant_values;
+ desc.threads_per_threadgroup = threads_per_threadgroup;
+ desc.function_name = function_name.c_str();
+
+ bool success = kernel.load(device, desc, md5);
+
+ any_error |= !success;
+ });
+ });
+ }
+
+ bool loaded = !any_error;
+ if (loaded) {
+ loaded_md5[kernel_type] = hash;
+ }
+ return loaded;
}
-const MetalKernelPipeline &MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
- DeviceKernel kernel) const
+const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const
{
- return *get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
+ return kernels_[(int)kernel];
}
-bool MetalDeviceKernels::available(const MetalDevice *device, DeviceKernel kernel) const
+bool MetalDeviceKernels::available(DeviceKernel kernel) const
{
- return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device) != nullptr;
+ return kernels_[(int)kernel].get_pso().function != nil;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 2079aa65499..1686ab95ffa 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -108,6 +108,9 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
<< work_size;
+ const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
+ const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
+
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
/* Determine size requirement for argument buffer. */
@@ -209,8 +212,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
- const MetalKernelPipeline &metal_kernel_pso = metal_device->get_best_pipeline(kernel);
-
/* Encode ancillaries */
[metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
@@ -283,7 +284,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
/* Compute kernel launch parameters. */
- const int num_threads_per_block = metal_kernel_pso.num_threads_per_block;
+ const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
int shared_mem_bytes = 0;
@@ -546,8 +547,6 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
MTLDispatchTypeSerial];
- [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
-
/* declare usage of MTLBuffers etc */
prepare_resources(kernel);
}
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 0ed52074a90..4e309f16c08 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -29,26 +29,10 @@ using namespace metal::raytracing;
/* Qualifiers */
-#if defined(__KERNEL_METAL_APPLE__)
-
-/* Inline everything for Apple GPUs.
- * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
- * at the cost of longer compile times (~4.5 minutes on M1 Max). */
-
-# define ccl_device __attribute__((always_inline))
-# define ccl_device_inline __attribute__((always_inline))
-# define ccl_device_forceinline __attribute__((always_inline))
-# define ccl_device_noinline __attribute__((always_inline))
-
-#else
-
-# define ccl_device
-# define ccl_device_inline ccl_device
-# define ccl_device_forceinline ccl_device
-# define ccl_device_noinline ccl_device __attribute__((noinline))
-
-#endif
-
+#define ccl_device
+#define ccl_device_inline ccl_device
+#define ccl_device_forceinline ccl_device
+#define ccl_device_noinline ccl_device __attribute__((noinline))
#define ccl_device_noinline_cpu ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_global device