Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/device/metal')
-rw-r--r--intern/cycles/device/metal/device.mm14
-rw-r--r--intern/cycles/device/metal/device_impl.h12
-rw-r--r--intern/cycles/device/metal/device_impl.mm231
-rw-r--r--intern/cycles/device/metal/kernel.h30
-rw-r--r--intern/cycles/device/metal/kernel.mm221
-rw-r--r--intern/cycles/device/metal/queue.h1
-rw-r--r--intern/cycles/device/metal/queue.mm21
-rw-r--r--intern/cycles/device/metal/util.h12
-rw-r--r--intern/cycles/device/metal/util.mm74
9 files changed, 451 insertions, 165 deletions
diff --git a/intern/cycles/device/metal/device.mm b/intern/cycles/device/metal/device.mm
index d7f190fc01e..51e3323370a 100644
--- a/intern/cycles/device/metal/device.mm
+++ b/intern/cycles/device/metal/device.mm
@@ -34,7 +34,8 @@ void device_metal_info(vector<DeviceInfo> &devices)
int device_index = 0;
for (id<MTLDevice> &device : usable_devices) {
/* Compute unique ID for persistent user preferences. */
- string device_name = [device.name UTF8String];
+ string device_name = MetalInfo::get_device_name(device);
+
string id = string("METAL_") + device_name;
/* Hardware ID might not be unique, add device number in that case. */
@@ -48,12 +49,6 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.type = DEVICE_METAL;
info.description = string_remove_trademark(string(device_name));
- /* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and
- * GPU */
- if (info.description == system_cpu_brand_string()) {
- info.description += " (GPU)";
- }
-
info.num = device_index;
/* We don't know if it's used for display, but assume it is. */
info.display_device = true;
@@ -69,14 +64,15 @@ string device_metal_capabilities()
{
string result = "";
auto allDevices = MTLCopyAllDevices();
- uint32_t num_devices = allDevices.count;
+ uint32_t num_devices = (uint32_t)allDevices.count;
if (num_devices == 0) {
return "No Metal devices found\n";
}
result += string_printf("Number of devices: %u\n", num_devices);
for (id<MTLDevice> device in allDevices) {
- result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]);
+ string device_name = MetalInfo::get_device_name(device);
+ result += string_printf("\t\tDevice: %s\n", device_name.c_str());
}
return result;
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 0e6817d94f8..99e60d3a788 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -42,7 +42,6 @@ class MetalDevice : public Device {
nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
/*---------------------------------------------------*/
- string device_name;
MetalGPUVendor device_vendor;
uint kernel_features;
@@ -76,7 +75,8 @@ class MetalDevice : public Device {
std::vector<id<MTLTexture>> texture_slot_map;
bool use_metalrt = false;
- bool use_function_specialisation = false;
+ MetalPipelineType kernel_specialization_level = PSO_GENERIC;
+ std::atomic_bool async_compile_and_load = false;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
@@ -92,9 +92,7 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
- string get_source(const uint kernel_features);
-
- string compile_kernel(const uint kernel_features, const char *name);
+ void make_source(MetalPipelineType pso_type, const uint kernel_features);
virtual bool load_kernels(const uint kernel_features) override;
@@ -112,7 +110,9 @@ class MetalDevice : public Device {
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
- id<MTLLibrary> compile(string const &source);
+ virtual void optimize_for_scene(Scene *scene) override;
+
+ bool compile_and_load(MetalPipelineType pso_type);
/* ------------------------------------------------------------------ */
/* low-level memory management */
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 086bf0af979..d1250b83d22 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -6,9 +6,12 @@
# include "device/metal/device_impl.h"
# include "device/metal/device.h"
+# include "scene/scene.h"
+
# include "util/debug.h"
# include "util/md5.h"
# include "util/path.h"
+# include "util/time.h"
CCL_NAMESPACE_BEGIN
@@ -35,7 +38,7 @@ void MetalDevice::set_error(const string &error)
}
MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
- : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+ : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
mtlDevId = info.num;
@@ -43,10 +46,9 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
auto usable_devices = MetalInfo::get_usable_devices();
assert(mtlDevId < usable_devices.size());
mtlDevice = usable_devices[mtlDevId];
- device_name = [mtlDevice.name UTF8String];
- device_vendor = MetalInfo::get_vendor_from_device_name(device_name);
+ device_vendor = MetalInfo::get_device_vendor(mtlDevice);
assert(device_vendor != METAL_GPU_UNKNOWN);
- metal_printf("Creating new Cycles device for Metal: %s\n", device_name.c_str());
+ metal_printf("Creating new Cycles device for Metal: %s\n", info.description.c_str());
/* determine default storage mode based on whether UMA is supported */
@@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
use_metalrt = info.use_metalrt;
+
+ /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
+ * quickly. */
+ kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
}
}
@@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
capture_enabled = true;
}
+ if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
+ kernel_specialization_level = (MetalPipelineType)atoi(envstr);
+ }
+ metal_printf("kernel_specialization_level = %s\n",
+ kernel_type_as_string(
+ (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
+
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
@@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
-string MetalDevice::get_source(const uint kernel_features)
+void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
- string build_options;
-
+ string global_defines;
if (use_adaptive_compilation()) {
- build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_metalrt) {
- build_options += "-D__METALRT__ ";
+ global_defines += "#define __METALRT__\n";
if (motion_blur) {
- build_options += "-D__METALRT_MOTION__ ";
+ global_defines += "#define __METALRT_MOTION__\n";
}
}
# ifdef WITH_CYCLES_DEBUG
- build_options += "-D__KERNEL_DEBUG__ ";
+ global_defines += "#define __KERNEL_DEBUG__\n";
# endif
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL:
- build_options += "-D__KERNEL_METAL_INTEL__ ";
+ global_defines += "#define __KERNEL_METAL_INTEL__\n";
break;
case METAL_GPU_AMD:
- build_options += "-D__KERNEL_METAL_AMD__ ";
+ global_defines += "#define __KERNEL_METAL_AMD__\n";
break;
case METAL_GPU_APPLE:
- build_options += "-D__KERNEL_METAL_APPLE__ ";
+ global_defines += "#define __KERNEL_METAL_APPLE__\n";
break;
}
- /* reformat -D defines list into compilable form */
- vector<string> components;
- string_replace(build_options, "-D", "");
- string_split(components, build_options, " ");
+ string &source = this->source[pso_type];
+ source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
+ source = path_source_replace_includes(source, path_get("source"));
- string globalDefines;
- for (const string &component : components) {
- vector<string> assignments;
- string_split(assignments, component, "=");
- if (assignments.size() == 2)
- globalDefines += string_printf(
- "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str());
- else
- globalDefines += string_printf("#define %s\n", assignments[0].c_str());
+ /* Perform any required specialization on the source.
+ * With Metal function constants we can generate a single variant of the kernel source which can
+ * be repeatedly respecialized.
+ */
+ string baked_constants;
+
+ /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
+ * the same character length. Build a string of all active constant values which is then hashed
+ * in order to identify the PSO.
+ */
+ if (pso_type != PSO_GENERIC) {
+ const double starttime = time_dt();
+
+# define KERNEL_STRUCT_BEGIN(name, parent) \
+ string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
+
+ /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ baked_constants += string(#parent "." #name "=") + \
+ to_string(_type(launch_params.data.parent.name)) + "\n";
+
+# include "kernel/data_template.h"
+
+ /* Opt in to all of available specializations. This can be made more granular for the
+ * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
+ * but the overhead should be negligible as these are very quick to (re)build and aren't
+ * serialized to disk via MTLBinaryArchives.
+ */
+ global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
+
+ metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
}
- string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n";
- source = path_source_replace_includes(source, path_get("source"));
-
- metal_printf("Global defines:\n%s\n", globalDefines.c_str());
+ source = global_defines + source;
+ metal_printf("================\n%s================\n\%s================\n",
+ global_defines.c_str(),
+ baked_constants.c_str());
- return source;
+ /* Generate an MD5 from the source and include any baked constants. This is used when caching
+ * PSOs. */
+ MD5Hash md5;
+ md5.append(baked_constants);
+ md5.append(source);
+ source_md5[pso_type] = md5.get_hex();
}
bool MetalDevice::load_kernels(const uint _kernel_features)
@@ -279,24 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
- source[PSO_GENERIC] = get_source(kernel_features);
- mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
-
- MD5Hash md5;
- md5.append(source[PSO_GENERIC]);
- source_md5[PSO_GENERIC] = md5.get_hex();
-
- metal_printf("Front-end compilation finished (generic)\n");
-
- bool result = MetalDeviceKernels::load(this, false);
+ bool result = compile_and_load(PSO_GENERIC);
reserve_local_memory(kernel_features);
-
return result;
}
-id<MTLLibrary> MetalDevice::compile(string const &source)
+bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
{
+ make_source(pso_type, kernel_features);
+
+ if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
+ /* We already have a full set of matching pipelines which are cached or queued. */
+ metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
+ return true;
+ }
+
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.fastMathEnabled = YES;
@@ -304,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source)
options.languageVersion = MTLLanguageVersion2_4;
}
+ if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
+ path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
+ source[pso_type]);
+ }
+
+ const double starttime = time_dt();
+
NSError *error = NULL;
- id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
- options:options
- error:&error];
+ mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
+ options:options
+ error:&error];
- if (!mtlLibrary) {
+ if (!mtlLibrary[pso_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
+ metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
+
[options release];
- return mtlLibrary;
+ return MetalDeviceKernels::load(this, pso_type);
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -411,9 +458,9 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
}
if (mem.name) {
- VLOG(2) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
}
mem.device_size = metal_buffer.allocatedSize;
@@ -623,11 +670,63 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
+void MetalDevice::optimize_for_scene(Scene *scene)
+{
+ MetalPipelineType specialization_level = kernel_specialization_level;
+
+ if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
+ return;
+ }
+
+ /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
+ * synchronously. */
+ compile_and_load(PSO_SPECIALIZED_INTERSECT);
+
+ if (specialization_level < PSO_SPECIALIZED_SHADE) {
+ return;
+ }
+ if (!scene->params.background) {
+ /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
+ * build. */
+ return;
+ }
+
+ /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
+ * only if there isn't an existing load in flight.
+ */
+ auto specialize_shade_fn = ^() {
+ compile_and_load(PSO_SPECIALIZED_SHADE);
+ async_compile_and_load = false;
+ };
+
+ bool async_specialize_shade = true;
+
+ /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). */
+ if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
+ async_specialize_shade = false;
+ }
+
+ if (async_specialize_shade) {
+ if (!async_compile_and_load) {
+ async_compile_and_load = true;
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
+ specialize_shade_fn);
+ }
+ else {
+ metal_printf(
+ "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
+ }
+ }
+ else {
+ specialize_shade_fn();
+ }
+}
+
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
{
- if (strcmp(name, "__data") == 0) {
+ if (strcmp(name, "data") == 0) {
assert(size == sizeof(KernelData));
- memcpy((uint8_t *)&launch_params + offsetof(KernelParamsMetal, data), host, size);
+ memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
return;
}
@@ -646,19 +745,19 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
};
/* Update data storage pointers in launch parameters. */
- if (strcmp(name, "__integrator_state") == 0) {
+ if (strcmp(name, "integrator_state") == 0) {
/* IntegratorStateGPU is contiguous pointers */
- const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+ const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
update_launch_pointers(
- offsetof(KernelParamsMetal, __integrator_state), host, size, pointer_block_size);
+ offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
}
-# define KERNEL_TEX(data_type, tex_name) \
+# define KERNEL_DATA_ARRAY(data_type, tex_name) \
else if (strcmp(name, #tex_name) == 0) \
{ \
update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
}
-# include "kernel/textures.h"
-# undef KERNEL_TEX
+# include "kernel/data_arrays.h"
+# undef KERNEL_DATA_ARRAY
}
void MetalDevice::global_alloc(device_memory &mem)
@@ -800,9 +899,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
desc.textureType = MTLTextureType3D;
desc.depth = mem.data_depth;
- VLOG(2) << "Texture 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Texture 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
assert(mtlTexture);
@@ -834,9 +933,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
desc.storageMode = storage_mode;
desc.usage = MTLTextureUsageShaderRead;
- VLOG(2) << "Texture 2D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
+ VLOG_WORK << "Texture 2D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
assert(mtlTexture);
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index 69b2a686ecc..11393f8b7e1 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -31,7 +31,7 @@ enum {
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
/* Pipeline State Object types */
-enum {
+enum MetalPipelineType {
/* A kernel that can be used with all scenes, supporting all features.
* It is slow to compile, but only needs to be compiled once and is then
* cached for future render sessions. This allows a render to get underway
@@ -39,28 +39,33 @@ enum {
*/
PSO_GENERIC,
- /* A kernel that is relatively quick to compile, but is specialized for the
- * scene being rendered. It only contains the functionality and even baked in
- * constants for values that means it needs to be recompiled whenever a
- * dependent setting is changed. The render performance of this kernel is
- * significantly faster though, and justifies the extra compile time.
+ /* A intersection kernel that is very quick to specialize and results in faster intersection
+ * kernel performance. It uses Metal function constants to replace several KernelData variables
+ * with fixed constants.
+ */
+ PSO_SPECIALIZED_INTERSECT,
+
+ /* A shading kernel that is slow to specialize, but results in faster shading kernel performance
+ * rendered. It uses Metal function constants to replace several KernelData variables with fixed
+ * constants and short-circuit all unused SVM node case handlers.
*/
- /* METAL_WIP: This isn't used and will require more changes to enable. */
- PSO_SPECIALISED,
+ PSO_SPECIALIZED_SHADE,
PSO_NUM
};
-const char *kernel_type_as_string(int kernel_type);
+const char *kernel_type_as_string(MetalPipelineType pso_type);
struct MetalKernelPipeline {
void compile();
id<MTLLibrary> mtlLibrary = nil;
- bool scene_specialized;
+ MetalPipelineType pso_type;
string source_md5;
+ size_t usage_count = 0;
+ KernelData kernel_data_;
bool use_metalrt;
bool metalrt_hair;
bool metalrt_hair_thick;
@@ -75,6 +80,8 @@ struct MetalKernelPipeline {
id<MTLComputePipelineState> pipeline = nil;
int num_threads_per_block = 0;
+ bool should_use_binary_archive() const;
+
string error_str;
API_AVAILABLE(macos(11.0))
@@ -85,7 +92,8 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
-bool load(MetalDevice *device, bool scene_specialized);
+bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
+bool load(MetalDevice *device, MetalPipelineType pso_type);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
} /* namespace MetalDeviceKernels */
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index fec4cd80466..385cb412b06 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -5,6 +5,7 @@
# include "device/metal/kernel.h"
# include "device/metal/device_impl.h"
+# include "kernel/device/metal/function_constants.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/tbb.h"
@@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN
/* limit to 2 MTLCompiler instances */
int max_mtlcompiler_threads = 2;
-const char *kernel_type_as_string(int kernel_type)
+const char *kernel_type_as_string(MetalPipelineType pso_type)
{
- switch (kernel_type) {
+ switch (pso_type) {
case PSO_GENERIC:
return "PSO_GENERIC";
- case PSO_SPECIALISED:
- return "PSO_SPECIALISED";
+ case PSO_SPECIALIZED_INTERSECT:
+ return "PSO_SPECIALIZED_INTERSECT";
+ case PSO_SPECIALIZED_SHADE:
+ return "PSO_SPECIALIZED_SHADE";
default:
assert(0);
}
@@ -50,7 +53,11 @@ struct ShaderCache {
/* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
* device. */
- void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
+ void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
+
+ bool should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type);
void wait_for_all();
@@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index)
}
}
-void ShaderCache::load_kernel(DeviceKernel device_kernel,
- MetalDevice *device,
- bool scene_specialized)
+bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
{
- {
- /* create compiler threads on first run */
- thread_scoped_lock lock(cache_mutex);
- if (compile_threads.empty()) {
- running = true;
- for (int i = 0; i < max_mtlcompiler_threads; i++) {
- compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
- }
- }
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ /* Skip megakernel. */
+ return false;
}
- if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- /* skip megakernel */
- return;
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
+ /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
+ return false;
+ }
}
- if (scene_specialized) {
+ if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- return;
+ return false;
+ }
+
+ /* Only specialize shading / intersection kernels as requested. */
+ bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
+ if (is_shade_pso != is_shade_kernel) {
+ return false;
}
}
@@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
/* check whether the kernel has already been requested / cached */
thread_scoped_lock lock(cache_mutex);
for (auto &pipeline : pipelines[device_kernel]) {
- if (scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
- /* we already requested a pipeline that is specialized for this kernel data */
- metal_printf("Specialized kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ if (pipeline->source_md5 == device->source_md5[pso_type]) {
+ return false;
}
- else {
- if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
- /* we already requested a generic pipeline for this kernel */
- metal_printf("Generic kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ }
+ }
+
+ return true;
+}
+
+void ShaderCache::load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
+{
+ {
+ /* create compiler threads on first run */
+ thread_scoped_lock lock(cache_mutex);
+ if (compile_threads.empty()) {
+ running = true;
+ for (int i = 0; i < max_mtlcompiler_threads; i++) {
+ compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
}
}
}
+ if (!should_load_kernel(device_kernel, device, pso_type)) {
+ return;
+ }
+
incomplete_requests++;
PipelineRequest request;
request.pipeline = new MetalKernelPipeline;
- request.pipeline->scene_specialized = scene_specialized;
+ memcpy(&request.pipeline->kernel_data_,
+ &device->launch_params.data,
+ sizeof(request.pipeline->kernel_data_));
+ request.pipeline->pso_type = pso_type;
request.pipeline->mtlDevice = mtlDevice;
- request.pipeline->source_md5 =
- device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
- request.pipeline->mtlLibrary =
- device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+ request.pipeline->source_md5 = device->source_md5[pso_type];
+ request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
@@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
{
thread_scoped_lock lock(cache_mutex);
- pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
+ auto &collection = pipelines[device_kernel];
+
+ /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
+ int max_entries_of_same_pso_type = 3;
+ for (int i = (int)collection.size() - 1; i >= 0; i--) {
+ if (collection[i]->pso_type == pso_type) {
+ max_entries_of_same_pso_type -= 1;
+ if (max_entries_of_same_pso_type == 0) {
+ metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
+ kernel_type_as_string(pso_type),
+ device_kernel_as_string(device_kernel));
+ collection.erase(collection.begin() + i);
+ break;
+ }
+ }
+ }
+
+ collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
request_queue.push_back(request);
}
cond_var.notify_one();
@@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
continue;
}
- if (pipeline->scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+ if (pipeline->pso_type != PSO_GENERIC) {
+ if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+ pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
best_pipeline = pipeline.get();
}
}
@@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
}
}
+ if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
+ metal_printf("Swapping in %s version of %s\n",
+ kernel_type_as_string(best_pipeline->pso_type),
+ device_kernel_as_string(kernel));
+ }
+ best_pipeline->usage_count += 1;
+
return best_pipeline;
}
-void MetalKernelPipeline::compile()
+bool MetalKernelPipeline::should_use_binary_archive() const
{
- int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
+ if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+ if (atoi(str) != 0) {
+ /* Don't archive if we have opted out by env var. */
+ return false;
+ }
+ }
+
+ if (pso_type == PSO_GENERIC) {
+ /* Archive the generic kernels. */
+ return true;
+ }
+
+ if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
+ device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Archive all shade kernels - they take a long time to compile. */
+ return true;
+ }
+
+ /* The remaining kernels are all fast to compile. They may get cached by the system shader cache,
+ * but will be quick to regenerate if not. */
+ return false;
+}
+
+static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
+{
+ MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];
+
+ MTLDataType MTLDataType_int = MTLDataTypeInt;
+ MTLDataType MTLDataType_float = MTLDataTypeFloat;
+ MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
+ KernelData zero_data = {0};
+ if (!data) {
+ data = &zero_data;
+ }
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ [constant_values setConstantValue:&data->parent.name \
+ type:MTLDataType_##_type \
+ atIndex:KernelData_##parent##_##name];
+
+# include "kernel/data_template.h"
+
+ return constant_values;
+}
+
+void MetalKernelPipeline::compile()
+{
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
@@ -281,6 +371,17 @@ void MetalKernelPipeline::compile()
if (@available(macOS 11.0, *)) {
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
func_desc.name = entryPoint;
+
+ if (pso_type == PSO_SPECIALIZED_SHADE) {
+ func_desc.constantValues = GetConstantValues(&kernel_data_);
+ }
+ else if (pso_type == PSO_SPECIALIZED_INTERSECT) {
+ func_desc.constantValues = GetConstantValues(&kernel_data_);
+ }
+ else {
+ func_desc.constantValues = GetConstantValues();
+ }
+
function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
}
@@ -427,10 +528,7 @@ void MetalKernelPipeline::compile()
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
- bool use_binary_archive = true;
- if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
- use_binary_archive = (atoi(str) == 0);
- }
+ bool use_binary_archive = should_use_binary_archive();
id<MTLBinaryArchive> archive = nil;
string metalbin_path;
@@ -608,19 +706,32 @@ void MetalKernelPipeline::compile()
}
}
-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
+ const double starttime = time_dt();
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
- shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
+ shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
}
- if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
- shader_cache->wait_for_all();
- }
+ shader_cache->wait_for_all();
+ metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
return true;
}
+bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
+{
+ auto shader_cache = get_shader_cache(device->mtlDevice);
+ for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
+ if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
+ return true;
+ }
+ }
+ return false;
+}
+
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
DeviceKernel kernel)
{
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index b0bd487c86d..fc32740f3e1 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states() const override;
+ virtual int num_sort_partition_elements() const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 0e260886abb..5ac63a16c61 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,6 +293,11 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
return result;
}
+int MetalDeviceQueue::num_sort_partition_elements() const
+{
+ return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
+}
+
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@@ -311,8 +316,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
return false;
}
- VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
- << work_size;
+ VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
@@ -358,8 +363,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
/* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
- size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) +
- sizeof(IntegratorStateGPU);
+ size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
(uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
@@ -415,8 +420,8 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
- const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
- sizeof(IntegratorStateGPU);
+ const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
int pointer_index = int(offset / sizeof(device_ptr));
MetalDevice::MetalMem *mmem = *(
@@ -550,7 +555,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Enhanced command buffer errors are only available in 11.0+ */
if (@available(macos 11.0, *)) {
if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) {
- printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+ metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]);
NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo
valueForKey:MTLCommandBufferEncoderInfoErrorKey];
if (encoderInfos != nil) {
@@ -564,7 +569,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
}
}
else if (command_buffer.error) {
- printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
+ metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]);
}
}
}];
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index f728967835d..a988d01d361 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -25,10 +25,20 @@ enum MetalGPUVendor {
METAL_GPU_INTEL = 3,
};
+enum AppleGPUArchitecture {
+ APPLE_UNKNOWN,
+ APPLE_M1,
+ APPLE_M2,
+};
+
/* Contains static Metal helper functions. */
struct MetalInfo {
static vector<id<MTLDevice>> const &get_usable_devices();
- static MetalGPUVendor get_vendor_from_device_name(string const &device_name);
+ static int get_apple_gpu_core_count(id<MTLDevice> device);
+ static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
+ static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
+ static int optimal_sort_partition_elements(id<MTLDevice> device);
+ static string get_device_name(id<MTLDevice> device);
};
/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index a6bd593bcb6..65c67c400fe 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -10,26 +10,83 @@
# include "util/string.h"
# include "util/time.h"
+# include <IOKit/IOKitLib.h>
# include <pwd.h>
# include <sys/shm.h>
# include <time.h>
CCL_NAMESPACE_BEGIN
-MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name)
+string MetalInfo::get_device_name(id<MTLDevice> device)
{
- if (device_name.find("Intel") != string::npos) {
+ string device_name = [device.name UTF8String];
+ if (get_device_vendor(device) == METAL_GPU_APPLE) {
+ /* Append the GPU core count so we can distinguish between GPU variants in benchmarks. */
+ int gpu_core_count = get_apple_gpu_core_count(device);
+ device_name += string_printf(gpu_core_count ? " (GPU - %d cores)" : " (GPU)", gpu_core_count);
+ }
+ return device_name;
+}
+
+int MetalInfo::get_apple_gpu_core_count(id<MTLDevice> device)
+{
+ int core_count = 0;
+ if (@available(macos 12.0, *)) {
+ io_service_t gpu_service = IOServiceGetMatchingService(
+ kIOMainPortDefault, IORegistryEntryIDMatching(device.registryID));
+ if (CFNumberRef numberRef = (CFNumberRef)IORegistryEntryCreateCFProperty(
+ gpu_service, CFSTR("gpu-core-count"), 0, 0)) {
+ if (CFGetTypeID(numberRef) == CFNumberGetTypeID()) {
+ CFNumberGetValue(numberRef, kCFNumberSInt32Type, &core_count);
+ }
+ CFRelease(numberRef);
+ }
+ }
+ return core_count;
+}
+
+AppleGPUArchitecture MetalInfo::get_apple_gpu_architecture(id<MTLDevice> device)
+{
+ const char *device_name = [device.name UTF8String];
+ if (strstr(device_name, "M1")) {
+ return APPLE_M1;
+ }
+ else if (strstr(device_name, "M2")) {
+ return APPLE_M2;
+ }
+ return APPLE_UNKNOWN;
+}
+
+MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
+{
+ const char *device_name = [device.name UTF8String];
+ if (strstr(device_name, "Intel")) {
return METAL_GPU_INTEL;
}
- else if (device_name.find("AMD") != string::npos) {
+ else if (strstr(device_name, "AMD")) {
return METAL_GPU_AMD;
}
- else if (device_name.find("Apple") != string::npos) {
+ else if (strstr(device_name, "Apple")) {
return METAL_GPU_APPLE;
}
return METAL_GPU_UNKNOWN;
}
+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+ if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+ return atoi(str);
+ }
+
+ /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+ * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+ * overall render time speedup of up to 15%. */
+ if (get_device_vendor(device) == METAL_GPU_APPLE) {
+ return 65536;
+ }
+ return 0;
+}
+
vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
{
static vector<id<MTLDevice>> usable_devices;
@@ -41,9 +98,8 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
- const char *device_name = [device.name UTF8String];
-
- MetalGPUVendor vendor = get_vendor_from_device_name(device_name);
+ string device_name = get_device_name(device);
+ MetalGPUVendor vendor = get_device_vendor(device);
bool usable = false;
if (@available(macos 12.2, *)) {
@@ -55,12 +111,12 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}
if (usable) {
- metal_printf("- %s\n", device_name);
+ metal_printf("- %s\n", device_name.c_str());
[device retain];
usable_devices.push_back(device);
}
else {
- metal_printf(" (skipping \"%s\")\n", device_name);
+ metal_printf(" (skipping \"%s\")\n", device_name.c_str());
}
}
if (usable_devices.empty()) {