git.blender.org/blender.git
Diffstat (limited to 'intern')
-rw-r--r--  intern/cycles/device/device.h                              6
-rw-r--r--  intern/cycles/device/metal/device_impl.h                  11
-rw-r--r--  intern/cycles/device/metal/device_impl.mm                193
-rw-r--r--  intern/cycles/device/metal/kernel.h                       30
-rw-r--r--  intern/cycles/device/metal/kernel.mm                     221
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt                        1
-rw-r--r--  intern/cycles/kernel/device/metal/function_constants.h    14
-rw-r--r--  intern/cycles/kernel/device/metal/kernel.metal             1
-rw-r--r--  intern/cycles/kernel/svm/svm.h                             9
-rw-r--r--  intern/cycles/kernel/types.h                               8
-rw-r--r--  intern/cycles/scene/scene.cpp                              2
-rw-r--r--  intern/cycles/util/string.cpp                             18
-rw-r--r--  intern/cycles/util/string.h                                2
13 files changed, 394 insertions, 122 deletions
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 340be85e853..e7916ec3a52 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -29,6 +29,7 @@ class DeviceQueue;
class Progress;
class CPUKernels;
class CPUKernelThreadGlobals;
+class Scene;
/* Device Types */
@@ -186,6 +187,11 @@ class Device {
return 0;
}
+ /* Called after kernel texture setup, and prior to integrator state setup. */
+ virtual void optimize_for_scene(Scene *scene)
+ {
+ }
+
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
/* Memory is always resident if this is not a multi device, regardless of whether the pointer
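The hook added above is a no-op by default, so existing backends are unaffected; a device opts in by overriding it. A minimal sketch of such an override (the MyDevice class is hypothetical, not part of this patch):

class MyDevice : public Device {
 public:
  /* Runs after kernel textures and the "data" constants have been copied to the device,
   * and before integrator state setup, so scene-dependent kernel variants can be
   * requested here. */
  void optimize_for_scene(Scene *scene) override
  {
    (void)scene; /* Inspect scene state and (re)build specialized kernels. */
  }
};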
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 4aea8d697a5..99e60d3a788 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -75,7 +75,8 @@ class MetalDevice : public Device {
std::vector<id<MTLTexture>> texture_slot_map;
bool use_metalrt = false;
- bool use_function_specialisation = false;
+ MetalPipelineType kernel_specialization_level = PSO_GENERIC;
+ std::atomic_bool async_compile_and_load = false;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
@@ -91,9 +92,7 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
- string get_source(const uint kernel_features);
-
- string compile_kernel(const uint kernel_features, const char *name);
+ void make_source(MetalPipelineType pso_type, const uint kernel_features);
virtual bool load_kernels(const uint kernel_features) override;
@@ -111,7 +110,9 @@ class MetalDevice : public Device {
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
- id<MTLLibrary> compile(string const &source);
+ virtual void optimize_for_scene(Scene *scene) override;
+
+ bool compile_and_load(MetalPipelineType pso_type);
/* ------------------------------------------------------------------ */
/* low-level memory management */
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index ba9317e3204..d8bb3b867cd 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -6,6 +6,8 @@
# include "device/metal/device_impl.h"
# include "device/metal/device.h"
+# include "scene/scene.h"
+
# include "util/debug.h"
# include "util/md5.h"
# include "util/path.h"
@@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
use_metalrt = info.use_metalrt;
+
+ /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
+ * quickly. */
+ kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
}
}
@@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
capture_enabled = true;
}
+ if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
+ kernel_specialization_level = (MetalPipelineType)atoi(envstr);
+ }
+ metal_printf("kernel_specialization_level = %s\n",
+ kernel_type_as_string(
+ (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
+
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
@@ -209,12 +222,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
-string MetalDevice::get_source(const uint kernel_features)
+void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
- string build_options;
-
+ string global_defines;
if (use_adaptive_compilation()) {
- build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (MetalInfo::optimal_sort_partition_elements(mtlDevice) > 0) {
@@ -222,52 +234,78 @@ string MetalDevice::get_source(const uint kernel_features)
}
if (use_metalrt) {
- build_options += "-D__METALRT__ ";
+ global_defines += "#define __METALRT__\n";
if (motion_blur) {
- build_options += "-D__METALRT_MOTION__ ";
+ global_defines += "#define __METALRT_MOTION__\n";
}
}
# ifdef WITH_CYCLES_DEBUG
- build_options += "-D__KERNEL_DEBUG__ ";
+ global_defines += "#define __KERNEL_DEBUG__\n";
# endif
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL:
- build_options += "-D__KERNEL_METAL_INTEL__ ";
+ global_defines += "#define __KERNEL_METAL_INTEL__\n";
break;
case METAL_GPU_AMD:
- build_options += "-D__KERNEL_METAL_AMD__ ";
+ global_defines += "#define __KERNEL_METAL_AMD__\n";
break;
case METAL_GPU_APPLE:
- build_options += "-D__KERNEL_METAL_APPLE__ ";
+ global_defines += "#define __KERNEL_METAL_APPLE__\n";
break;
}
- /* reformat -D defines list into compilable form */
- vector<string> components;
- string_replace(build_options, "-D", "");
- string_split(components, build_options, " ");
+ string &source = this->source[pso_type];
+ source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
+ source = path_source_replace_includes(source, path_get("source"));
- string globalDefines;
- for (const string &component : components) {
- vector<string> assignments;
- string_split(assignments, component, "=");
- if (assignments.size() == 2)
- globalDefines += string_printf(
- "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str());
- else
- globalDefines += string_printf("#define %s\n", assignments[0].c_str());
+ /* Perform any required specialization on the source.
+ * With Metal function constants we can generate a single variant of the kernel source which
+ * can be repeatedly respecialized.
+ */
+ string baked_constants;
+
+ /* Replace specific KernelData "dot" dereferences with Metal function_constant identifiers of
+ * the same character length. Build a string of all active constant values which is then hashed
+ * in order to identify the PSO.
+ */
+ if (pso_type != PSO_GENERIC) {
+ const double starttime = time_dt();
+
+# define KERNEL_STRUCT_BEGIN(name, parent) \
+ string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
+
+ /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ baked_constants += string(#parent "." #name "=") + \
+ to_string(_type(launch_params.data.parent.name)) + "\n";
+
+# include "kernel/data_template.h"
+
+ /* Opt in to all of the available specializations. This can be made more granular for the
+ * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
+ * but the overhead should be negligible as these are very quick to (re)build and aren't
+ * serialized to disk via MTLBinaryArchives.
+ */
+ global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
+
+ metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
}
- string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n";
- source = path_source_replace_includes(source, path_get("source"));
-
- metal_printf("Global defines:\n%s\n", globalDefines.c_str());
+ source = global_defines + source;
+ metal_printf("================\n%s================\n\%s================\n",
+ global_defines.c_str(),
+ baked_constants.c_str());
- return source;
+ /* Generate an MD5 from the source and include any baked constants. This is used when caching
+ * PSOs. */
+ MD5Hash md5;
+ md5.append(baked_constants);
+ md5.append(source);
+ source_md5[pso_type] = md5.get_hex();
}
bool MetalDevice::load_kernels(const uint _kernel_features)
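For reference, a hand-expanded sketch of the two macros above for a single hypothetical kernel/data_template.h entry, KERNEL_STRUCT_BEGIN(KernelFilm, film) with KERNEL_STRUCT_MEMBER(film, float, exposure) (member chosen for illustration only):

/* KERNEL_STRUCT_BEGIN: same-length rename, so no characters shift in the large source string. */
string_replace_same_length(source, "kernel_data.film.", "kernel_data_film_");

/* KERNEL_STRUCT_MEMBER: record the baked value so it participates in the PSO hash below. */
baked_constants += string("film.exposure=") +
                   to_string(float(launch_params.data.film.exposure)) + "\n";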
@@ -283,28 +321,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
- source[PSO_GENERIC] = get_source(kernel_features);
-
- const double starttime = time_dt();
-
- mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
-
- metal_printf("Front-end compilation finished in %.1f seconds (generic)\n",
- time_dt() - starttime);
-
- MD5Hash md5;
- md5.append(source[PSO_GENERIC]);
- source_md5[PSO_GENERIC] = md5.get_hex();
-
- bool result = MetalDeviceKernels::load(this, false);
+ bool result = compile_and_load(PSO_GENERIC);
reserve_local_memory(kernel_features);
-
return result;
}
-id<MTLLibrary> MetalDevice::compile(string const &source)
+bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
{
+ make_source(pso_type, kernel_features);
+
+ if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
+ /* We already have a full set of matching pipelines which are cached or queued. */
+ metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
+ return true;
+ }
+
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.fastMathEnabled = YES;
@@ -312,19 +344,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source)
options.languageVersion = MTLLanguageVersion2_4;
}
+ if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
+ path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
+ source[pso_type]);
+ }
+
+ const double starttime = time_dt();
+
NSError *error = NULL;
- id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
- options:options
- error:&error];
+ mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
+ options:options
+ error:&error];
- if (!mtlLibrary) {
+ if (!mtlLibrary[pso_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
+ metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
+
[options release];
- return mtlLibrary;
+ return MetalDeviceKernels::load(this, pso_type);
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -631,6 +674,58 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
+void MetalDevice::optimize_for_scene(Scene *scene)
+{
+ MetalPipelineType specialization_level = kernel_specialization_level;
+
+ if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
+ return;
+ }
+
+ /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
+ * synchronously. */
+ compile_and_load(PSO_SPECIALIZED_INTERSECT);
+
+ if (specialization_level < PSO_SPECIALIZED_SHADE) {
+ return;
+ }
+ if (!scene->params.background) {
+ /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
+ * build. */
+ return;
+ }
+
+ /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously,
+ * and only if there isn't an existing load in flight.
+ */
+ auto specialize_shade_fn = ^() {
+ compile_and_load(PSO_SPECIALIZED_SHADE);
+ async_compile_and_load = false;
+ };
+
+ bool async_specialize_shade = true;
+
+ /* Block if per-kernel profiling is enabled, to ensure a steady rendering rate. */
+ if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
+ async_specialize_shade = false;
+ }
+
+ if (async_specialize_shade) {
+ if (!async_compile_and_load) {
+ async_compile_and_load = true;
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
+ specialize_shade_fn);
+ }
+ else {
+ metal_printf(
+ "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
+ }
+ }
+ else {
+ specialize_shade_fn();
+ }
+}
+
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
{
if (strcmp(name, "data") == 0) {
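The asynchronous path in optimize_for_scene() keeps at most one PSO_SPECIALIZED_SHADE rebuild in flight via the new async_compile_and_load flag. A stripped-down sketch of that pattern (the function name is illustrative; it assumes <atomic> and libdispatch, as used above):

static std::atomic_bool async_compile_and_load = false;

static void request_shade_specialization(MetalDevice *device)
{
  if (async_compile_and_load) {
    /* A previous request is still compiling; drop this one. */
    return;
  }
  async_compile_and_load = true;
  dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
    device->compile_and_load(PSO_SPECIALIZED_SHADE);
    async_compile_and_load = false;
  });
}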
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index 69b2a686ecc..11393f8b7e1 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -31,7 +31,7 @@ enum {
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
/* Pipeline State Object types */
-enum {
+enum MetalPipelineType {
/* A kernel that can be used with all scenes, supporting all features.
* It is slow to compile, but only needs to be compiled once and is then
* cached for future render sessions. This allows a render to get underway
@@ -39,28 +39,33 @@ enum {
*/
PSO_GENERIC,
- /* A kernel that is relatively quick to compile, but is specialized for the
- * scene being rendered. It only contains the functionality and even baked in
- * constants for values that means it needs to be recompiled whenever a
- * dependent setting is changed. The render performance of this kernel is
- * significantly faster though, and justifies the extra compile time.
+ /* An intersection kernel that is very quick to specialize and results in faster intersection
+ * kernel performance. It uses Metal function constants to replace several KernelData variables
+ * with fixed constants.
+ */
+ PSO_SPECIALIZED_INTERSECT,
+
+ /* A shading kernel that is slow to specialize, but results in faster shading kernel
+ * performance. It uses Metal function constants to replace several KernelData variables with
+ * fixed constants and to short-circuit all unused SVM node case handlers.
+ */
- /* METAL_WIP: This isn't used and will require more changes to enable. */
- PSO_SPECIALISED,
+ PSO_SPECIALIZED_SHADE,
PSO_NUM
};
-const char *kernel_type_as_string(int kernel_type);
+const char *kernel_type_as_string(MetalPipelineType pso_type);
struct MetalKernelPipeline {
void compile();
id<MTLLibrary> mtlLibrary = nil;
- bool scene_specialized;
+ MetalPipelineType pso_type;
string source_md5;
+ size_t usage_count = 0;
+ KernelData kernel_data_;
bool use_metalrt;
bool metalrt_hair;
bool metalrt_hair_thick;
@@ -75,6 +80,8 @@ struct MetalKernelPipeline {
id<MTLComputePipelineState> pipeline = nil;
int num_threads_per_block = 0;
+ bool should_use_binary_archive() const;
+
string error_str;
API_AVAILABLE(macos(11.0))
@@ -85,7 +92,8 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
-bool load(MetalDevice *device, bool scene_specialized);
+bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
+bool load(MetalDevice *device, MetalPipelineType pso_type);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
} /* namespace MetalDeviceKernels */
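Note that the enumerators double as ordered specialization levels (PSO_GENERIC = 0, PSO_SPECIALIZED_INTERSECT = 1, PSO_SPECIALIZED_SHADE = 2), which is what makes the atoi() parse of CYCLES_METAL_SPECIALIZATION_LEVEL and the comparisons in optimize_for_scene() work. An illustrative sketch of that usage (not code from the patch):

MetalPipelineType level = (MetalPipelineType)atoi("2"); /* e.g. CYCLES_METAL_SPECIALIZATION_LEVEL=2 */
if (level >= PSO_SPECIALIZED_INTERSECT) {
  /* Build specialized intersection kernels (always synchronous). */
}
if (level >= PSO_SPECIALIZED_SHADE) {
  /* Additionally build specialized shading kernels (possibly async). */
}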
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index fec4cd80466..385cb412b06 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -5,6 +5,7 @@
# include "device/metal/kernel.h"
# include "device/metal/device_impl.h"
+# include "kernel/device/metal/function_constants.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/tbb.h"
@@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN
/* limit to 2 MTLCompiler instances */
int max_mtlcompiler_threads = 2;
-const char *kernel_type_as_string(int kernel_type)
+const char *kernel_type_as_string(MetalPipelineType pso_type)
{
- switch (kernel_type) {
+ switch (pso_type) {
case PSO_GENERIC:
return "PSO_GENERIC";
- case PSO_SPECIALISED:
- return "PSO_SPECIALISED";
+ case PSO_SPECIALIZED_INTERSECT:
+ return "PSO_SPECIALIZED_INTERSECT";
+ case PSO_SPECIALIZED_SHADE:
+ return "PSO_SPECIALIZED_SHADE";
default:
assert(0);
}
@@ -50,7 +53,11 @@ struct ShaderCache {
/* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
* device. */
- void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
+ void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
+
+ bool should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type);
void wait_for_all();
@@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index)
}
}
-void ShaderCache::load_kernel(DeviceKernel device_kernel,
- MetalDevice *device,
- bool scene_specialized)
+bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
{
- {
- /* create compiler threads on first run */
- thread_scoped_lock lock(cache_mutex);
- if (compile_threads.empty()) {
- running = true;
- for (int i = 0; i < max_mtlcompiler_threads; i++) {
- compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
- }
- }
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ /* Skip megakernel. */
+ return false;
}
- if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- /* skip megakernel */
- return;
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
+ /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
+ return false;
+ }
}
- if (scene_specialized) {
+ if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- return;
+ return false;
+ }
+
+ /* Only specialize shading / intersection kernels as requested. */
+ bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
+ if (is_shade_pso != is_shade_kernel) {
+ return false;
}
}
@@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
/* check whether the kernel has already been requested / cached */
thread_scoped_lock lock(cache_mutex);
for (auto &pipeline : pipelines[device_kernel]) {
- if (scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
- /* we already requested a pipeline that is specialized for this kernel data */
- metal_printf("Specialized kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ if (pipeline->source_md5 == device->source_md5[pso_type]) {
+ return false;
}
- else {
- if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
- /* we already requested a generic pipeline for this kernel */
- metal_printf("Generic kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ }
+ }
+
+ return true;
+}
+
+void ShaderCache::load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
+{
+ {
+ /* create compiler threads on first run */
+ thread_scoped_lock lock(cache_mutex);
+ if (compile_threads.empty()) {
+ running = true;
+ for (int i = 0; i < max_mtlcompiler_threads; i++) {
+ compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
}
}
}
+ if (!should_load_kernel(device_kernel, device, pso_type)) {
+ return;
+ }
+
incomplete_requests++;
PipelineRequest request;
request.pipeline = new MetalKernelPipeline;
- request.pipeline->scene_specialized = scene_specialized;
+ memcpy(&request.pipeline->kernel_data_,
+ &device->launch_params.data,
+ sizeof(request.pipeline->kernel_data_));
+ request.pipeline->pso_type = pso_type;
request.pipeline->mtlDevice = mtlDevice;
- request.pipeline->source_md5 =
- device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
- request.pipeline->mtlLibrary =
- device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+ request.pipeline->source_md5 = device->source_md5[pso_type];
+ request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
@@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
{
thread_scoped_lock lock(cache_mutex);
- pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
+ auto &collection = pipelines[device_kernel];
+
+ /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
+ int max_entries_of_same_pso_type = 3;
+ for (int i = (int)collection.size() - 1; i >= 0; i--) {
+ if (collection[i]->pso_type == pso_type) {
+ max_entries_of_same_pso_type -= 1;
+ if (max_entries_of_same_pso_type == 0) {
+ metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
+ kernel_type_as_string(pso_type),
+ device_kernel_as_string(device_kernel));
+ collection.erase(collection.begin() + i);
+ break;
+ }
+ }
+ }
+
+ collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
request_queue.push_back(request);
}
cond_var.notify_one();
@@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
continue;
}
- if (pipeline->scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+ if (pipeline->pso_type != PSO_GENERIC) {
+ if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+ pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
best_pipeline = pipeline.get();
}
}
@@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
}
}
+ if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
+ metal_printf("Swapping in %s version of %s\n",
+ kernel_type_as_string(best_pipeline->pso_type),
+ device_kernel_as_string(kernel));
+ }
+ best_pipeline->usage_count += 1;
+
return best_pipeline;
}
-void MetalKernelPipeline::compile()
+bool MetalKernelPipeline::should_use_binary_archive() const
{
- int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
+ if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+ if (atoi(str) != 0) {
+ /* Don't archive if we have opted out by env var. */
+ return false;
+ }
+ }
+
+ if (pso_type == PSO_GENERIC) {
+ /* Archive the generic kernels. */
+ return true;
+ }
+
+ if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
+ device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Archive all shade kernels - they take a long time to compile. */
+ return true;
+ }
+
+ /* The remaining kernels are all fast to compile. They may get cached by the system shader cache,
+ * but will be quick to regenerate if not. */
+ return false;
+}
+
+static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
+{
+ MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];
+
+ MTLDataType MTLDataType_int = MTLDataTypeInt;
+ MTLDataType MTLDataType_float = MTLDataTypeFloat;
+ MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
+ KernelData zero_data = {0};
+ if (!data) {
+ data = &zero_data;
+ }
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ [constant_values setConstantValue:&data->parent.name \
+ type:MTLDataType_##_type \
+ atIndex:KernelData_##parent##_##name];
+
+# include "kernel/data_template.h"
+
+ return constant_values;
+}
+
+void MetalKernelPipeline::compile()
+{
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
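GetConstantValues() above is driven by the same KERNEL_STRUCT_MEMBER X-macro as the device-side declarations; a hand-expanded sketch of the call it generates for one hypothetical (integrator, int, max_bounce) entry:

[constant_values setConstantValue:&data->integrator.max_bounce
                             type:MTLDataTypeInt /* via the local MTLDataType_int alias */
                          atIndex:KernelData_integrator_max_bounce];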
@@ -281,6 +371,17 @@ void MetalKernelPipeline::compile()
if (@available(macOS 11.0, *)) {
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
func_desc.name = entryPoint;
+
+ if (pso_type == PSO_SPECIALIZED_SHADE) {
+ func_desc.constantValues = GetConstantValues(&kernel_data_);
+ }
+ else if (pso_type == PSO_SPECIALIZED_INTERSECT) {
+ func_desc.constantValues = GetConstantValues(&kernel_data_);
+ }
+ else {
+ func_desc.constantValues = GetConstantValues();
+ }
+
function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
}
@@ -427,10 +528,7 @@ void MetalKernelPipeline::compile()
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
- bool use_binary_archive = true;
- if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
- use_binary_archive = (atoi(str) == 0);
- }
+ bool use_binary_archive = should_use_binary_archive();
id<MTLBinaryArchive> archive = nil;
string metalbin_path;
@@ -608,19 +706,32 @@ void MetalKernelPipeline::compile()
}
}
-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
+ const double starttime = time_dt();
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
- shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
+ shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
}
- if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
- shader_cache->wait_for_all();
- }
+ shader_cache->wait_for_all();
+ metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
return true;
}
+bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
+{
+ auto shader_cache = get_shader_cache(device->mtlDevice);
+ for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
+ if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
+ return true;
+ }
+ }
+ return false;
+}
+
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
DeviceKernel kernel)
{
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 527cc4ec111..21a78722c0d 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -79,6 +79,7 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS
device/metal/compat.h
device/metal/context_begin.h
device/metal/context_end.h
+ device/metal/function_constants.h
device/metal/globals.h
)
diff --git a/intern/cycles/kernel/device/metal/function_constants.h b/intern/cycles/kernel/device/metal/function_constants.h
new file mode 100644
index 00000000000..f4001735672
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/function_constants.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+enum {
+#define KERNEL_STRUCT_MEMBER(parent, type, name) KernelData_##parent##_##name,
+#include "kernel/data_template.h"
+};
+
+#ifdef __KERNEL_METAL__
+# define KERNEL_STRUCT_MEMBER(parent, type, name) \
+ constant type kernel_data_##parent##_##name \
+ [[function_constant(KernelData_##parent##_##name)]];
+# include "kernel/data_template.h"
+#endif
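kernel/data_template.h is included twice in this new header: the first pass builds an enum of constant indices, the second (Metal-only) pass declares one function constant per member at that index. A hand-expanded sketch for a hypothetical (integrator, int, max_bounce) entry:

/* Pass 1: index enum, usable from both host and device code. */
enum {
  KernelData_integrator_max_bounce,
  /* ... one enumerator per KERNEL_STRUCT_MEMBER ... */
};

/* Pass 2 (__KERNEL_METAL__ only): the constant a specialized PSO bakes in. */
constant int kernel_data_integrator_max_bounce
    [[function_constant(KernelData_integrator_max_bounce)]];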
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index 3c31dc3354c..74b4b079a32 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -5,6 +5,7 @@
#include "kernel/device/metal/compat.h"
#include "kernel/device/metal/globals.h"
+#include "kernel/device/metal/function_constants.h"
#include "kernel/device/gpu/kernel.h"
/* MetalRT intersection handlers */
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 9840cda3655..9d6d3e9222c 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -204,7 +204,14 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
-#define SVM_CASE(node) case node:
+#ifdef __KERNEL_USE_DATA_CONSTANTS__
+# define SVM_CASE(node) \
+ case node: \
+ if (!kernel_data_svm_usage_##node) \
+ break;
+#else
+# define SVM_CASE(node) case node:
+#endif
/* Main Interpreter Loop */
template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
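With __KERNEL_USE_DATA_CONSTANTS__ defined, every SVM_CASE gains a per-node guard backed by a function constant, which lets a specialized shade PSO compile unused node handlers down to an immediate break. A hand-expanded sketch for one node (NODE_CLOSURE_BSDF is used only as an example):

case NODE_CLOSURE_BSDF:
  if (!kernel_data_svm_usage_NODE_CLOSURE_BSDF)
    break;
  /* ...the node handler follows and is dead-stripped when the constant is false... */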
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 72cee6ae344..62ac75e5e4d 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -1136,7 +1136,13 @@ typedef enum KernelBVHLayout {
} \
; \
static_assert_align(name, 16);
-#define KERNEL_STRUCT_MEMBER(parent, type, name) type name;
+
+#ifdef __KERNEL_USE_DATA_CONSTANTS__
+# define KERNEL_STRUCT_MEMBER(parent, type, name) type __unused_##name;
+#else
+# define KERNEL_STRUCT_MEMBER(parent, type, name) type name;
+#endif
+
#include "kernel/data_template.h"
typedef struct KernelTables {
diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp
index eedb2a4fa3a..18cd665ac74 100644
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -369,6 +369,8 @@ void Scene::device_update(Device *device_, Progress &progress)
device->const_copy_to("data", &dscene.data, sizeof(dscene.data));
}
+ device->optimize_for_scene(this);
+
if (print_stats) {
size_t mem_used = util_guarded_get_mem_used();
size_t mem_peak = util_guarded_get_mem_peak();
diff --git a/intern/cycles/util/string.cpp b/intern/cycles/util/string.cpp
index 66ff866ee10..0c318cea44a 100644
--- a/intern/cycles/util/string.cpp
+++ b/intern/cycles/util/string.cpp
@@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other)
}
}
+void string_replace_same_length(string &haystack, const string &needle, const string &other)
+{
+ assert(needle.size() == other.size());
+ size_t pos = 0;
+ while (pos != string::npos) {
+ pos = haystack.find(needle, pos);
+ if (pos != string::npos) {
+ memcpy(haystack.data() + pos, other.data(), other.size());
+ pos += other.size();
+ }
+ }
+}
+
string string_remove_trademark(const string &s)
{
string result = s;
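A minimal usage sketch of the new helper: because needle and replacement are the same length, each occurrence is overwritten in place with memcpy and no characters shift position.

string s = "kernel_data.film.exposure + kernel_data.film.exposure";
string_replace_same_length(s, "kernel_data.film.", "kernel_data_film_");
/* s is now "kernel_data_film_exposure + kernel_data_film_exposure" */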
@@ -164,6 +177,11 @@ string to_string(const char *str)
return string(str);
}
+string to_string(const float4 &v)
+{
+ return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w);
+}
+
string string_to_lower(const string &s)
{
string r = s;
diff --git a/intern/cycles/util/string.h b/intern/cycles/util/string.h
index a74feee1750..ecbe9e106c6 100644
--- a/intern/cycles/util/string.h
+++ b/intern/cycles/util/string.h
@@ -38,12 +38,14 @@ void string_split(vector<string> &tokens,
const string &separators = "\t ",
bool skip_empty_tokens = true);
void string_replace(string &haystack, const string &needle, const string &other);
+void string_replace_same_length(string &haystack, const string &needle, const string &other);
bool string_startswith(string_view s, string_view start);
bool string_endswith(string_view s, string_view end);
string string_strip(const string &s);
string string_remove_trademark(const string &s);
string string_from_bool(const bool var);
string to_string(const char *str);
+string to_string(const float4 &v);
string string_to_lower(const string &s);
/* Wide char strings are only used on Windows to deal with non-ASCII