git.blender.org/blender.git
Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/cpu/device_impl.cpp        2
-rw-r--r--  intern/cycles/device/device.h                    6
-rw-r--r--  intern/cycles/device/metal/device_impl.h        11
-rw-r--r--  intern/cycles/device/metal/device_impl.mm      195
-rw-r--r--  intern/cycles/device/metal/kernel.h             30
-rw-r--r--  intern/cycles/device/metal/kernel.mm           221
-rw-r--r--  intern/cycles/device/metal/queue.h               1
-rw-r--r--  intern/cycles/device/metal/queue.mm              9
-rw-r--r--  intern/cycles/device/metal/util.h                1
-rw-r--r--  intern/cycles/device/metal/util.mm              15
-rw-r--r--  intern/cycles/device/oneapi/device.cpp           6
-rw-r--r--  intern/cycles/device/oneapi/device_impl.cpp      4
-rw-r--r--  intern/cycles/device/oneapi/dll_interface.h      2
-rw-r--r--  intern/cycles/device/oneapi/queue.h              2
-rw-r--r--  intern/cycles/device/optix/device_impl.cpp       3
-rw-r--r--  intern/cycles/device/optix/queue.cpp             1
-rw-r--r--  intern/cycles/device/queue.h                     7
17 files changed, 384 insertions, 132 deletions
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index d4f0532aa5e..1e4b9baa0c0 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -197,7 +197,7 @@ void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
// Update scene handle (since it is different for each device on multi devices)
KernelData *const data = (KernelData *)host;
- data->bvh.scene = embree_scene;
+ data->device_bvh = embree_scene;
}
#endif
kernel_const_copy(&kernel_globals, name, host, size);
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 340be85e853..cdb13ca0a97 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -29,6 +29,7 @@ class DeviceQueue;
class Progress;
class CPUKernels;
class CPUKernelThreadGlobals;
+class Scene;
/* Device Types */
@@ -186,6 +187,11 @@ class Device {
return 0;
}
+ /* Called after kernel texture setup, and prior to integrator state setup. */
+ virtual void optimize_for_scene(Scene * /*scene*/)
+ {
+ }
+
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
/* Memory is always resident if this is not a multi device, regardless of whether the pointer
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 4aea8d697a5..99e60d3a788 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -75,7 +75,8 @@ class MetalDevice : public Device {
std::vector<id<MTLTexture>> texture_slot_map;
bool use_metalrt = false;
- bool use_function_specialisation = false;
+ MetalPipelineType kernel_specialization_level = PSO_GENERIC;
+ std::atomic_bool async_compile_and_load = false;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
@@ -91,9 +92,7 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
- string get_source(const uint kernel_features);
-
- string compile_kernel(const uint kernel_features, const char *name);
+ void make_source(MetalPipelineType pso_type, const uint kernel_features);
virtual bool load_kernels(const uint kernel_features) override;
@@ -111,7 +110,9 @@ class MetalDevice : public Device {
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
- id<MTLLibrary> compile(string const &source);
+ virtual void optimize_for_scene(Scene *scene) override;
+
+ bool compile_and_load(MetalPipelineType pso_type);
/* ------------------------------------------------------------------ */
/* low-level memory management */
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 87c83242240..d1250b83d22 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -6,6 +6,8 @@
# include "device/metal/device_impl.h"
# include "device/metal/device.h"
+# include "scene/scene.h"
+
# include "util/debug.h"
# include "util/md5.h"
# include "util/path.h"
@@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
use_metalrt = info.use_metalrt;
+
+ /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
+ * quickly. */
+ kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
break;
}
}
@@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
capture_enabled = true;
}
+ if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
+ kernel_specialization_level = (MetalPipelineType)atoi(envstr);
+ }
+ metal_printf("kernel_specialization_level = %s\n",
+ kernel_type_as_string(
+ (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
+
MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
arg_desc_params.dataType = MTLDataTypePointer;
arg_desc_params.access = MTLArgumentAccessReadOnly;
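Note: the CYCLES_METAL_SPECIALIZATION_LEVEL override above is parsed with atoi and cast straight to MetalPipelineType; only the debug print clamps it with min(). A minimal stand-alone sketch of reading and clamping such an override, using the PSO_* ordering declared in kernel.h below. specialization_level_from_env is a hypothetical helper, not part of this patch:

    #include <algorithm>
    #include <cstdlib>

    enum MetalPipelineType { PSO_GENERIC, PSO_SPECIALIZED_INTERSECT, PSO_SPECIALIZED_SHADE, PSO_NUM };

    /* Hypothetical helper: read the integer override from the environment and
     * clamp it into the valid range of pipeline types. */
    MetalPipelineType specialization_level_from_env(MetalPipelineType fallback)
    {
      if (const char *envstr = std::getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
        const int level = std::clamp(std::atoi(envstr), 0, (int)PSO_NUM - 1);
        return (MetalPipelineType)level;
      }
      return fallback;
    }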
@@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
-string MetalDevice::get_source(const uint kernel_features)
+void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
- string build_options;
-
+ string global_defines;
if (use_adaptive_compilation()) {
- build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_metalrt) {
- build_options += "-D__METALRT__ ";
+ global_defines += "#define __METALRT__\n";
if (motion_blur) {
- build_options += "-D__METALRT_MOTION__ ";
+ global_defines += "#define __METALRT_MOTION__\n";
}
}
# ifdef WITH_CYCLES_DEBUG
- build_options += "-D__KERNEL_DEBUG__ ";
+ global_defines += "#define __KERNEL_DEBUG__\n";
# endif
switch (device_vendor) {
default:
break;
case METAL_GPU_INTEL:
- build_options += "-D__KERNEL_METAL_INTEL__ ";
+ global_defines += "#define __KERNEL_METAL_INTEL__\n";
break;
case METAL_GPU_AMD:
- build_options += "-D__KERNEL_METAL_AMD__ ";
+ global_defines += "#define __KERNEL_METAL_AMD__\n";
break;
case METAL_GPU_APPLE:
- build_options += "-D__KERNEL_METAL_APPLE__ ";
+ global_defines += "#define __KERNEL_METAL_APPLE__\n";
break;
}
- /* reformat -D defines list into compilable form */
- vector<string> components;
- string_replace(build_options, "-D", "");
- string_split(components, build_options, " ");
+ string &source = this->source[pso_type];
+ source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
+ source = path_source_replace_includes(source, path_get("source"));
- string globalDefines;
- for (const string &component : components) {
- vector<string> assignments;
- string_split(assignments, component, "=");
- if (assignments.size() == 2)
- globalDefines += string_printf(
- "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str());
- else
- globalDefines += string_printf("#define %s\n", assignments[0].c_str());
+ /* Perform any required specialization on the source.
+ * With Metal function constants we can generate a single variant of the kernel source which can
+ * be repeatedly respecialized.
+ */
+ string baked_constants;
+
+ /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
+ * the same character length. Build a string of all active constant values which is then hashed
+ * in order to identify the PSO.
+ */
+ if (pso_type != PSO_GENERIC) {
+ const double starttime = time_dt();
+
+# define KERNEL_STRUCT_BEGIN(name, parent) \
+ string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
+
+ /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ baked_constants += string(#parent "." #name "=") + \
+ to_string(_type(launch_params.data.parent.name)) + "\n";
+
+# include "kernel/data_template.h"
+
+ /* Opt in to all of the available specializations. This can be made more granular for the
+ * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
+ * but the overhead should be negligible as these are very quick to (re)build and aren't
+ * serialized to disk via MTLBinaryArchives.
+ */
+ global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
+
+ metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
}
- string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n";
- source = path_source_replace_includes(source, path_get("source"));
-
- metal_printf("Global defines:\n%s\n", globalDefines.c_str());
+ source = global_defines + source;
+ metal_printf("================\n%s================\n\%s================\n",
+ global_defines.c_str(),
+ baked_constants.c_str());
- return source;
+ /* Generate an MD5 from the source and include any baked constants. This is used when caching
+ * PSOs. */
+ MD5Hash md5;
+ md5.append(baked_constants);
+ md5.append(source);
+ source_md5[pso_type] = md5.get_hex();
}
bool MetalDevice::load_kernels(const uint _kernel_features)
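Note: the specialization pass in make_source works because each replacement identifier has exactly the same character length as the "kernel_data.<parent>." prefix it substitutes, so no other byte of the large kernel source moves, while the baked constant values only feed the MD5 used to identify the PSO. A self-contained sketch of the same-length replacement idea; replace_same_length and the sample source line are hypothetical stand-ins for the Cycles utilities:

    #include <cassert>
    #include <cstdio>
    #include <string>

    /* Replace every occurrence of `from` with `to`, requiring equal lengths so
     * that no other byte in the buffer shifts (stand-in for
     * string_replace_same_length). */
    static void replace_same_length(std::string &s, const std::string &from, const std::string &to)
    {
      assert(from.size() == to.size());
      for (size_t pos = s.find(from); pos != std::string::npos; pos = s.find(from, pos + to.size())) {
        s.replace(pos, from.size(), to);
      }
    }

    int main()
    {
      std::string src = "float exposure = kernel_data.film.exposure;";
      replace_same_length(src, "kernel_data.film.", "kernel_data_film_");
      /* Prints "float exposure = kernel_data_film_exposure;" - with
       * __KERNEL_USE_DATA_CONSTANTS__ that identifier can then resolve to a
       * Metal function constant instead of a struct load. */
      std::printf("%s\n", src.c_str());
      return 0;
    }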
@@ -279,28 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
* active, but may still need to be rendered without motion blur if that isn't active as well. */
motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
- source[PSO_GENERIC] = get_source(kernel_features);
-
- const double starttime = time_dt();
-
- mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
-
- metal_printf("Front-end compilation finished in %.1f seconds (generic)\n",
- time_dt() - starttime);
-
- MD5Hash md5;
- md5.append(source[PSO_GENERIC]);
- source_md5[PSO_GENERIC] = md5.get_hex();
-
- bool result = MetalDeviceKernels::load(this, false);
+ bool result = compile_and_load(PSO_GENERIC);
reserve_local_memory(kernel_features);
-
return result;
}
-id<MTLLibrary> MetalDevice::compile(string const &source)
+bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
{
+ make_source(pso_type, kernel_features);
+
+ if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
+ /* We already have a full set of matching pipelines which are cached or queued. */
+ metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
+ return true;
+ }
+
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.fastMathEnabled = YES;
@@ -308,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source)
options.languageVersion = MTLLanguageVersion2_4;
}
+ if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
+ path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
+ source[pso_type]);
+ }
+
+ const double starttime = time_dt();
+
NSError *error = NULL;
- id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
- options:options
- error:&error];
+ mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
+ options:options
+ error:&error];
- if (!mtlLibrary) {
+ if (!mtlLibrary[pso_type]) {
NSString *err = [error localizedDescription];
set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
}
+ metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
+
[options release];
- return mtlLibrary;
+ return MetalDeviceKernels::load(this, pso_type);
}
void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -627,6 +670,58 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
return 0;
}
+void MetalDevice::optimize_for_scene(Scene *scene)
+{
+ MetalPipelineType specialization_level = kernel_specialization_level;
+
+ if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
+ return;
+ }
+
+ /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
+ * synchronously. */
+ compile_and_load(PSO_SPECIALIZED_INTERSECT);
+
+ if (specialization_level < PSO_SPECIALIZED_SHADE) {
+ return;
+ }
+ if (!scene->params.background) {
+ /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
+ * build. */
+ return;
+ }
+
+ /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
+ * only if there isn't an existing load in flight.
+ */
+ auto specialize_shade_fn = ^() {
+ compile_and_load(PSO_SPECIALIZED_SHADE);
+ async_compile_and_load = false;
+ };
+
+ bool async_specialize_shade = true;
+
+ /* Block if per-kernel profiling is enabled (to ensure a steady rendering rate). */
+ if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
+ async_specialize_shade = false;
+ }
+
+ if (async_specialize_shade) {
+ if (!async_compile_and_load) {
+ async_compile_and_load = true;
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
+ specialize_shade_fn);
+ }
+ else {
+ metal_printf(
+ "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
+ }
+ }
+ else {
+ specialize_shade_fn();
+ }
+}
+
void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
{
if (strcmp(name, "data") == 0) {
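Note: optimize_for_scene guards the background PSO_SPECIALIZED_SHADE build with the async_compile_and_load flag so that at most one compile is in flight; repeat requests are simply dropped. A condensed sketch of that pattern using plain C++ threads instead of libdispatch, with hypothetical names and a compare_exchange rather than the separate check-and-set used in the patch:

    #include <atomic>
    #include <thread>

    std::atomic_bool async_compile_in_flight{false};

    /* Start at most one background specialization at a time; requests that
     * arrive while one is still running are dropped. */
    void request_async_specialization()
    {
      bool expected = false;
      if (async_compile_in_flight.compare_exchange_strong(expected, true)) {
        std::thread([] {
          /* compile_and_load(PSO_SPECIALIZED_SHADE) would run here. */
          async_compile_in_flight = false; /* allow the next request */
        }).detach();
      }
    }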
@@ -652,7 +747,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
/* Update data storage pointers in launch parameters. */
if (strcmp(name, "integrator_state") == 0) {
/* IntegratorStateGPU is contiguous pointers */
- const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+ const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
update_launch_pointers(
offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
}
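Note: the switch from sizeof(IntegratorStateGPU) to offsetof(IntegratorStateGPU, sort_partition_divisor) here (and in queue.mm below) reflects that the struct now ends with plain, non-pointer data starting at sort_partition_divisor, so only the leading block of contiguous device pointers is rewritten per device. A reduced illustration with a hypothetical struct, not the real IntegratorStateGPU layout:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    using device_ptr = uint64_t;

    /* Hypothetical layout: a run of device pointers followed by plain data. */
    struct ExampleStateGPU {
      device_ptr path_buffer;
      device_ptr shadow_buffer;
      int sort_partition_divisor; /* first non-pointer member */
    };

    int main()
    {
      /* Only the leading pointers are per-device; stop before the plain data. */
      const size_t pointer_block_size = offsetof(ExampleStateGPU, sort_partition_divisor);
      const size_t num_pointers = pointer_block_size / sizeof(device_ptr);
      std::printf("rewrite %zu pointers (%zu bytes), leave the rest untouched\n",
                  num_pointers, pointer_block_size);
      return 0;
    }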
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index 69b2a686ecc..11393f8b7e1 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -31,7 +31,7 @@ enum {
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
/* Pipeline State Object types */
-enum {
+enum MetalPipelineType {
/* A kernel that can be used with all scenes, supporting all features.
* It is slow to compile, but only needs to be compiled once and is then
* cached for future render sessions. This allows a render to get underway
@@ -39,28 +39,33 @@ enum {
*/
PSO_GENERIC,
- /* A kernel that is relatively quick to compile, but is specialized for the
- * scene being rendered. It only contains the functionality and even baked in
- * constants for values that means it needs to be recompiled whenever a
- * dependent setting is changed. The render performance of this kernel is
- * significantly faster though, and justifies the extra compile time.
+ /* An intersection kernel that is very quick to specialize and results in faster intersection
+ * kernel performance. It uses Metal function constants to replace several KernelData variables
+ * with fixed constants.
+ */
+ PSO_SPECIALIZED_INTERSECT,
+
+ /* A shading kernel that is slow to specialize, but results in faster shading kernel
+ * performance. It uses Metal function constants to replace several KernelData variables with
+ * fixed constants and to short-circuit all unused SVM node case handlers.
*/
- /* METAL_WIP: This isn't used and will require more changes to enable. */
- PSO_SPECIALISED,
+ PSO_SPECIALIZED_SHADE,
PSO_NUM
};
-const char *kernel_type_as_string(int kernel_type);
+const char *kernel_type_as_string(MetalPipelineType pso_type);
struct MetalKernelPipeline {
void compile();
id<MTLLibrary> mtlLibrary = nil;
- bool scene_specialized;
+ MetalPipelineType pso_type;
string source_md5;
+ size_t usage_count = 0;
+ KernelData kernel_data_;
bool use_metalrt;
bool metalrt_hair;
bool metalrt_hair_thick;
@@ -75,6 +80,8 @@ struct MetalKernelPipeline {
id<MTLComputePipelineState> pipeline = nil;
int num_threads_per_block = 0;
+ bool should_use_binary_archive() const;
+
string error_str;
API_AVAILABLE(macos(11.0))
@@ -85,7 +92,8 @@ struct MetalKernelPipeline {
/* Cache of Metal kernels for each DeviceKernel. */
namespace MetalDeviceKernels {
-bool load(MetalDevice *device, bool scene_specialized);
+bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
+bool load(MetalDevice *device, MetalPipelineType pso_type);
const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
} /* namespace MetalDeviceKernels */
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index fec4cd80466..385cb412b06 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -5,6 +5,7 @@
# include "device/metal/kernel.h"
# include "device/metal/device_impl.h"
+# include "kernel/device/metal/function_constants.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/tbb.h"
@@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN
/* limit to 2 MTLCompiler instances */
int max_mtlcompiler_threads = 2;
-const char *kernel_type_as_string(int kernel_type)
+const char *kernel_type_as_string(MetalPipelineType pso_type)
{
- switch (kernel_type) {
+ switch (pso_type) {
case PSO_GENERIC:
return "PSO_GENERIC";
- case PSO_SPECIALISED:
- return "PSO_SPECIALISED";
+ case PSO_SPECIALIZED_INTERSECT:
+ return "PSO_SPECIALIZED_INTERSECT";
+ case PSO_SPECIALIZED_SHADE:
+ return "PSO_SPECIALIZED_SHADE";
default:
assert(0);
}
@@ -50,7 +53,11 @@ struct ShaderCache {
/* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
* device. */
- void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
+ void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
+
+ bool should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type);
void wait_for_all();
@@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index)
}
}
-void ShaderCache::load_kernel(DeviceKernel device_kernel,
- MetalDevice *device,
- bool scene_specialized)
+bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
{
- {
- /* create compiler threads on first run */
- thread_scoped_lock lock(cache_mutex);
- if (compile_threads.empty()) {
- running = true;
- for (int i = 0; i < max_mtlcompiler_threads; i++) {
- compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
- }
- }
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ /* Skip megakernel. */
+ return false;
}
- if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- /* skip megakernel */
- return;
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
+ /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
+ return false;
+ }
}
- if (scene_specialized) {
+ if (pso_type != PSO_GENERIC) {
/* Only specialize kernels where it can make an impact. */
if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- return;
+ return false;
+ }
+
+ /* Only specialize shading / intersection kernels as requested. */
+ bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
+ if (is_shade_pso != is_shade_kernel) {
+ return false;
}
}
@@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
/* check whether the kernel has already been requested / cached */
thread_scoped_lock lock(cache_mutex);
for (auto &pipeline : pipelines[device_kernel]) {
- if (scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
- /* we already requested a pipeline that is specialized for this kernel data */
- metal_printf("Specialized kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ if (pipeline->source_md5 == device->source_md5[pso_type]) {
+ return false;
}
- else {
- if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
- /* we already requested a generic pipeline for this kernel */
- metal_printf("Generic kernel already requested (%s)\n",
- device_kernel_as_string(device_kernel));
- return;
- }
+ }
+ }
+
+ return true;
+}
+
+void ShaderCache::load_kernel(DeviceKernel device_kernel,
+ MetalDevice *device,
+ MetalPipelineType pso_type)
+{
+ {
+ /* create compiler threads on first run */
+ thread_scoped_lock lock(cache_mutex);
+ if (compile_threads.empty()) {
+ running = true;
+ for (int i = 0; i < max_mtlcompiler_threads; i++) {
+ compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
}
}
}
+ if (!should_load_kernel(device_kernel, device, pso_type)) {
+ return;
+ }
+
incomplete_requests++;
PipelineRequest request;
request.pipeline = new MetalKernelPipeline;
- request.pipeline->scene_specialized = scene_specialized;
+ memcpy(&request.pipeline->kernel_data_,
+ &device->launch_params.data,
+ sizeof(request.pipeline->kernel_data_));
+ request.pipeline->pso_type = pso_type;
request.pipeline->mtlDevice = mtlDevice;
- request.pipeline->source_md5 =
- device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
- request.pipeline->mtlLibrary =
- device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+ request.pipeline->source_md5 = device->source_md5[pso_type];
+ request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
request.pipeline->device_kernel = device_kernel;
request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
@@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
{
thread_scoped_lock lock(cache_mutex);
- pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
+ auto &collection = pipelines[device_kernel];
+
+ /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
+ int max_entries_of_same_pso_type = 3;
+ for (int i = (int)collection.size() - 1; i >= 0; i--) {
+ if (collection[i]->pso_type == pso_type) {
+ max_entries_of_same_pso_type -= 1;
+ if (max_entries_of_same_pso_type == 0) {
+ metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
+ kernel_type_as_string(pso_type),
+ device_kernel_as_string(device_kernel));
+ collection.erase(collection.begin() + i);
+ break;
+ }
+ }
+ }
+
+ collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
request_queue.push_back(request);
}
cond_var.notify_one();
@@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
continue;
}
- if (pipeline->scene_specialized) {
- if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+ if (pipeline->pso_type != PSO_GENERIC) {
+ if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+ pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
best_pipeline = pipeline.get();
}
}
@@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
}
}
+ if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
+ metal_printf("Swapping in %s version of %s\n",
+ kernel_type_as_string(best_pipeline->pso_type),
+ device_kernel_as_string(kernel));
+ }
+ best_pipeline->usage_count += 1;
+
return best_pipeline;
}
-void MetalKernelPipeline::compile()
+bool MetalKernelPipeline::should_use_binary_archive() const
{
- int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
+ if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+ if (atoi(str) != 0) {
+ /* Don't archive if we have opted out by env var. */
+ return false;
+ }
+ }
+
+ if (pso_type == PSO_GENERIC) {
+ /* Archive the generic kernels. */
+ return true;
+ }
+
+ if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
+ device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Archive all shade kernels - they take a long time to compile. */
+ return true;
+ }
+
+ /* The remaining kernels are all fast to compile. They may get cached by the system shader cache,
+ * but will be quick to regenerate if not. */
+ return false;
+}
+
+static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
+{
+ MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];
+
+ MTLDataType MTLDataType_int = MTLDataTypeInt;
+ MTLDataType MTLDataType_float = MTLDataTypeFloat;
+ MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
+ KernelData zero_data = {0};
+ if (!data) {
+ data = &zero_data;
+ }
+# define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+ [constant_values setConstantValue:&data->parent.name \
+ type:MTLDataType_##_type \
+ atIndex:KernelData_##parent##_##name];
+
+# include "kernel/data_template.h"
+
+ return constant_values;
+}
+
+void MetalKernelPipeline::compile()
+{
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string(device_kernel);
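Note: GetConstantValues above re-expands the KernelData member list from kernel/data_template.h with its own KERNEL_STRUCT_MEMBER definition, turning each member into one setConstantValue:type:atIndex: call. A self-contained C++ sketch of that X-macro mechanism, using a made-up two-member template rather than the real data_template.h:

    #include <cstdio>

    /* Stand-in for kernel/data_template.h: one entry per KernelData member. */
    #define EXAMPLE_DATA_TEMPLATE \
      KERNEL_STRUCT_MEMBER(film, float, exposure) \
      KERNEL_STRUCT_MEMBER(film, int, pass_stride)

    /* Expansion 1: declare the struct members. */
    struct KernelFilmExample {
    #define KERNEL_STRUCT_MEMBER(parent, type, name) type name;
      EXAMPLE_DATA_TEMPLATE
    #undef KERNEL_STRUCT_MEMBER
    };

    /* Expansion 2: emit one "bake constant" action per member, analogous to the
     * setConstantValue calls in GetConstantValues. */
    static void bake_constants(const KernelFilmExample &film)
    {
    #define KERNEL_STRUCT_MEMBER(parent, type, name) \
      std::printf("bake %s.%s = %g\n", #parent, #name, (double)film.name);
      EXAMPLE_DATA_TEMPLATE
    #undef KERNEL_STRUCT_MEMBER
    }

    int main()
    {
      bake_constants({1.0f, 16});
      return 0;
    }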
@@ -281,6 +371,17 @@ void MetalKernelPipeline::compile()
if (@available(macOS 11.0, *)) {
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
func_desc.name = entryPoint;
+
+ if (pso_type == PSO_SPECIALIZED_SHADE) {
+ func_desc.constantValues = GetConstantValues(&kernel_data_);
+ }
+ else if (pso_type == PSO_SPECIALIZED_INTERSECT) {
+ func_desc.constantValues = GetConstantValues(&kernel_data_);
+ }
+ else {
+ func_desc.constantValues = GetConstantValues();
+ }
+
function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
}
@@ -427,10 +528,7 @@ void MetalKernelPipeline::compile()
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
- bool use_binary_archive = true;
- if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
- use_binary_archive = (atoi(str) == 0);
- }
+ bool use_binary_archive = should_use_binary_archive();
id<MTLBinaryArchive> archive = nil;
string metalbin_path;
@@ -608,19 +706,32 @@ void MetalKernelPipeline::compile()
}
}
-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
{
+ const double starttime = time_dt();
auto shader_cache = get_shader_cache(device->mtlDevice);
for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
- shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
+ shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
}
- if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
- shader_cache->wait_for_all();
- }
+ shader_cache->wait_for_all();
+ metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+ time_dt() - starttime,
+ kernel_type_as_string(pso_type));
return true;
}
+bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
+{
+ auto shader_cache = get_shader_cache(device->mtlDevice);
+ for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
+ if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
+ return true;
+ }
+ }
+ return false;
+}
+
const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
DeviceKernel kernel)
{
diff --git a/intern/cycles/device/metal/queue.h b/intern/cycles/device/metal/queue.h
index b0bd487c86d..fc32740f3e1 100644
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states() const override;
+ virtual int num_sort_partition_elements() const override;
virtual void init_execution() override;
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 03e60b6bb6e..5ac63a16c61 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,6 +293,11 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
return result;
}
+int MetalDeviceQueue::num_sort_partition_elements() const
+{
+ return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
+}
+
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@@ -359,7 +364,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
/* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
- sizeof(IntegratorStateGPU);
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
(uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
@@ -416,7 +421,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
- sizeof(IntegratorStateGPU);
+ offsetof(IntegratorStateGPU, sort_partition_divisor);
for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
int pointer_index = int(offset / sizeof(device_ptr));
MetalDevice::MetalMem *mmem = *(
diff --git a/intern/cycles/device/metal/util.h b/intern/cycles/device/metal/util.h
index fd32d8a260f..a988d01d361 100644
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -37,6 +37,7 @@ struct MetalInfo {
static int get_apple_gpu_core_count(id<MTLDevice> device);
static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
+ static int optimal_sort_partition_elements(id<MTLDevice> device);
static string get_device_name(id<MTLDevice> device);
};
diff --git a/intern/cycles/device/metal/util.mm b/intern/cycles/device/metal/util.mm
index a7a5b596b8f..65c67c400fe 100644
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -72,6 +72,21 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_UNKNOWN;
}
+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+ if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+ return atoi(str);
+ }
+
+ /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+ * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+ * overall render time speedup of up to 15%. */
+ if (get_device_vendor(device) == METAL_GPU_APPLE) {
+ return 65536;
+ }
+ return 0;
+}
+
vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
{
static vector<id<MTLDevice>> usable_devices;
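Note: the 65536-element figure above drives the partitioned shader sort described in the comment: active path indices are grouped into fixed-size chunks and only ordered by material within each chunk, trading some sort coherence for integrator-state locality. A rough stand-alone illustration of that ordering; the key computation is a hypothetical sketch, not the actual Cycles sort kernel:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    /* Sort path indices by shader, but only within fixed-size partitions, so
     * that sorted neighbours stay close to their original state slots. */
    void partitioned_sort(std::vector<uint32_t> &indices,
                          const std::vector<uint32_t> &shader_of_index,
                          uint32_t partition_elements)
    {
      auto key = [&](uint32_t i) -> uint64_t {
        const uint64_t partition = i / partition_elements; /* coarse locality bucket */
        return (partition << 32) | shader_of_index[i];     /* then order by shader */
      };
      std::sort(indices.begin(), indices.end(),
                [&](uint32_t a, uint32_t b) { return key(a) < key(b); });
    }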
diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp
index b6f0f0c2b42..8056c204188 100644
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -76,7 +76,7 @@ bool device_oneapi_init()
/* NOTE(@nsirgien): we need to enable JIT cache from here and
* right now this cache policy is controlled by env. variables. */
/* NOTE(hallade) we also disable use of copy engine as it
- * improves stability as of intel/llvm sycl-nightly/20220529.
+ * improves stability as of intel/LLVM SYCL-nightly/20220529.
* All these env variable can be set beforehand by end-users and
* will in that case -not- be overwritten. */
# ifdef _WIN32
@@ -89,6 +89,9 @@ bool device_oneapi_init()
if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
_putenv_s("SYCL_DEVICE_FILTER", "host,level_zero");
}
+ if (getenv("SYCL_ENABLE_PCI") == nullptr) {
+ _putenv_s("SYCL_ENABLE_PCI", "1");
+ }
if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) {
_putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0");
}
@@ -96,6 +99,7 @@ bool device_oneapi_init()
setenv("SYCL_CACHE_PERSISTENT", "1", false);
setenv("SYCL_CACHE_THRESHOLD", "0", false);
setenv("SYCL_DEVICE_FILTER", "host,level_zero", false);
+ setenv("SYCL_ENABLE_PCI", "1", false);
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
# endif
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index 8c8ab522b47..0c0afd1d2df 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -35,7 +35,7 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info,
oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
- /* Oneapi calls should be initialised on this moment. */
+ /* OneAPI calls should be initialized at this point. */
assert(oneapi_dll_.oneapi_create_queue != nullptr);
bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num);
@@ -93,7 +93,7 @@ BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
bool OneapiDevice::load_kernels(const uint requested_features)
{
assert(device_queue_);
- /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with sertain feature set
+ /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with certain feature set
* with specialization constants, but it hasn't been implemented yet. */
(void)requested_features;
diff --git a/intern/cycles/device/oneapi/dll_interface.h b/intern/cycles/device/oneapi/dll_interface.h
index bc681ff8f64..0a888194e98 100644
--- a/intern/cycles/device/oneapi/dll_interface.h
+++ b/intern/cycles/device/oneapi/dll_interface.h
@@ -3,7 +3,7 @@
#pragma once
-/* Include kernel header to get access to sycl-specific types, like SyclQueue and
+/* Include kernel header to get access to SYCL-specific types, like SyclQueue and
* OneAPIDeviceIteratorCallback. */
#include "kernel/device/oneapi/kernel.h"
diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h
index 09a015303b6..716cbfdc88c 100644
--- a/intern/cycles/device/oneapi/queue.h
+++ b/intern/cycles/device/oneapi/queue.h
@@ -17,7 +17,7 @@ CCL_NAMESPACE_BEGIN
class OneapiDevice;
class device_memory;
-/* Base class for Oneapi queues. */
+/* Base class for OneAPI queues. */
class OneapiDeviceQueue : public DeviceQueue {
public:
explicit OneapiDeviceQueue(OneapiDevice *device);
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index e7dcc29a2da..151983667c0 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -26,7 +26,6 @@
# include "util/task.h"
# include "util/time.h"
-# undef __KERNEL_CPU__
# define __KERNEL_OPTIX__
# include "kernel/device/optix/globals.h"
@@ -2047,7 +2046,7 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
/* Update traversable handle (since it is different for each device on multi devices). */
KernelData *const data = (KernelData *)host;
- *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+ *(OptixTraversableHandle *)&data->device_bvh = tlas_handle;
update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
return;
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 366bf95269d..f0d49ad6f6c 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -8,7 +8,6 @@
# include "util/time.h"
-# undef __KERNEL_CPU__
# define __KERNEL_OPTIX__
# include "kernel/device/optix/globals.h"
diff --git a/intern/cycles/device/queue.h b/intern/cycles/device/queue.h
index 14a5db3a204..808431af401 100644
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -105,6 +105,13 @@ class DeviceQueue {
* value. */
virtual int num_concurrent_busy_states() const = 0;
+ /* Number of elements in a partition of sorted shaders; this improves memory locality of
+ * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
+ virtual int num_sort_partition_elements() const
+ {
+ return 65536;
+ }
+
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.