1 files changed, 166 insertions, 55 deletions
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index fec4cd80466..385cb412b06 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -5,6 +5,7 @@
 
 #  include "device/metal/kernel.h"
 #  include "device/metal/device_impl.h"
+#  include "kernel/device/metal/function_constants.h"
 #  include "util/md5.h"
 #  include "util/path.h"
 #  include "util/tbb.h"
@@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN
 /* limit to 2 MTLCompiler instances */
 int max_mtlcompiler_threads = 2;
 
-const char *kernel_type_as_string(int kernel_type)
+const char *kernel_type_as_string(MetalPipelineType pso_type)
 {
-  switch (kernel_type) {
+  switch (pso_type) {
     case PSO_GENERIC:
       return "PSO_GENERIC";
-    case PSO_SPECIALISED:
-      return "PSO_SPECIALISED";
+    case PSO_SPECIALIZED_INTERSECT:
+      return "PSO_SPECIALIZED_INTERSECT";
+    case PSO_SPECIALIZED_SHADE:
+      return "PSO_SPECIALIZED_SHADE";
     default:
       assert(0);
   }
@@ -50,7 +53,11 @@ struct ShaderCache {
 
   /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
    * device. */
-  void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
+  void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
+
+  bool should_load_kernel(DeviceKernel device_kernel,
+                          MetalDevice *device,
+                          MetalPipelineType pso_type);
 
   void wait_for_all();
 
@@ -139,31 +146,34 @@ void ShaderCache::compile_thread_func(int thread_index)
   }
 }
 
-void ShaderCache::load_kernel(DeviceKernel device_kernel,
-                              MetalDevice *device,
-                              bool scene_specialized)
+bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
+                                     MetalDevice *device,
+                                     MetalPipelineType pso_type)
 {
-  {
-    /* create compiler threads on first run */
-    thread_scoped_lock lock(cache_mutex);
-    if (compile_threads.empty()) {
-      running = true;
-      for (int i = 0; i < max_mtlcompiler_threads; i++) {
-        compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
-      }
-    }
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+    /* Skip megakernel. */
+    return false;
   }
 
-  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-    /* skip megakernel */
-    return;
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+    if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
+      /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
+      return false;
+    }
   }
 
-  if (scene_specialized) {
+  if (pso_type != PSO_GENERIC) {
     /* Only specialize kernels where it can make an impact. */
     if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
         device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-      return;
+      return false;
+    }
+
+    /* Only specialize shading / intersection kernels as requested. */
+    bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
+    if (is_shade_pso != is_shade_kernel) {
+      return false;
     }
   }
 
@@ -171,35 +181,45 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
     /* check whether the kernel has already been requested / cached */
     thread_scoped_lock lock(cache_mutex);
     for (auto &pipeline : pipelines[device_kernel]) {
-      if (scene_specialized) {
-        if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
-          /* we already requested a pipeline that is specialized for this kernel data */
-          metal_printf("Specialized kernel already requested (%s)\n",
-                       device_kernel_as_string(device_kernel));
-          return;
-        }
+      if (pipeline->source_md5 == device->source_md5[pso_type]) {
+        return false;
       }
-      else {
-        if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
-          /* we already requested a generic pipeline for this kernel */
-          metal_printf("Generic kernel already requested (%s)\n",
-                       device_kernel_as_string(device_kernel));
-          return;
-        }
+    }
+  }
+
+  return true;
+}
+
+void ShaderCache::load_kernel(DeviceKernel device_kernel,
+                              MetalDevice *device,
+                              MetalPipelineType pso_type)
+{
+  {
+    /* create compiler threads on first run */
+    thread_scoped_lock lock(cache_mutex);
+    if (compile_threads.empty()) {
+      running = true;
+      for (int i = 0; i < max_mtlcompiler_threads; i++) {
+        compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
       }
     }
   }
 
+  if (!should_load_kernel(device_kernel, device, pso_type)) {
+    return;
+  }
+
   incomplete_requests++;
 
   PipelineRequest request;
   request.pipeline = new MetalKernelPipeline;
-  request.pipeline->scene_specialized = scene_specialized;
+  memcpy(&request.pipeline->kernel_data_,
+         &device->launch_params.data,
+         sizeof(request.pipeline->kernel_data_));
+  request.pipeline->pso_type = pso_type;
   request.pipeline->mtlDevice = mtlDevice;
-  request.pipeline->source_md5 =
-      device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
-  request.pipeline->mtlLibrary =
-      device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+  request.pipeline->source_md5 = device->source_md5[pso_type];
+  request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
   request.pipeline->device_kernel = device_kernel;
   request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
 
@@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
 
   {
     thread_scoped_lock lock(cache_mutex);
-    pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
+    auto &collection = pipelines[device_kernel];
+
+    /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
+    int max_entries_of_same_pso_type = 3;
+    for (int i = (int)collection.size() - 1; i >= 0; i--) {
+      if (collection[i]->pso_type == pso_type) {
+        max_entries_of_same_pso_type -= 1;
+        if (max_entries_of_same_pso_type == 0) {
+          metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
+                       kernel_type_as_string(pso_type),
+                       device_kernel_as_string(device_kernel));
+          collection.erase(collection.begin() + i);
+          break;
+        }
+      }
+    }
+
+    collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
     request_queue.push_back(request);
   }
   cond_var.notify_one();
@@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
       continue;
     }
 
-    if (pipeline->scene_specialized) {
-      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+    if (pipeline->pso_type != PSO_GENERIC) {
+      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+          pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
         best_pipeline = pipeline.get();
       }
     }
@@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
     }
   }
 
+  if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
+    metal_printf("Swapping in %s version of %s\n",
+                 kernel_type_as_string(best_pipeline->pso_type),
+                 device_kernel_as_string(kernel));
+  }
+  best_pipeline->usage_count += 1;
+
   return best_pipeline;
 }
 
-void MetalKernelPipeline::compile()
+bool MetalKernelPipeline::should_use_binary_archive() const
 {
-  int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
+  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+    if (atoi(str) != 0) {
+      /* Don't archive if we have opted out by env var. */
+      return false;
+    }
+  }
+
+  if (pso_type == PSO_GENERIC) {
+    /* Archive the generic kernels. */
+    return true;
+  }
+
+  if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
+      device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+    /* Archive all shade kernels - they take a long time to compile. */
+    return true;
+  }
+
+  /* The remaining kernels are all fast to compile. They may get cached by the system shader cache,
+   * but will be quick to regenerate if not. */
+  return false;
+}
+
+static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
+{
+  MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];
+
+  MTLDataType MTLDataType_int = MTLDataTypeInt;
+  MTLDataType MTLDataType_float = MTLDataTypeFloat;
+  MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
+  KernelData zero_data = {0};
+  if (!data) {
+    data = &zero_data;
+  }
 
+#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+    [constant_values setConstantValue:&data->parent.name \
+                                 type:MTLDataType_##_type \
+                              atIndex:KernelData_##parent##_##name];
+
+#  include "kernel/data_template.h"
+
+  return constant_values;
+}
+
+void MetalKernelPipeline::compile()
+{
   const std::string function_name = std::string("cycles_metal_") +
                                     device_kernel_as_string(device_kernel);
 
@@ -281,6 +371,17 @@ void MetalKernelPipeline::compile()
   if (@available(macOS 11.0, *)) {
     MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
     func_desc.name = entryPoint;
+
+    if (pso_type == PSO_SPECIALIZED_SHADE) {
+      func_desc.constantValues = GetConstantValues(&kernel_data_);
+    }
+    else if (pso_type == PSO_SPECIALIZED_INTERSECT) {
+      func_desc.constantValues = GetConstantValues(&kernel_data_);
+    }
+    else {
+      func_desc.constantValues = GetConstantValues();
+    }
+
     function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
   }
 
@@ -427,10 +528,7 @@ void MetalKernelPipeline::compile()
 
   MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
 
-  bool use_binary_archive = true;
-  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
-    use_binary_archive = (atoi(str) == 0);
-  }
+  bool use_binary_archive = should_use_binary_archive();
 
   id<MTLBinaryArchive> archive = nil;
   string metalbin_path;
@@ -608,19 +706,32 @@ void MetalKernelPipeline::compile()
   }
 }
 
-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
 {
+  const double starttime = time_dt();
   auto shader_cache = get_shader_cache(device->mtlDevice);
   for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
-    shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
+    shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
   }
 
-  if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
-    shader_cache->wait_for_all();
-  }
+  shader_cache->wait_for_all();
+  metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+               time_dt() - starttime,
+               kernel_type_as_string(pso_type));
   return true;
 }
 
+bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
+{
+  auto shader_cache = get_shader_cache(device->mtlDevice);
+  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
+    if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
                                                                  DeviceKernel kernel)
 {