31 files changed, 1226 insertions, 623 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 9b4799d252f..9acc9e99ad0 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -24,12 +24,19 @@ from . import camera
 
 enum_devices = (
     ('CPU', "CPU", "Use CPU for rendering"),
-    ('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in the system tab in the user preferences"),
+    ('GPU', "GPU Compute",
+        "Use GPU compute device for rendering, configured in the system tab in the user preferences"),
 )
 
 enum_feature_set = (
-    ('SUPPORTED', "Supported", "Only use finished and supported features"),
-    ('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1),
+    ('SUPPORTED',
+     "Supported",
+     "Only use finished and supported features"),
+    ('EXPERIMENTAL',
+     "Experimental",
+     "Use experimental and incomplete features that might be broken or change in the future",
+     'ERROR',
+     1),
 )
 
 enum_displacement_methods = (
@@ -81,9 +88,14 @@ enum_sampling_pattern = (
 )
 
 enum_volume_sampling = (
-    ('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"),
-    ('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"),
-    ('MULTIPLE_IMPORTANCE', "Multiple Importance",
+    ('DISTANCE',
+     "Distance",
+     "Use distance sampling, best for dense volumes with lights far away"),
+    ('EQUIANGULAR',
+     "Equiangular",
+     "Use equiangular sampling, best for volumes with low density with light inside or near the volume"),
+    ('MULTIPLE_IMPORTANCE',
+     "Multiple Importance",
      "Combine distance and equi-angular sampling for volumes where neither method is ideal"),
 )
 
@@ -93,10 +105,15 @@ enum_volume_interpolation = (
 )
 
 enum_world_mis = (
-    ('NONE', "None", "Don't sample the background, faster but might cause noise for non-solid backgrounds"),
-    ('AUTOMATIC', "Auto", "Automatically try to determine the best setting"),
-    ('MANUAL', "Manual", "Manually set the resolution of the sampling map, higher values are slower and require more memory but reduce noise")
-)
+    ('NONE',
+     "None",
+     "Don't sample the background, faster but might cause noise for non-solid backgrounds"),
+    ('AUTOMATIC',
+     "Auto",
+     "Automatically try to determine the best setting"),
+    ('MANUAL',
+     "Manual",
+     "Manually set the resolution of the sampling map, higher values are slower and require more memory but reduce noise"))
 
 enum_device_type = (
     ('CPU', "CPU", "CPU", 0),
@@ -210,17 +227,33 @@ enum_denoising_input_passes = (
 )
 
 enum_denoising_prefilter = (
-    ('NONE', "None", "No prefiltering, use when guiding passes are noise-free", 1),
-    ('FAST', "Fast", "Denoise color and guiding passes together. Improves quality when guiding passes are noisy using least amount of extra processing time", 2),
-    ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy using extra processing time", 3),
+    ('NONE',
+     "None",
+     "No prefiltering, use when guiding passes are noise-free",
+     1),
+    ('FAST',
+     "Fast",
+     "Denoise color and guiding passes together. Improves quality when guiding passes are noisy using least amount of extra processing time",
+     2),
+    ('ACCURATE',
+     "Accurate",
+     "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy using extra processing time",
+     3),
 )
 
 enum_direct_light_sampling_type = (
-    ('MULTIPLE_IMPORTANCE_SAMPLING', "Multiple Importance Sampling",
-     "Multiple importance sampling is used to combine direct light contributions from next-event estimation and forward path tracing", 0),
-    ('FORWARD_PATH_TRACING', "Forward Path Tracing", "Direct light contributions are only sampled using forward path tracing", 1),
-    ('NEXT_EVENT_ESTIMATION', "Next-Event Estimation",
-     "Direct light contributions are only sampled using next-event estimation", 2),
+    ('MULTIPLE_IMPORTANCE_SAMPLING',
+     "Multiple Importance Sampling",
+     "Multiple importance sampling is used to combine direct light contributions from next-event estimation and forward path tracing",
+     0),
+    ('FORWARD_PATH_TRACING',
+     "Forward Path Tracing",
+     "Direct light contributions are only sampled using forward path tracing",
+     1),
+    ('NEXT_EVENT_ESTIMATION',
+     "Next-Event Estimation",
+     "Direct light contributions are only sampled using next-event estimation",
+     2),
 )
 
 
@@ -357,7 +390,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     scrambling_distance: FloatProperty(
         name="Scrambling Distance",
         default=1.0,
-        min=0.0, soft_max=1.0,
+        min=0.0,
+        soft_max=1.0,
         description="Reduce randomization between pixels to improve GPU rendering performance, at the cost of possible rendering artifacts if set too low",
     )
     preview_scrambling_distance: BoolProperty(
@@ -383,7 +417,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         name="Light Sampling Threshold",
         description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). "
         "Zero disables the test and never ignores lights",
-        min=0.0, max=1.0,
+        min=0.0,
+        max=1.0,
         default=0.01,
     )
 
@@ -395,7 +430,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     adaptive_threshold: FloatProperty(
         name="Adaptive Sampling Threshold",
         description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples",
-        min=0.0, max=1.0,
+        min=0.0,
+        max=1.0,
         soft_min=0.001,
         default=0.01,
         precision=4,
@@ -403,7 +439,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     adaptive_min_samples: IntProperty(
         name="Adaptive Min Samples",
         description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold",
-        min=0, max=4096,
+        min=0,
+        max=4096,
         default=0,
     )
 
@@ -415,7 +452,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     preview_adaptive_threshold: FloatProperty(
         name="Adaptive Sampling Threshold",
         description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples, for viewport renders",
-        min=0.0, max=1.0,
+        min=0.0,
+        max=1.0,
         soft_min=0.001,
         default=0.1,
         precision=4,
@@ -423,7 +461,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     preview_adaptive_min_samples: IntProperty(
         name="Adaptive Min Samples",
         description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold, for viewport renders",
-        min=0, max=4096,
+        min=0,
+        max=4096,
         default=0,
     )
 
@@ -550,7 +589,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     max_subdivisions: IntProperty(
         name="Max Subdivisions",
         description="Stop subdividing when this level is reached even if the dice rate would produce finer tessellation",
-        min=0, max=16,
+        min=0,
+        max=16,
         default=12,
     )
 
@@ -817,8 +857,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     debug_use_optix_debug: BoolProperty(
         name="OptiX Module Debug",
         description="Load OptiX module in debug mode: lower logging verbosity level, enable validations, and lower optimization level",
-        default=False
-    )
+        default=False)
 
     @classmethod
     def register(cls):
@@ -1185,7 +1224,8 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
     motion_steps: IntProperty(
         name="Motion Steps",
         description="Control accuracy of motion blur, more steps gives more memory usage (actual number of steps is 2^(steps - 1))",
-        min=1, max=7,
+        min=1,
+        max=7,
         default=1,
     )
 
@@ -1224,7 +1264,8 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
     shadow_terminator_geometry_offset: FloatProperty(
         name="Shadow Terminator Geometry Offset",
         description="Offset rays from the surface to reduce shadow terminator artifact on low poly geometry. Only affects triangles at grazing angles to light",
-        min=0.0, max=1.0,
+        min=0.0,
+        max=1.0,
         default=0.1,
     )
 
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 886f5345020..9d2dbdf6732 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -1082,8 +1082,18 @@ class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
 
 
 def has_geometry_visibility(ob):
-    return ob and ((ob.type in {'MESH', 'CURVE', 'SURFACE', 'FONT', 'META', 'LIGHT', 'VOLUME', 'POINTCLOUD', 'CURVES'}) or
-                   (ob.instance_type == 'COLLECTION' and ob.instance_collection))
+    return ob and (
+        (ob.type in {
+            'MESH',
+            'CURVE',
+            'SURFACE',
+            'FONT',
+            'META',
+            'LIGHT',
+            'VOLUME',
+            'POINTCLOUD',
+            'CURVES',
+        }) or (ob.instance_type == 'COLLECTION' and ob.instance_collection))
 
 
 class CYCLES_OBJECT_PT_shading(CyclesButtonsPanel, Panel):
diff --git a/intern/cycles/blender/image.cpp b/intern/cycles/blender/image.cpp
index ca4c8f5904a..e01b72c1653 100644
--- a/intern/cycles/blender/image.cpp
+++ b/intern/cycles/blender/image.cpp
@@ -13,9 +13,11 @@ CCL_NAMESPACE_BEGIN
 
 BlenderImageLoader::BlenderImageLoader(BL::Image b_image,
                                        const int frame,
+                                       const int tile_number,
                                        const bool is_preview_render)
     : b_image(b_image),
       frame(frame),
+      tile_number(tile_number),
       /* Don't free cache for preview render to avoid race condition from T93560, to be fixed
          properly later as we are close to release. */
       free_cache(!is_preview_render && !b_image.has_data())
@@ -66,12 +68,11 @@ bool BlenderImageLoader::load_pixels(const ImageMetaData &metadata,
 {
   const size_t num_pixels = ((size_t)metadata.width) * metadata.height;
   const int channels = metadata.channels;
-  const int tile = 0; /* TODO(lukas): Support tiles here? */
 
   if (b_image.is_float()) {
     /* image data */
     float *image_pixels;
-    image_pixels = image_get_float_pixels_for_frame(b_image, frame, tile);
+    image_pixels = image_get_float_pixels_for_frame(b_image, frame, tile_number);
 
     if (image_pixels && num_pixels * channels == pixels_size) {
       memcpy(pixels, image_pixels, pixels_size * sizeof(float));
@@ -99,7 +100,7 @@ bool BlenderImageLoader::load_pixels(const ImageMetaData &metadata,
     }
   }
   else {
-    unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame, tile);
+    unsigned char *image_pixels = image_get_pixels_for_frame(b_image, frame, tile_number);
 
     if (image_pixels && num_pixels * channels == pixels_size) {
       memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
@@ -153,7 +154,13 @@ string BlenderImageLoader::name() const
 bool BlenderImageLoader::equals(const ImageLoader &other) const
 {
   const BlenderImageLoader &other_loader = (const BlenderImageLoader &)other;
-  return b_image == other_loader.b_image && frame == other_loader.frame;
+  return b_image == other_loader.b_image && frame == other_loader.frame &&
+         tile_number == other_loader.tile_number;
+}
+
+int BlenderImageLoader::get_tile_number() const
+{
+  return tile_number;
 }
 
 /* Point Density */
diff --git a/intern/cycles/blender/image.h b/intern/cycles/blender/image.h
index ee576b31f7e..c2cc0f51b9b 100644
--- a/intern/cycles/blender/image.h
+++ b/intern/cycles/blender/image.h
@@ -12,7 +12,10 @@ CCL_NAMESPACE_BEGIN
 
 class BlenderImageLoader : public ImageLoader {
  public:
-  BlenderImageLoader(BL::Image b_image, const int frame, const bool is_preview_render);
+  BlenderImageLoader(BL::Image b_image,
+                     const int frame,
+                     const int tile_number,
+                     const bool is_preview_render);
 
   bool load_metadata(const ImageDeviceFeatures &features, ImageMetaData &metadata) override;
   bool load_pixels(const ImageMetaData &metadata,
@@ -22,8 +25,11 @@ class BlenderImageLoader : public ImageLoader {
   string name() const override;
   bool equals(const ImageLoader &other) const override;
 
+  int get_tile_number() const override;
+
   BL::Image b_image;
   int frame;
+  int tile_number;
   bool free_cache;
 };
 
diff --git a/intern/cycles/blender/mesh.cpp b/intern/cycles/blender/mesh.cpp
index de67e27923d..c76ce3801d4 100644
--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -316,7 +316,7 @@ static void fill_generic_attribute(BL::Mesh &b_mesh,
       break;
     }
     case BL::Attribute::domain_EDGE: {
-      /* Averge edge attributes at vertices. */
+      /* Average edge attributes at vertices. */
       const size_t num_verts = b_mesh.vertices.length();
       vector<int> count(num_verts, 0);
 
diff --git a/intern/cycles/blender/shader.cpp b/intern/cycles/blender/shader.cpp
index d3527567b96..81a64457c88 100644
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -355,6 +355,18 @@ static ShaderNode *add_node(Scene *scene,
   else if (b_node.is_a(&RNA_ShaderNodeCombineHSV)) {
     node = graph->create_node<CombineHSVNode>();
   }
+  else if (b_node.is_a(&RNA_ShaderNodeSeparateColor)) {
+    BL::ShaderNodeSeparateColor b_separate_node(b_node);
+    SeparateColorNode *separate_node = graph->create_node<SeparateColorNode>();
+    separate_node->set_color_type((NodeCombSepColorType)b_separate_node.mode());
+    node = separate_node;
+  }
+  else if (b_node.is_a(&RNA_ShaderNodeCombineColor)) {
+    BL::ShaderNodeCombineColor b_combine_node(b_node);
+    CombineColorNode *combine_node = graph->create_node<CombineColorNode>();
+    combine_node->set_color_type((NodeCombSepColorType)b_combine_node.mode());
+    node = combine_node;
+  }
   else if (b_node.is_a(&RNA_ShaderNodeSeparateXYZ)) {
     node = graph->create_node<SeparateXYZNode>();
   }
@@ -764,9 +776,21 @@ static ShaderNode *add_node(Scene *scene,
          */
         int scene_frame = b_scene.frame_current();
         int image_frame = image_user_frame_number(b_image_user, b_image, scene_frame);
-        image->handle = scene->image_manager->add_image(
-            new BlenderImageLoader(b_image, image_frame, b_engine.is_preview()),
-            image->image_params());
+        if (b_image.source() != BL::Image::source_TILED) {
+          image->handle = scene->image_manager->add_image(
+              new BlenderImageLoader(b_image, image_frame, 0, b_engine.is_preview()),
+              image->image_params());
+        }
+        else {
+          vector<ImageLoader *> loaders;
+          loaders.reserve(image->get_tiles().size());
+          for (int tile_number : image->get_tiles()) {
+            loaders.push_back(
+                new BlenderImageLoader(b_image, image_frame, tile_number, b_engine.is_preview()));
+          }
+
+          image->handle = scene->image_manager->add_image(loaders, image->image_params());
+        }
       }
       else {
         ustring filename = ustring(
@@ -802,7 +826,7 @@ static ShaderNode *add_node(Scene *scene,
         int scene_frame = b_scene.frame_current();
         int image_frame = image_user_frame_number(b_image_user, b_image, scene_frame);
         env->handle = scene->image_manager->add_image(
-            new BlenderImageLoader(b_image, image_frame, b_engine.is_preview()),
+            new BlenderImageLoader(b_image, image_frame, 0, b_engine.is_preview()),
             env->image_params());
       }
       else {
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index 27c58ce6d2f..7506b9b069f 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -28,7 +28,8 @@ class MetalDevice : public Device {
   id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
   id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
       nil; /* encoder used for fetching device pointers from MTLBuffers */
-  string source_used_for_compile[PSO_NUM];
+  string source[PSO_NUM];
+  string source_md5[PSO_NUM];
 
   KernelParamsMetal launch_params = {0};
 
@@ -72,7 +73,6 @@ class MetalDevice : public Device {
   id<MTLBuffer> texture_bindings_3d = nil;
   std::vector<id<MTLTexture>> texture_slot_map;
 
-  MetalDeviceKernels kernels;
   bool use_metalrt = false;
   bool use_function_specialisation = false;
 
@@ -110,6 +110,8 @@ class MetalDevice : public Device {
 
   virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
 
+  id<MTLLibrary> compile(string const &source);
+
   /* ------------------------------------------------------------------ */
   /* low-level memory management */
 
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index c01f51fb506..e1438a9d6e2 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -275,96 +275,44 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
    * active, but may still need to be rendered without motion blur if that isn't active as well. */
   motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
 
-  NSError *error = NULL;
+  source[PSO_GENERIC] = get_source(kernel_features);
+  mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
 
-  for (int i = 0; i < PSO_NUM; i++) {
-    if (mtlLibrary[i]) {
-      [mtlLibrary[i] release];
-      mtlLibrary[i] = nil;
-    }
-  }
+  MD5Hash md5;
+  md5.append(source[PSO_GENERIC]);
+  source_md5[PSO_GENERIC] = md5.get_hex();
+
+  metal_printf("Front-end compilation finished (generic)\n");
+
+  bool result = MetalDeviceKernels::load(this, false);
+
+  reserve_local_memory(kernel_features);
+
+  return result;
+}
 
+id<MTLLibrary> MetalDevice::compile(string const &source)
+{
   MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
 
   options.fastMathEnabled = YES;
   if (@available(macOS 12.0, *)) {
     options.languageVersion = MTLLanguageVersion2_4;
   }
-  else {
-    return false;
-  }
 
-  string metalsrc;
-
-  /* local helper: dump source to disk and return filepath */
-  auto dump_source = [&](int kernel_type) -> string {
-    string &source = source_used_for_compile[kernel_type];
-    string metalsrc = path_cache_get(path_join("kernels",
-                                               string_printf("%s.%s.metal",
-                                                             kernel_type_as_string(kernel_type),
-                                                             util_md5_string(source).c_str())));
-    path_write_text(metalsrc, source);
-    return metalsrc;
-  };
-
-  /* local helper: fetch the kernel source code, adjust it for specific PSO_.. kernel_type flavor,
-   * then compile it into a MTLLibrary */
-  auto fetch_and_compile_source = [&](int kernel_type) {
-    /* Record the source used to compile this library, for hash building later. */
-    string &source = source_used_for_compile[kernel_type];
-
-    switch (kernel_type) {
-      case PSO_GENERIC: {
-        source = get_source(kernel_features);
-        break;
-      }
-      case PSO_SPECIALISED: {
-        /* PSO_SPECIALISED derives from PSO_GENERIC */
-        string &generic_source = source_used_for_compile[PSO_GENERIC];
-        if (generic_source.empty()) {
-          generic_source = get_source(kernel_features);
-        }
-        source = "#define __KERNEL_METAL_USE_FUNCTION_SPECIALISATION__\n" + generic_source;
-        break;
-      }
-      default:
-        assert(0);
-    }
-
-    /* create MTLLibrary (front-end compilation) */
-    mtlLibrary[kernel_type] = [mtlDevice newLibraryWithSource:@(source.c_str())
+  NSError *error = NULL;
+  id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                       options:options
                                                         error:&error];
 
-    bool do_source_dump = (getenv("CYCLES_METAL_DUMP_SOURCE") != nullptr);
-
-    if (!mtlLibrary[kernel_type] || do_source_dump) {
-      string metalsrc = dump_source(kernel_type);
-
-      if (!mtlLibrary[kernel_type]) {
-        NSString *err = [error localizedDescription];
-        set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
-
-        return false;
-      }
-    }
-    return true;
-  };
-
-  fetch_and_compile_source(PSO_GENERIC);
-
-  if (use_function_specialisation) {
-    fetch_and_compile_source(PSO_SPECIALISED);
+  if (!mtlLibrary) {
+    NSString *err = [error localizedDescription];
+    set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
   }
 
-  metal_printf("Front-end compilation finished\n");
-
-  bool result = kernels.load(this, PSO_GENERIC);
-
   [options release];
-  reserve_local_memory(kernel_features);
 
-  return result;
+  return mtlLibrary;
 }
 
 void MetalDevice::reserve_local_memory(const uint kernel_features)
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index b12491d820d..69b2a686ecc 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -54,103 +54,41 @@ enum {
 const char *kernel_type_as_string(int kernel_type);
 
 struct MetalKernelPipeline {
-  void release()
-  {
-    if (pipeline) {
-      [pipeline release];
-      pipeline = nil;
-      if (@available(macOS 11.0, *)) {
-        for (int i = 0; i < METALRT_TABLE_NUM; i++) {
-          if (intersection_func_table[i]) {
-            [intersection_func_table[i] release];
-            intersection_func_table[i] = nil;
-          }
-        }
-      }
-    }
-    if (function) {
-      [function release];
-      function = nil;
-    }
-    if (@available(macOS 11.0, *)) {
-      for (int i = 0; i < METALRT_TABLE_NUM; i++) {
-        if (intersection_func_table[i]) {
-          [intersection_func_table[i] release];
-        }
-      }
-    }
-  }
 
-  bool loaded = false;
-  id<MTLFunction> function = nil;
-  id<MTLComputePipelineState> pipeline = nil;
-
-  API_AVAILABLE(macos(11.0))
-  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
-};
-
-struct MetalKernelLoadDesc {
-  int pso_index = 0;
-  const char *function_name = nullptr;
-  int kernel_index = 0;
-  int threads_per_threadgroup = 0;
-  MTLFunctionConstantValues *constant_values = nullptr;
-  NSArray *linked_functions = nullptr;
-
-  struct IntersectorFunctions {
-    NSArray *defaults;
-    NSArray *shadow;
-    NSArray *local;
-    NSArray *operator[](int index) const
-    {
-      if (index == METALRT_TABLE_DEFAULT)
-        return defaults;
-      if (index == METALRT_TABLE_SHADOW)
-        return shadow;
-      return local;
-    }
-  } intersector_functions = {nullptr};
-};
-
-/* Metal kernel and associate occupancy information. */
-class MetalDeviceKernel {
- public:
-  ~MetalDeviceKernel();
+  void compile();
 
-  bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
+  id<MTLLibrary> mtlLibrary = nil;
+  bool scene_specialized;
+  string source_md5;
 
-  void mark_loaded(int pso_index)
-  {
-    pso[pso_index].loaded = true;
-  }
+  bool use_metalrt;
+  bool metalrt_hair;
+  bool metalrt_hair_thick;
+  bool metalrt_pointcloud;
 
-  int get_num_threads_per_block() const
-  {
-    return num_threads_per_block;
-  }
-  const MetalKernelPipeline &get_pso() const;
+  int threads_per_threadgroup;
 
-  double load_duration = 0.0;
+  DeviceKernel device_kernel;
+  bool loaded = false;
+  id<MTLDevice> mtlDevice = nil;
+  id<MTLFunction> function = nil;
+  id<MTLComputePipelineState> pipeline = nil;
+  int num_threads_per_block = 0;
 
- private:
-  MetalKernelPipeline pso[PSO_NUM];
+  string error_str;
 
-  int num_threads_per_block = 0;
+  API_AVAILABLE(macos(11.0))
+  id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
+  id<MTLFunction> rt_intersection_function[METALRT_FUNC_NUM] = {nil};
 };
 
 /* Cache of Metal kernels for each DeviceKernel. */
-class MetalDeviceKernels {
- public:
-  bool load(MetalDevice *device, int kernel_type);
-  bool available(DeviceKernel kernel) const;
-  const MetalDeviceKernel &get(DeviceKernel kernel) const;
+namespace MetalDeviceKernels {
 
-  MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+bool load(MetalDevice *device, bool scene_specialized);
+const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);
 
-  id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
-
-  string loaded_md5[PSO_NUM];
-};
+} /* namespace MetalDeviceKernels */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 9555ca03c8e..304efc813ec 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -9,6 +9,7 @@
 #  include "util/path.h"
 #  include "util/tbb.h"
 #  include "util/time.h"
+#  include "util/unique_ptr.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -28,82 +29,376 @@ const char *kernel_type_as_string(int kernel_type)
   return "";
 }
 
-MetalDeviceKernel::~MetalDeviceKernel()
+bool kernel_has_intersection(DeviceKernel device_kernel)
 {
-  for (int i = 0; i < PSO_NUM; i++) {
-    pso[i].release();
+  return (device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+          device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE);
+}
+
+struct ShaderCache {
+  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
+  {
+  }
+  ~ShaderCache();
+
+  /* Get the fastest available pipeline for the specified kernel. */
+  MetalKernelPipeline *get_best_pipeline(DeviceKernel kernel, const MetalDevice *device);
+
+  /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
+   * device. */
+  void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
+
+  void wait_for_all();
+
+ private:
+  friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
+
+  void compile_thread_func(int thread_index);
+
+  using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
+
+  struct PipelineRequest {
+    MetalKernelPipeline *pipeline = nullptr;
+    std::function<void(MetalKernelPipeline *)> completionHandler;
+  };
+
+  std::mutex cache_mutex;
+
+  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
+  id<MTLDevice> mtlDevice;
+
+  bool running = false;
+  std::condition_variable cond_var;
+  std::deque<PipelineRequest> request_queue;
+  std::vector<std::thread> compile_threads;
+  std::atomic_int incomplete_requests = 0;
+};
+
+std::mutex g_shaderCacheMutex;
+std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
+
+ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
+{
+  thread_scoped_lock lock(g_shaderCacheMutex);
+  auto it = g_shaderCache.find(mtlDevice);
+  if (it != g_shaderCache.end()) {
+    return it->second.get();
+  }
+
+  g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
+  return g_shaderCache[mtlDevice].get();
+}
+
+ShaderCache::~ShaderCache()
+{
+  metal_printf("ShaderCache shutting down with incomplete_requests = %d\n",
+               int(incomplete_requests));
+
+  running = false;
+  cond_var.notify_all();
+  for (auto &thread : compile_threads) {
+    thread.join();
+  }
+}
+
+void ShaderCache::wait_for_all()
+{
+  while (incomplete_requests > 0) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+}
+
+void ShaderCache::compile_thread_func(int thread_index)
+{
+  while (1) {
+
+    /* wait for / acquire next request */
+    PipelineRequest request;
+    {
+      thread_scoped_lock lock(cache_mutex);
+      cond_var.wait(lock, [&] { return !running || !request_queue.empty(); });
+      if (!running) {
+        break;
+      }
+
+      if (!request_queue.empty()) {
+        request = request_queue.front();
+        request_queue.pop_front();
+      }
+    }
+
+    /* service request */
+    if (request.pipeline) {
+      request.pipeline->compile();
+      incomplete_requests--;
+    }
   }
 }
 
-bool MetalDeviceKernel::load(MetalDevice *device,
-                             MetalKernelLoadDesc const &desc_in,
-                             MD5Hash const &md5)
+void ShaderCache::load_kernel(DeviceKernel device_kernel,
+                              MetalDevice *device,
+                              bool scene_specialized)
 {
-  __block MetalKernelLoadDesc const desc(desc_in);
-  if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+  {
+    /* create compiler threads on first run */
+    thread_scoped_lock lock(cache_mutex);
+    if (compile_threads.empty()) {
+      running = true;
+      for (int i = 0; i < max_mtlcompiler_threads; i++) {
+        compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
+      }
+    }
+  }
+
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
     /* skip megakernel */
-    return true;
+    return;
   }
 
-  bool use_binary_archive = true;
-  if (device->device_vendor == METAL_GPU_APPLE) {
-    /* Workaround for T94142: Cycles Metal crash with simultaneous viewport and final render */
-    use_binary_archive = false;
+  if (scene_specialized) {
+    /* Only specialize kernels where it can make an impact. */
+    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+      return;
+    }
   }
 
-  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
-    use_binary_archive = (atoi(str) == 0);
+  {
+    /* check whether the kernel has already been requested / cached */
+    thread_scoped_lock lock(cache_mutex);
+    for (auto &pipeline : pipelines[device_kernel]) {
+      if (scene_specialized) {
+        if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+          /* we already requested a pipeline that is specialized for this kernel data */
+          metal_printf("Specialized kernel already requested (%s)\n",
+                       device_kernel_as_string(device_kernel));
+          return;
+        }
+      }
+      else {
+        if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
+          /* we already requested a generic pipeline for this kernel */
+          metal_printf("Generic kernel already requested (%s)\n",
+                       device_kernel_as_string(device_kernel));
+          return;
+        }
+      }
+    }
   }
 
-  id<MTLBinaryArchive> archive = nil;
-  string metalbin_path;
-  if (use_binary_archive) {
-    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
-    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
-    MD5Hash local_md5(md5);
-    local_md5.append(osVersion);
-    string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() +
-                           to_string(desc.pso_index) + ".bin";
-    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
-    path_create_directories(metalbin_path);
+  incomplete_requests++;
+
+  PipelineRequest request;
+  request.pipeline = new MetalKernelPipeline;
+  request.pipeline->scene_specialized = scene_specialized;
+  request.pipeline->mtlDevice = mtlDevice;
+  request.pipeline->source_md5 =
+      device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+  request.pipeline->mtlLibrary =
+      device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+  request.pipeline->device_kernel = device_kernel;
+  request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
+
+  /* metalrt options */
+  request.pipeline->use_metalrt = device->use_metalrt;
+  request.pipeline->metalrt_hair = device->use_metalrt &&
+                                   (device->kernel_features & KERNEL_FEATURE_HAIR);
+  request.pipeline->metalrt_hair_thick = device->use_metalrt &&
+                                         (device->kernel_features & KERNEL_FEATURE_HAIR_THICK);
+  request.pipeline->metalrt_pointcloud = device->use_metalrt &&
+                                         (device->kernel_features & KERNEL_FEATURE_POINTCLOUD);
+
+  {
+    thread_scoped_lock lock(cache_mutex);
+    pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
+    request_queue.push_back(request);
+  }
+  cond_var.notify_one();
+}
 
-    if (path_exists(metalbin_path) && use_binary_archive) {
-      if (@available(macOS 11.0, *)) {
-        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
-        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
-        archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
-        [archiveDesc release];
+MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
+{
+  thread_scoped_lock lock(cache_mutex);
+  auto &collection = pipelines[kernel];
+  if (collection.empty()) {
+    return nullptr;
+  }
+
+  /* metalrt options */
+  bool use_metalrt = device->use_metalrt;
+  bool metalrt_hair = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR);
+  bool metalrt_hair_thick = use_metalrt && (device->kernel_features & KERNEL_FEATURE_HAIR_THICK);
+  bool metalrt_pointcloud = use_metalrt && (device->kernel_features & KERNEL_FEATURE_POINTCLOUD);
+
+  MetalKernelPipeline *best_pipeline = nullptr;
+  for (auto &pipeline : collection) {
+    if (!pipeline->loaded) {
+      /* still loading - ignore */
+      continue;
+    }
+
+    if (pipeline->use_metalrt != use_metalrt || pipeline->metalrt_hair != metalrt_hair ||
+        pipeline->metalrt_hair_thick != metalrt_hair_thick ||
+        pipeline->metalrt_pointcloud != metalrt_pointcloud) {
+      /* wrong combination of metalrt options */
+      continue;
+    }
+
+    if (pipeline->scene_specialized) {
+      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+        best_pipeline = pipeline.get();
       }
     }
+    else if (!best_pipeline) {
+      best_pipeline = pipeline.get();
+    }
+  }
+
+  return best_pipeline;
+}
+
+void MetalKernelPipeline::compile()
+{
+  int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
+
+  const std::string function_name = std::string("cycles_metal_") +
+                                    device_kernel_as_string(device_kernel);
+
+  int threads_per_threadgroup = this->threads_per_threadgroup;
+  if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
+      device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
+    /* Always use 512 for the sorting kernels */
+    threads_per_threadgroup = 512;
   }
 
-  NSString *entryPoint = [@(desc.function_name) copy];
+  NSString *entryPoint = [@(function_name.c_str()) copy];
 
   NSError *error = NULL;
   if (@available(macOS 11.0, *)) {
     MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
     func_desc.name = entryPoint;
-    if (desc.constant_values) {
-      func_desc.constantValues = desc.constant_values;
-    }
-    pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index]
-        newFunctionWithDescriptor:func_desc
-                            error:&error];
+    function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
   }
+
   [entryPoint release];
 
-  if (pso[desc.pso_index].function == nil) {
+  if (function == nil) {
     NSString *err = [error localizedDescription];
     string errors = [err UTF8String];
+    metal_printf("Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
+    return;
+  }
+
+  function.label = [entryPoint copy];
+
+  if (use_metalrt) {
+    if (@available(macOS 11.0, *)) {
+      /* create the id<MTLFunction> for each intersection function */
+      const char *function_names[] = {
+          "__anyhit__cycles_metalrt_visibility_test_tri",
+          "__anyhit__cycles_metalrt_visibility_test_box",
+          "__anyhit__cycles_metalrt_shadow_all_hit_tri",
+          "__anyhit__cycles_metalrt_shadow_all_hit_box",
+          "__anyhit__cycles_metalrt_local_hit_tri",
+          "__anyhit__cycles_metalrt_local_hit_box",
+          "__intersection__curve_ribbon",
+          "__intersection__curve_ribbon_shadow",
+          "__intersection__curve_all",
+          "__intersection__curve_all_shadow",
+          "__intersection__point",
+          "__intersection__point_shadow",
+      };
+      assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
+
+      MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
+      for (int i = 0; i < METALRT_FUNC_NUM; i++) {
+        const char *function_name = function_names[i];
+        desc.name = [@(function_name) copy];
+
+        NSError *error = NULL;
+        rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc error:&error];
 
-    device->set_error(
-        string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str()));
-    return false;
+        if (rt_intersection_function[i] == nil) {
+          NSString *err = [error localizedDescription];
+          string errors = [err UTF8String];
+
+          error_str = string_printf(
+              "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
+          break;
+        }
+
+        rt_intersection_function[i].label = [@(function_name) copy];
+      }
+    }
   }
 
-  pso[desc.pso_index].function.label = [@(desc.function_name) copy];
+  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
+  NSArray *linked_functions = nil;
+
+  if (use_metalrt) {
+    id<MTLFunction> curve_intersect_default = nil;
+    id<MTLFunction> curve_intersect_shadow = nil;
+    id<MTLFunction> point_intersect_default = nil;
+    id<MTLFunction> point_intersect_shadow = nil;
+    if (metalrt_hair) {
+      /* Add curve intersection programs. */
+      if (metalrt_hair_thick) {
+        /* Slower programs for thick hair since that also slows down ribbons.
+         * Ideally this should not be needed. */
+        curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL];
+        curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW];
+      }
+      else {
+        curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON];
+        curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW];
+      }
+    }
+    if (metalrt_pointcloud) {
+      point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT];
+      point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW];
+    }
+    table_functions[METALRT_TABLE_DEFAULT] = [NSArray
+        arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI],
+                         curve_intersect_default ?
+                             curve_intersect_default :
+                             rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
+                         point_intersect_default ?
+                             point_intersect_default :
+                             rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
+                         nil];
+    table_functions[METALRT_TABLE_SHADOW] = [NSArray
+        arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI],
+                         curve_intersect_shadow ?
+                             curve_intersect_shadow :
+                             rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
+                         point_intersect_shadow ?
+                             point_intersect_shadow :
+                             rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
+                         nil];
+    table_functions[METALRT_TABLE_LOCAL] = [NSArray
+        arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
+                         nil];
+
+    NSMutableSet *unique_functions = [NSMutableSet
+        setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
+    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
+    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+
+    if (kernel_has_intersection(device_kernel)) {
+      linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
+          sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
+            return [f1.label compare:f2.label];
+          }];
+    }
+    unique_functions = nil;
+  }
 
-  __block MTLComputePipelineDescriptor *computePipelineStateDescriptor =
+  MTLComputePipelineDescriptor *computePipelineStateDescriptor =
       [[MTLComputePipelineDescriptor alloc] init];
 
   computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
@@ -111,52 +406,86 @@ bool MetalDeviceKernel::load(MetalDevice *device,
   computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
 
   if (@available(macos 10.14, *)) {
-    computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup;
+    computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
   }
   computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
 
-  computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function;
+  computePipelineStateDescriptor.computeFunction = function;
+
   if (@available(macOS 11.0, *)) {
     /* Attach the additional functions to an MTLLinkedFunctions object */
-    if (desc.linked_functions) {
+    if (linked_functions) {
       computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
-      computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions;
+      computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
     }
-
     computePipelineStateDescriptor.maxCallStackDepth = 1;
+    if (use_metalrt) {
+      computePipelineStateDescriptor.maxCallStackDepth = 8;
+    }
   }
 
-  /* Create a new Compute pipeline state object */
   MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
 
-  bool creating_new_archive = false;
+  bool use_binary_archive = true;
+  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+    use_binary_archive = (atoi(str) == 0);
+  }
+
+  id<MTLBinaryArchive> archive = nil;
+  string metalbin_path;
+  string metalbin_name;
+  if (use_binary_archive) {
+    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
+    MD5Hash local_md5;
+    local_md5.append(source_md5);
+    local_md5.append(osVersion);
+    local_md5.append((uint8_t *)&this->threads_per_threadgroup,
+                     sizeof(this->threads_per_threadgroup));
+
+    string options;
+    if (use_metalrt && kernel_has_intersection(device_kernel)) {
+      /* incorporate any MetalRT specializations into the archive name */
+      options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d",
+                               metalrt_hair ? 1 : 0,
+                               metalrt_hair_thick ? 1 : 0,
+                               metalrt_pointcloud ? 1 : 0);
+    }
+
+    /* Replace non-alphanumerical characters with underscores. */
+    string device_name = [mtlDevice.name UTF8String];
+    for (char &c : device_name) {
+      if ((c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
+        c = '_';
+      }
+    }
+
+    metalbin_name = device_name;
+    metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
+    metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
+    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin");
+
+    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
+    path_create_directories(metalbin_path);
+
+    if (path_exists(metalbin_path) && use_binary_archive) {
+      if (@available(macOS 11.0, *)) {
+        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
+        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+        [archiveDesc release];
+      }
+    }
+  }
+
+  __block bool creating_new_archive = false;
   if (@available(macOS 11.0, *)) {
     if (use_binary_archive) {
       if (!archive) {
         MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
         archiveDesc.url = nil;
-        archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
+        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
         creating_new_archive = true;
-
-        double starttime = time_dt();
-
-        if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
-                                                          error:&error]) {
-          NSString *errStr = [error localizedDescription];
-          metal_printf("Failed to add PSO to archive:\n%s\n",
-                       errStr ? [errStr UTF8String] : "nil");
-        }
-        else {
-          double duration = time_dt() - starttime;
-          metal_printf("%2d | %-55s | %7.2fs\n",
-                       desc.kernel_index,
-                       device_kernel_as_string((DeviceKernel)desc.kernel_index),
-                       duration);
-
-          if (desc.pso_index == PSO_GENERIC) {
-            this->load_duration = duration;
-          }
-        }
       }
       computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
       pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
@@ -170,17 +499,14 @@ bool MetalDeviceKernel::load(MetalDevice *device,
       MTLComputePipelineReflection *reflection,
       NSError *error) {
     bool recreate_archive = false;
-    if (computePipelineState == nil && archive && !creating_new_archive) {
-
-      assert(0);
-
+    if (computePipelineState == nil && archive) {
       NSString *errStr = [error localizedDescription];
       metal_printf(
           "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
           "(error: %s)\n",
-          device_kernel_as_string((DeviceKernel)desc.kernel_index),
+          device_kernel_as_string((DeviceKernel)device_kernel),
           errStr ? [errStr UTF8String] : "nil");
-      computePipelineState = [device->mtlDevice
+      computePipelineState = [mtlDevice
           newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
                                         options:MTLPipelineOptionNone
                                      reflection:nullptr
@@ -192,32 +518,23 @@ bool MetalDeviceKernel::load(MetalDevice *device,
 
     if (computePipelineState == nil) {
       NSString *errStr = [error localizedDescription];
-      device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n",
-                                      device_kernel_as_string((DeviceKernel)desc.kernel_index)) +
-                        (errStr ? [errStr UTF8String] : "nil"));
-      metal_printf("%2d | %-55s | %7.2fs | FAILED!\n",
-                   desc.kernel_index,
-                   device_kernel_as_string((DeviceKernel)desc.kernel_index),
+      error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
+                                device_kernel_as_string((DeviceKernel)device_kernel));
+      error_str += (errStr ? [errStr UTF8String] : "nil");
+      metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
+                   kernel_type_as_string(pso_type),
+                   device_kernel,
+                   device_kernel_as_string((DeviceKernel)device_kernel),
                    duration);
       return;
     }
 
-    pso[desc.pso_index].pipeline = computePipelineState;
-    num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
-                                       computePipelineState.threadExecutionWidth);
+    int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+                                           computePipelineState.threadExecutionWidth);
     num_threads_per_block = std::max(num_threads_per_block,
                                      (int)computePipelineState.threadExecutionWidth);
-
-    if (!use_binary_archive) {
-      metal_printf("%2d | %-55s | %7.2fs\n",
-                   desc.kernel_index,
-                   device_kernel_as_string((DeviceKernel)desc.kernel_index),
-                   duration);
-
-      if (desc.pso_index == PSO_GENERIC) {
-        this->load_duration = duration;
-      }
-    }
+    this->pipeline = computePipelineState;
+    this->num_threads_per_block = num_threads_per_block;
 
     if (@available(macOS 11.0, *)) {
       if (creating_new_archive || recreate_archive) {
@@ -228,304 +545,85 @@ bool MetalDeviceKernel::load(MetalDevice *device,
         }
       }
     }
+  };
 
-    [computePipelineStateDescriptor release];
-    computePipelineStateDescriptor = nil;
-
-    if (device->use_metalrt && desc.linked_functions) {
-      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (@available(macOS 11.0, *)) {
-          MTLIntersectionFunctionTableDescriptor *ift_desc =
-              [[MTLIntersectionFunctionTableDescriptor alloc] init];
-          ift_desc.functionCount = desc.intersector_functions[table].count;
-
-          pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline
-              newIntersectionFunctionTableWithDescriptor:ift_desc];
-
-          /* Finally write the function handles into this pipeline's table */
-          for (int i = 0; i < 2; i++) {
-            id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline
-                functionHandleWithFunction:desc.intersector_functions[table][i]];
-            [pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i];
-          }
+  /* Block on load to ensure we continue with a valid kernel function */
+  if (creating_new_archive) {
+    starttime = time_dt();
+    NSError *error;
+    if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
+                                                      error:&error]) {
+      NSString *errStr = [error localizedDescription];
+      metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
+    }
+  }
+  id<MTLComputePipelineState> pipeline = [mtlDevice
+      newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                    options:pipelineOptions
+                                 reflection:nullptr
+                                      error:&error];
+  completionHandler(pipeline, nullptr, error);
+
+  this->loaded = true;
+  [computePipelineStateDescriptor release];
+  computePipelineStateDescriptor = nil;
+
+  if (use_metalrt && linked_functions) {
+    for (int table = 0; table < METALRT_TABLE_NUM; table++) {
+      if (@available(macOS 11.0, *)) {
+        MTLIntersectionFunctionTableDescriptor *ift_desc =
+            [[MTLIntersectionFunctionTableDescriptor alloc] init];
+        ift_desc.functionCount = table_functions[table].count;
+        intersection_func_table[table] = [this->pipeline
+            newIntersectionFunctionTableWithDescriptor:ift_desc];
+
+        /* Finally write the function handles into this pipeline's table */
+        for (int i = 0; i < 2; i++) {
+          id<MTLFunctionHandle> handle = [pipeline
+              functionHandleWithFunction:table_functions[table][i]];
+          [intersection_func_table[table] setFunction:handle atIndex:i];
         }
       }
     }
+  }
 
-    mark_loaded(desc.pso_index);
-  };
+  double duration = time_dt() - starttime;
 
-  if (desc.pso_index == PSO_SPECIALISED) {
-    /* Asynchronous load */
-    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
-      NSError *error;
-      id<MTLComputePipelineState> pipeline = [device->mtlDevice
-          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
-                                        options:pipelineOptions
-                                     reflection:nullptr
-                                          error:&error];
-      completionHandler(pipeline, nullptr, error);
-    });
+  if (!use_binary_archive) {
+    metal_printf("%16s | %2d | %-55s | %7.2fs\n",
+                 kernel_type_as_string(pso_type),
+                 int(device_kernel),
+                 device_kernel_as_string(device_kernel),
+                 duration);
   }
   else {
-    /* Block on load to ensure we continue with a valid kernel function */
-    id<MTLComputePipelineState> pipeline = [device->mtlDevice
-        newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
-                                      options:pipelineOptions
-                                   reflection:nullptr
-                                        error:&error];
-    completionHandler(pipeline, nullptr, error);
-  }
-
-  return true;
-}
-
-const MetalKernelPipeline &MetalDeviceKernel::get_pso() const
-{
-  if (pso[PSO_SPECIALISED].loaded) {
-    return pso[PSO_SPECIALISED];
+    metal_printf("%16s | %2d | %-55s | %7.2fs | %s: %s\n",
+                 kernel_type_as_string(pso_type),
+                 device_kernel,
+                 device_kernel_as_string((DeviceKernel)device_kernel),
+                 duration,
+                 creating_new_archive ? " new" : "load",
+                 metalbin_name.c_str());
   }
-
-  assert(pso[PSO_GENERIC].loaded);
-  return pso[PSO_GENERIC];
 }
 
-bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
+bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
 {
-  bool any_error = false;
-
-  MD5Hash md5;
-
-  /* Build the function constant table */
-  MTLFunctionConstantValues *constant_values = nullptr;
-  if (kernel_type == PSO_SPECIALISED) {
-    constant_values = [MTLFunctionConstantValues new];
-
-#  define KERNEL_FILM(_type, name) \
-    [constant_values setConstantValue:&data.film.name \
-                                 type:get_MTLDataType_##_type() \
-                              atIndex:KernelData_film_##name]; \
-    md5.append((uint8_t *)&data.film.name, sizeof(data.film.name));
-
-#  define KERNEL_BACKGROUND(_type, name) \
-    [constant_values setConstantValue:&data.background.name \
-                                 type:get_MTLDataType_##_type() \
-                              atIndex:KernelData_background_##name]; \
-    md5.append((uint8_t *)&data.background.name, sizeof(data.background.name));
-
-#  define KERNEL_INTEGRATOR(_type, name) \
-    [constant_values setConstantValue:&data.integrator.name \
-                                 type:get_MTLDataType_##_type() \
-                              atIndex:KernelData_integrator_##name]; \
-    md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name));
-
-#  define KERNEL_BVH(_type, name) \
-    [constant_values setConstantValue:&data.bvh.name \
-                                 type:get_MTLDataType_##_type() \
-                              atIndex:KernelData_bvh_##name]; \
-    md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name));
-
-    /* METAL_WIP: populate constant_values based on KernelData */
-    assert(0);
-    /*
-        const KernelData &data = device->launch_params.data;
-    #    include "kernel/types/background.h"
-    #    include "kernel/types/bvh.h"
-    #    include "kernel/types/film.h"
-    #    include "kernel/types/integrator.h"
-    */
-  }
-
-  if (device->use_metalrt) {
-    if (@available(macOS 11.0, *)) {
-      /* create the id<MTLFunction> for each intersection function */
-      const char *function_names[] = {
-          "__anyhit__cycles_metalrt_visibility_test_tri",
-          "__anyhit__cycles_metalrt_visibility_test_box",
-          "__anyhit__cycles_metalrt_shadow_all_hit_tri",
-          "__anyhit__cycles_metalrt_shadow_all_hit_box",
-          "__anyhit__cycles_metalrt_local_hit_tri",
-          "__anyhit__cycles_metalrt_local_hit_box",
-          "__intersection__curve_ribbon",
-          "__intersection__curve_ribbon_shadow",
-          "__intersection__curve_all",
-          "__intersection__curve_all_shadow",
-          "__intersection__point",
-          "__intersection__point_shadow",
-      };
-      assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
-
-      MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
-      if (kernel_type == PSO_SPECIALISED) {
-        desc.constantValues = constant_values;
-      }
-      for (int i = 0; i < METALRT_FUNC_NUM; i++) {
-        const char *function_name = function_names[i];
-        desc.name = [@(function_name) copy];
-
-        NSError *error = NULL;
-        rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type]
-            newFunctionWithDescriptor:desc
-                                error:&error];
-
-        if (rt_intersection_funcs[kernel_type][i] == nil) {
-          NSString *err = [error localizedDescription];
-          string errors = [err UTF8String];
-
-          device->set_error(string_printf(
-              "Error getting intersection function \"%s\": %s", function_name, errors.c_str()));
-          any_error = true;
-          break;
-        }
-
-        rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy];
-      }
-    }
-  }
-  md5.append(device->source_used_for_compile[kernel_type]);
-
-  string hash = md5.get_hex();
-  if (loaded_md5[kernel_type] == hash) {
-    return true;
-  }
-
-  if (!any_error) {
-    NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
-    NSArray *function_list = nil;
-
-    if (device->use_metalrt) {
-      id<MTLFunction> curve_intersect_default = nil;
-      id<MTLFunction> curve_intersect_shadow = nil;
-      id<MTLFunction> point_intersect_default = nil;
-      id<MTLFunction> point_intersect_shadow = nil;
-      if (device->kernel_features & KERNEL_FEATURE_HAIR) {
-        /* Add curve intersection programs. */
-        if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
-          /* Slower programs for thick hair since that also slows down ribbons.
-           * Ideally this should not be needed. */
-          curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
-          curve_intersect_shadow =
-              rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
-        }
-        else {
-          curve_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
-          curve_intersect_shadow =
-              rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
-        }
-      }
-      if (device->kernel_features & KERNEL_FEATURE_POINTCLOUD) {
-        point_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT];
-        point_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_POINT_SHADOW];
-      }
-      table_functions[METALRT_TABLE_DEFAULT] = [NSArray
-          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
-                           curve_intersect_default ?
-                               curve_intersect_default :
-                               rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
-                           point_intersect_default ?
-                               point_intersect_default :
-                               rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
-                           nil];
-      table_functions[METALRT_TABLE_SHADOW] = [NSArray
-          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
-                           curve_intersect_shadow ?
-                               curve_intersect_shadow :
-                               rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
-                           point_intersect_shadow ?
-                               point_intersect_shadow :
-                               rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
-                           nil];
-      table_functions[METALRT_TABLE_LOCAL] = [NSArray
-          arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
-                           rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
-                           rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
-                           nil];
-
-      NSMutableSet *unique_functions = [NSMutableSet
-          setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
-      [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
-      [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
-
-      function_list = [[NSArray arrayWithArray:[unique_functions allObjects]]
-          sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
-            return [f1.label compare:f2.label];
-          }];
-
-      unique_functions = nil;
-    }
-
-    metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n",
-                 kernel_type_as_string(kernel_type));
-
-    tbb::task_arena local_arena(max_mtlcompiler_threads);
-    local_arena.execute([&]() {
-      parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) {
-        /* skip megakernel */
-        if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-          return;
-        }
-
-        /* Only specialize kernels where it can make an impact. */
-        if (kernel_type == PSO_SPECIALISED) {
-          if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-              i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-            return;
-          }
-        }
-
-        MetalDeviceKernel &kernel = kernels_[i];
-
-        const std::string function_name = std::string("cycles_metal_") +
-                                          device_kernel_as_string((DeviceKernel)i);
-        int threads_per_threadgroup = device->max_threads_per_threadgroup;
-        if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) {
-          /* Always use 512 for the sorting kernels */
-          threads_per_threadgroup = 512;
-        }
-
-        NSArray *kernel_function_list = nil;
-
-        if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
-            i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
-            i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
-          kernel_function_list = function_list;
-        }
-
-        MetalKernelLoadDesc desc;
-        desc.pso_index = kernel_type;
-        desc.kernel_index = i;
-        desc.linked_functions = kernel_function_list;
-        desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT];
-        desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW];
-        desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL];
-        desc.constant_values = constant_values;
-        desc.threads_per_threadgroup = threads_per_threadgroup;
-        desc.function_name = function_name.c_str();
-
-        bool success = kernel.load(device, desc, md5);
-
-        any_error |= !success;
-      });
-    });
+  auto shader_cache = get_shader_cache(device->mtlDevice);
+  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
+    shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
   }
 
-  bool loaded = !any_error;
-  if (loaded) {
-    loaded_md5[kernel_type] = hash;
+  if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
+    shader_cache->wait_for_all();
   }
-  return loaded;
-}
-
-const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const
-{
-  return kernels_[(int)kernel];
+  return true;
 }
 
-bool MetalDeviceKernels::available(DeviceKernel kernel) const
+const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
+                                                                 DeviceKernel kernel)
 {
-  return kernels_[(int)kernel].get_pso().function != nil;
+  return get_shader_cache(device->mtlDevice)->get_best_pipeline(kernel, device);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 1686ab95ffa..ec10e091b25 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -108,9 +108,6 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
           << work_size;
 
-  const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
-  const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
-
   id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
 
   /* Determine size requirement for argument buffer. */
@@ -212,6 +209,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
   }
   bytes_written = globals_offsets + sizeof(KernelParamsMetal);
 
+  const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(metal_device,
+                                                                                      kernel);
+  if (!metal_kernel_pso) {
+    metal_device->set_error(
+        string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel)));
+    return false;
+  }
+
   /* Encode ancillaries */
   [metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
   [metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
@@ -228,14 +233,14 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
       }
 
       for (int table = 0; table < METALRT_TABLE_NUM; table++) {
-        if (metal_kernel_pso.intersection_func_table[table]) {
-          [metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer
-                                                              offset:globals_offsets
-                                                             atIndex:1];
+        if (metal_kernel_pso->intersection_func_table[table]) {
+          [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer
+                                                               offset:globals_offsets
+                                                              atIndex:1];
           [metal_device->mtlAncillaryArgEncoder
-              setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table]
+              setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
                                    atIndex:3 + table];
-          [mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table]
+          [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
                                           usage:MTLResourceUsageRead];
         }
         else {
@@ -281,10 +286,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
     }
   }
 
-  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
+  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline];
 
   /* Compute kernel launch parameters. */
-  const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
+  const int num_threads_per_block = metal_kernel_pso->num_threads_per_block;
 
   int shared_mem_bytes = 0;
 
@@ -314,7 +319,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                            threadsPerThreadgroup:size_threads_per_threadgroup];
 
   [mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
-    NSString *kernel_name = metal_kernel_pso.function.label;
+    NSString *kernel_name = metal_kernel_pso->function.label;
 
     /* Enhanced command buffer errors are only available in 11.0+ */
     if (@available(macos 11.0, *)) {
@@ -547,6 +552,8 @@ id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel
         computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                            MTLDispatchTypeSerial];
 
+    [mtlComputeEncoder setLabel:@(device_kernel_as_string(kernel))];
+
     /* declare usage of MTLBuffers etc */
     prepare_resources(kernel);
   }
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 8830d8c44ac..9fc265bc327 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -23,6 +23,7 @@
 #  include "util/md5.h"
 #  include "util/path.h"
 #  include "util/progress.h"
+#  include "util/task.h"
 #  include "util/time.h"
 
 #  undef __KERNEL_CPU__
@@ -216,6 +217,25 @@ static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
   return OPTIX_SUCCESS;
 }
 
+#  if OPTIX_ABI_VERSION >= 55
+static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
+{
+  OptixTask additional_tasks[16];
+  unsigned int num_additional_tasks = 0;
+
+  const OptixResult result = optixTaskExecute(task, additional_tasks, 16, &num_additional_tasks);
+  if (result == OPTIX_SUCCESS) {
+    for (unsigned int i = 0; i < num_additional_tasks; ++i) {
+      pool.push(function_bind(
+          &execute_optix_task, std::ref(pool), additional_tasks[i], std::ref(failure_reason)));
+    }
+  }
+  else {
+    failure_reason = result;
+  }
+}
+#  endif
+
 }  // namespace
 
 OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
@@ -453,6 +473,23 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       return false;
     }
 
+#  if OPTIX_ABI_VERSION >= 55
+    OptixTask task = nullptr;
+    OptixResult result = optixModuleCreateFromPTXWithTasks(context,
+                                                           &module_options,
+                                                           &pipeline_options,
+                                                           ptx_data.data(),
+                                                           ptx_data.size(),
+                                                           nullptr,
+                                                           nullptr,
+                                                           &optix_module,
+                                                           &task);
+    if (result == OPTIX_SUCCESS) {
+      TaskPool pool;
+      execute_optix_task(pool, task, result);
+      pool.wait_work();
+    }
+#  else
     const OptixResult result = optixModuleCreateFromPTX(context,
                                                         &module_options,
                                                         &pipeline_options,
@@ -461,6 +498,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                                         nullptr,
                                                         0,
                                                         &optix_module);
+#  endif
     if (result != OPTIX_SUCCESS) {
       set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
                               ptx_filename.c_str(),
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index d97854a52d0..473bdb67920 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -145,6 +145,7 @@ set(SRC_KERNEL_SVM_HEADERS
   svm/normal.h
   svm/ramp.h
   svm/ramp_util.h
+  svm/sepcomb_color.h
   svm/sepcomb_hsv.h
   svm/sepcomb_vector.h
   svm/sky.h
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 4e309f16c08..0ed52074a90 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -29,10 +29,26 @@ using namespace metal::raytracing;
 
 /* Qualifiers */
 
-#define ccl_device
-#define ccl_device_inline ccl_device
-#define ccl_device_forceinline ccl_device
-#define ccl_device_noinline ccl_device __attribute__((noinline))
+#if defined(__KERNEL_METAL_APPLE__)
+
+/* Inline everything for Apple GPUs.
+ * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
+ * at the cost of longer compile times (~4.5 minutes on M1 Max). */
+
+#  define ccl_device __attribute__((always_inline))
+#  define ccl_device_inline __attribute__((always_inline))
+#  define ccl_device_forceinline __attribute__((always_inline))
+#  define ccl_device_noinline __attribute__((always_inline))
+
+#else
+
+#  define ccl_device
+#  define ccl_device_inline ccl_device
+#  define ccl_device_forceinline ccl_device
+#  define ccl_device_noinline ccl_device __attribute__((noinline))
+
+#endif
+
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index 832498f1f73..e2e10b5b83f 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -1304,8 +1304,38 @@ bool OSLRenderServices::texture(ustring filename,
       break;
     }
     case OSLTextureHandle::SVM: {
-      /* Packed texture. */
-      float4 rgba = kernel_tex_image_interp(kernel_globals, handle->svm_slot, s, 1.0f - t);
+      int id = -1;
+      if (handle->svm_slots[0].w == -1) {
+        /* Packed single texture. */
+        id = handle->svm_slots[0].y;
+      }
+      else {
+        /* Packed tiled texture. */
+        int tx = (int)s;
+        int ty = (int)t;
+        int tile = 1001 + 10 * ty + tx;
+        for (int4 tile_node : handle->svm_slots) {
+          if (tile_node.x == tile) {
+            id = tile_node.y;
+            break;
+          }
+          if (tile_node.z == tile) {
+            id = tile_node.w;
+            break;
+          }
+        }
+        s -= tx;
+        t -= ty;
+      }
+
+      float4 rgba;
+      if (id == -1) {
+        rgba = make_float4(
+            TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+      }
+      else {
+        rgba = kernel_tex_image_interp(kernel_globals, id, s, 1.0f - t);
+      }
 
       result[0] = rgba[0];
       if (nchannels > 1)
@@ -1319,7 +1349,7 @@ bool OSLRenderServices::texture(ustring filename,
     }
     case OSLTextureHandle::IES: {
       /* IES light. */
-      result[0] = kernel_ies_interp(kernel_globals, handle->svm_slot, s, t);
+      result[0] = kernel_ies_interp(kernel_globals, handle->svm_slots[0].y, s, t);
       status = true;
       break;
     }
@@ -1413,7 +1443,7 @@ bool OSLRenderServices::texture3d(ustring filename,
       /* Packed texture. */
       ShaderData *sd = (ShaderData *)(sg->renderstate);
       KernelGlobals kernel_globals = sd->osl_globals;
-      int slot = handle->svm_slot;
+      int slot = handle->svm_slots[0].y;
       float3 P_float3 = make_float3(P.x, P.y, P.z);
       float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE);
 
diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h
index 653fa017140..edffd912bad 100644
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -39,18 +39,26 @@ struct KernelGlobalsCPU;
  * with additional data.
  *
  * These are stored in a concurrent hash map, because OSL can compile multiple
- * shaders in parallel. */
+ * shaders in parallel.
+ *
+ * NOTE: The svm_slots array contains a compressed mapping of tile to svm_slot pairs
+ * stored as follows: x:tile_a, y:svm_slot_a, z:tile_b, w:svm_slot_b etc. */
 
 struct OSLTextureHandle : public OIIO::RefCnt {
   enum Type { OIIO, SVM, IES, BEVEL, AO };
 
+  OSLTextureHandle(Type type, const vector<int4> &svm_slots)
+      : type(type), svm_slots(svm_slots), oiio_handle(NULL), processor(NULL)
+  {
+  }
+
   OSLTextureHandle(Type type = OIIO, int svm_slot = -1)
-      : type(type), svm_slot(svm_slot), oiio_handle(NULL), processor(NULL)
+      : OSLTextureHandle(type, {make_int4(0, svm_slot, -1, -1)})
   {
   }
 
   Type type;
-  int svm_slot;
+  vector<int4> svm_slots;
   OSL::TextureSystem::TextureHandle *oiio_handle;
   ColorSpaceProcessor *processor;
 };
diff --git a/intern/cycles/kernel/osl/shaders/CMakeLists.txt b/intern/cycles/kernel/osl/shaders/CMakeLists.txt
index 7ced21c5670..741bce7c399 100644
--- a/intern/cycles/kernel/osl/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/shaders/CMakeLists.txt
@@ -16,6 +16,7 @@ set(SRC_OSL
   node_camera.osl
   node_checker_texture.osl
   node_clamp.osl
+  node_combine_color.osl
   node_combine_rgb.osl
   node_combine_hsv.osl
   node_combine_xyz.osl
@@ -68,6 +69,7 @@ set(SRC_OSL
   node_refraction_bsdf.osl
   node_rgb_curves.osl
   node_rgb_ramp.osl
+  node_separate_color.osl
   node_separate_rgb.osl
   node_separate_hsv.osl
   node_separate_xyz.osl
diff --git a/intern/cycles/kernel/osl/shaders/node_color.h b/intern/cycles/kernel/osl/shaders/node_color.h
index 388dd114e9a..06735f5b03d 100644
--- a/intern/cycles/kernel/osl/shaders/node_color.h
+++ b/intern/cycles/kernel/osl/shaders/node_color.h
@@ -148,3 +148,53 @@ color hsv_to_rgb(color hsv)
 
   return rgb;
 }
+
+color rgb_to_hsl(color rgb)
+{
+  float cmax, cmin, h, s, l;
+
+  cmax = max(rgb[0], max(rgb[1], rgb[2]));
+  cmin = min(rgb[0], min(rgb[1], rgb[2]));
+  l = min(1.0, (cmax + cmin) / 2.0);
+
+  if (cmax == cmin) {
+    h = s = 0.0; /* achromatic */
+  }
+  else {
+    float cdelta = cmax - cmin;
+    s = l > 0.5 ? cdelta / (2.0 - cmax - cmin) : cdelta / (cmax + cmin);
+    if (cmax == rgb[0]) {
+      h = (rgb[1] - rgb[2]) / cdelta + (rgb[1] < rgb[2] ? 6.0 : 0.0);
+    }
+    else if (cmax == rgb[1]) {
+      h = (rgb[2] - rgb[0]) / cdelta + 2.0;
+    }
+    else {
+      h = (rgb[0] - rgb[1]) / cdelta + 4.0;
+    }
+  }
+  h /= 6.0;
+
+  return color(h, s, l);
+}
+
+color hsl_to_rgb(color hsl)
+{
+  float nr, ng, nb, chroma, h, s, l;
+
+  h = hsl[0];
+  s = hsl[1];
+  l = hsl[2];
+
+  nr = abs(h * 6.0 - 3.0) - 1.0;
+  ng = 2.0 - abs(h * 6.0 - 2.0);
+  nb = 2.0 - abs(h * 6.0 - 4.0);
+
+  nr = clamp(nr, 0.0, 1.0);
+  nb = clamp(nb, 0.0, 1.0);
+  ng = clamp(ng, 0.0, 1.0);
+
+  chroma = (1.0 - abs(2.0 * l - 1.0)) * s;
+
+  return color((nr - 0.5) * chroma + l, (ng - 0.5) * chroma + l, (nb - 0.5) * chroma + l);
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_combine_color.osl b/intern/cycles/kernel/osl/shaders/node_combine_color.osl
new file mode 100644
index 00000000000..681a592d2bb
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_combine_color.osl
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "stdcycles.h"
+
+shader node_combine_color(string color_type = "rgb",
+                          float Red = 0.0,
+                          float Green = 0.0,
+                          float Blue = 0.0,
+                          output color Color = 0.8)
+{
+  if (color_type == "rgb" || color_type == "hsv" || color_type == "hsl")
+    Color = color(color_type, Red, Green, Blue);
+  else
+    warning("%s", "Unknown color space!");
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_separate_color.osl b/intern/cycles/kernel/osl/shaders/node_separate_color.osl
new file mode 100644
index 00000000000..6f3e3149d8e
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_separate_color.osl
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "node_color.h"
+#include "stdcycles.h"
+
+shader node_separate_color(string color_type = "rgb",
+                           color Color = 0.8,
+                           output float Red = 0.0,
+                           output float Green = 0.0,
+                           output float Blue = 0.0)
+{
+  color col;
+  if (color_type == "rgb")
+    col = Color;
+  else if (color_type == "hsv")
+    col = rgb_to_hsv(Color);
+  else if (color_type == "hsl")
+    col = rgb_to_hsl(Color);
+  else
+    warning("%s", "Unknown color space!");
+
+  Red = col[0];
+  Green = col[1];
+  Blue = col[2];
+}
diff --git a/intern/cycles/kernel/svm/color_util.h b/intern/cycles/kernel/svm/color_util.h
index b439721383c..fa22d4bc8c2 100644
--- a/intern/cycles/kernel/svm/color_util.h
+++ b/intern/cycles/kernel/svm/color_util.h
@@ -307,4 +307,30 @@ ccl_device_inline float3 svm_brightness_contrast(float3 color, float brightness,
   return color;
 }
 
+ccl_device float3 svm_combine_color(NodeCombSepColorType type, float3 color)
+{
+  switch (type) {
+    case NODE_COMBSEP_COLOR_HSV:
+      return hsv_to_rgb(color);
+    case NODE_COMBSEP_COLOR_HSL:
+      return hsl_to_rgb(color);
+    case NODE_COMBSEP_COLOR_RGB:
+    default:
+      return color;
+  }
+}
+
+ccl_device float3 svm_separate_color(NodeCombSepColorType type, float3 color)
+{
+  switch (type) {
+    case NODE_COMBSEP_COLOR_HSV:
+      return rgb_to_hsv(color);
+    case NODE_COMBSEP_COLOR_HSL:
+      return rgb_to_hsl(color);
+    case NODE_COMBSEP_COLOR_RGB:
+    default:
+      return color;
+  }
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/sepcomb_color.h b/intern/cycles/kernel/svm/sepcomb_color.h
new file mode 100644
index 00000000000..d186e7f163b
--- /dev/null
+++ b/intern/cycles/kernel/svm/sepcomb_color.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_noinline void svm_node_combine_color(KernelGlobals kg,
+                                                ccl_private ShaderData *sd,
+                                                ccl_private float *stack,
+                                                uint color_type,
+                                                uint inputs_stack_offsets,
+                                                uint result_stack_offset)
+{
+  uint red_stack_offset, green_stack_offset, blue_stack_offset;
+  svm_unpack_node_uchar3(
+      inputs_stack_offsets, &red_stack_offset, &green_stack_offset, &blue_stack_offset);
+
+  float r = stack_load_float(stack, red_stack_offset);
+  float g = stack_load_float(stack, green_stack_offset);
+  float b = stack_load_float(stack, blue_stack_offset);
+
+  /* Combine, and convert back to RGB */
+  float3 color = svm_combine_color((NodeCombSepColorType)color_type, make_float3(r, g, b));
+
+  if (stack_valid(result_stack_offset))
+    stack_store_float3(stack, result_stack_offset, color);
+}
+
+ccl_device_noinline void svm_node_separate_color(KernelGlobals kg,
+                                                 ccl_private ShaderData *sd,
+                                                 ccl_private float *stack,
+                                                 uint color_type,
+                                                 uint input_stack_offset,
+                                                 uint results_stack_offsets)
+{
+  float3 color = stack_load_float3(stack, input_stack_offset);
+
+  /* Convert color space */
+  color = svm_separate_color((NodeCombSepColorType)color_type, color);
+
+  uint red_stack_offset, green_stack_offset, blue_stack_offset;
+  svm_unpack_node_uchar3(
+      results_stack_offsets, &red_stack_offset, &green_stack_offset, &blue_stack_offset);
+
+  if (stack_valid(red_stack_offset))
+    stack_store_float(stack, red_stack_offset, color.x);
+  if (stack_valid(green_stack_offset))
+    stack_store_float(stack, green_stack_offset, color.y);
+  if (stack_valid(blue_stack_offset))
+    stack_store_float(stack, blue_stack_offset, color.z);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 08352a6231f..5def943c87f 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -181,6 +181,7 @@ CCL_NAMESPACE_END
 #include "kernel/svm/noisetex.h"
 #include "kernel/svm/normal.h"
 #include "kernel/svm/ramp.h"
+#include "kernel/svm/sepcomb_color.h"
 #include "kernel/svm/sepcomb_hsv.h"
 #include "kernel/svm/sepcomb_vector.h"
 #include "kernel/svm/sky.h"
@@ -508,6 +509,12 @@ ccl_device void svm_eval_nodes(KernelGlobals kg,
       case NODE_MIX:
         offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
         break;
+      case NODE_SEPARATE_COLOR:
+        svm_node_separate_color(kg, sd, stack, node.y, node.z, node.w);
+        break;
+      case NODE_COMBINE_COLOR:
+        svm_node_combine_color(kg, sd, stack, node.y, node.z, node.w);
+        break;
       case NODE_SEPARATE_VECTOR:
         svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
         break;
diff --git a/intern/cycles/kernel/svm/types.h b/intern/cycles/kernel/svm/types.h
index bede58f7a54..82109ec4c4f 100644
--- a/intern/cycles/kernel/svm/types.h
+++ b/intern/cycles/kernel/svm/types.h
@@ -92,6 +92,8 @@ typedef enum ShaderNodeType {
   NODE_NORMAL_MAP,
   NODE_INVERT,
   NODE_MIX,
+  NODE_SEPARATE_COLOR,
+  NODE_COMBINE_COLOR,
   NODE_SEPARATE_VECTOR,
   NODE_COMBINE_VECTOR,
   NODE_SEPARATE_HSV,
@@ -487,6 +489,12 @@ typedef enum NodePrincipledHairParametrization {
   NODE_PRINCIPLED_HAIR_NUM,
 } NodePrincipledHairParametrization;
 
+typedef enum NodeCombSepColorType {
+  NODE_COMBSEP_COLOR_RGB,
+  NODE_COMBSEP_COLOR_HSV,
+  NODE_COMBSEP_COLOR_HSL,
+} NodeCombSepColorType;
+
 /* Closure */
 
 typedef enum ClosureType {
diff --git a/intern/cycles/scene/image.cpp b/intern/cycles/scene/image.cpp
index 21fde88915e..c61ad1f1d71 100644
--- a/intern/cycles/scene/image.cpp
+++ b/intern/cycles/scene/image.cpp
@@ -117,12 +117,12 @@ void ImageHandle::clear()
   manager = NULL;
 }
 
-bool ImageHandle::empty()
+bool ImageHandle::empty() const
 {
   return tile_slots.empty();
 }
 
-int ImageHandle::num_tiles()
+int ImageHandle::num_tiles() const
 {
   return tile_slots.size();
 }
@@ -154,6 +154,35 @@ int ImageHandle::svm_slot(const int tile_index) const
   return tile_slots[tile_index];
 }
 
+vector<int4> ImageHandle::get_svm_slots() const
+{
+  const size_t num_nodes = divide_up(tile_slots.size(), 2);
+
+  vector<int4> svm_slots;
+  svm_slots.reserve(num_nodes);
+  for (size_t i = 0; i < num_nodes; i++) {
+    int4 node;
+
+    int slot = tile_slots[2 * i];
+    node.x = manager->images[slot]->loader->get_tile_number();
+    node.y = slot;
+
+    if ((2 * i + 1) < tile_slots.size()) {
+      slot = tile_slots[2 * i + 1];
+      node.z = manager->images[slot]->loader->get_tile_number();
+      node.w = slot;
+    }
+    else {
+      node.z = -1;
+      node.w = -1;
+    }
+
+    svm_slots.push_back(node);
+  }
+
+  return svm_slots;
+}
+
 device_texture *ImageHandle::image_memory(const int tile_index) const
 {
   if (tile_index >= tile_slots.size()) {
@@ -266,6 +295,11 @@ ustring ImageLoader::osl_filepath() const
   return ustring();
 }
 
+int ImageLoader::get_tile_number() const
+{
+  return 0;
+}
+
 bool ImageLoader::equals(const ImageLoader *a, const ImageLoader *b)
 {
   if (a == NULL && b == NULL) {
@@ -397,6 +431,19 @@ ImageHandle ImageManager::add_image(ImageLoader *loader,
   return handle;
 }
 
+ImageHandle ImageManager::add_image(const vector<ImageLoader *> &loaders,
+                                    const ImageParams &params)
+{
+  ImageHandle handle;
+  for (ImageLoader *loader : loaders) {
+    const int slot = add_image_slot(loader, params, true);
+    handle.tile_slots.push_back(slot);
+  }
+
+  handle.manager = this;
+  return handle;
+}
+
 int ImageManager::add_image_slot(ImageLoader *loader,
                                  const ImageParams &params,
                                  const bool builtin)
diff --git a/intern/cycles/scene/image.h b/intern/cycles/scene/image.h
index 4d0dee35eca..9edb6a7eaf5 100644
--- a/intern/cycles/scene/image.h
+++ b/intern/cycles/scene/image.h
@@ -112,6 +112,9 @@ class ImageLoader {
   /* Optional for OSL texture cache. */
   virtual ustring osl_filepath() const;
 
+  /* Optional for tiled textures loaded externally. */
+  virtual int get_tile_number() const;
+
   /* Free any memory used for loading metadata and pixels. */
   virtual void cleanup(){};
 
@@ -139,11 +142,12 @@ class ImageHandle {
 
   void clear();
 
-  bool empty();
-  int num_tiles();
+  bool empty() const;
+  int num_tiles() const;
 
   ImageMetaData metadata();
   int svm_slot(const int tile_index = 0) const;
+  vector<int4> get_svm_slots() const;
   device_texture *image_memory(const int tile_index = 0) const;
 
   VDBImageLoader *vdb_loader(const int tile_index = 0) const;
@@ -169,6 +173,7 @@ class ImageManager {
                         const ImageParams &params,
                         const array<int> &tiles);
   ImageHandle add_image(ImageLoader *loader, const ImageParams &params, const bool builtin = true);
+  ImageHandle add_image(const vector<ImageLoader *> &loaders, const ImageParams &params);
 
   void device_update(Device *device, Scene *scene, Progress &progress);
   void device_update_slot(Device *device, Scene *scene, int slot, Progress *progress);
diff --git a/intern/cycles/scene/osl.cpp b/intern/cycles/scene/osl.cpp
index ffa1a2f5623..6698e6e2cce 100644
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -1211,14 +1211,15 @@ void OSLCompiler::parameter_texture(const char *name, ustring filename, ustring
   parameter(name, filename);
 }
 
-void OSLCompiler::parameter_texture(const char *name, int svm_slot)
+void OSLCompiler::parameter_texture(const char *name, const ImageHandle &handle)
 {
   /* Texture loaded through SVM image texture system. We generate a unique
    * name, which ends up being used in OSLRenderServices::get_texture_handle
    * to get handle again. Note that this name must be unique between multiple
    * render sessions as the render services are shared. */
   ustring filename(string_printf("@svm%d", texture_shared_unique_id++).c_str());
-  services->textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::SVM, svm_slot));
+  services->textures.insert(filename,
+                            new OSLTextureHandle(OSLTextureHandle::SVM, handle.get_svm_slots()));
   parameter(name, filename);
 }
 
@@ -1290,7 +1291,7 @@ void OSLCompiler::parameter_texture(const char * /* name */,
 {
 }
 
-void OSLCompiler::parameter_texture(const char * /* name */, int /* svm_slot */)
+void OSLCompiler::parameter_texture(const char * /* name */, const ImageHandle & /*handle*/)
 {
 }
 
diff --git a/intern/cycles/scene/osl.h b/intern/cycles/scene/osl.h
index f0f97dbcaad..bf27069b1b1 100644
--- a/intern/cycles/scene/osl.h
+++ b/intern/cycles/scene/osl.h
@@ -147,7 +147,7 @@ class OSLCompiler {
   void parameter_attribute(const char *name, ustring s);
 
   void parameter_texture(const char *name, ustring filename, ustring colorspace);
-  void parameter_texture(const char *name, int svm_slot);
+  void parameter_texture(const char *name, const ImageHandle &handle);
   void parameter_texture_ies(const char *name, int svm_slot);
 
   ShaderType output_type()
diff --git a/intern/cycles/scene/shader_nodes.cpp b/intern/cycles/scene/shader_nodes.cpp
index 95fccf725f3..03c152928d5 100644
--- a/intern/cycles/scene/shader_nodes.cpp
+++ b/intern/cycles/scene/shader_nodes.cpp
@@ -19,7 +19,6 @@
 #include "util/color.h"
 #include "util/foreach.h"
 #include "util/log.h"
-#include "util/string.h"
 #include "util/transform.h"
 
 #include "kernel/tables.h"
@@ -450,22 +449,19 @@ void ImageTextureNode::compile(OSLCompiler &compiler)
   const ustring known_colorspace = metadata.colorspace;
 
   if (handle.svm_slot() == -1) {
-    /* OIIO currently does not support <UVTILE> substitutions natively. Replace with a format they
-     * understand. */
-    std::string osl_filename = filename.string();
-    string_replace(osl_filename, "<UVTILE>", "<U>_<V>");
     compiler.parameter_texture(
-        "filename", ustring(osl_filename), compress_as_srgb ? u_colorspace_raw : known_colorspace);
+        "filename", filename, compress_as_srgb ? u_colorspace_raw : known_colorspace);
   }
   else {
-    compiler.parameter_texture("filename", handle.svm_slot());
+    compiler.parameter_texture("filename", handle);
   }
 
   const bool unassociate_alpha = !(ColorSpaceManager::colorspace_is_data(colorspace) ||
                                    alpha_type == IMAGE_ALPHA_CHANNEL_PACKED ||
                                    alpha_type == IMAGE_ALPHA_IGNORE);
   const bool is_tiled = (filename.find("<UDIM>") != string::npos ||
-                         filename.find("<UVTILE>") != string::npos);
+                         filename.find("<UVTILE>") != string::npos) ||
+                        handle.num_tiles() > 1;
 
   compiler.parameter(this, "projection");
   compiler.parameter(this, "projection_blend");
@@ -610,7 +606,7 @@ void EnvironmentTextureNode::compile(OSLCompiler &compiler)
         "filename", filename, compress_as_srgb ? u_colorspace_raw : known_colorspace);
   }
   else {
-    compiler.parameter_texture("filename", handle.svm_slot());
+    compiler.parameter_texture("filename", handle);
   }
 
   compiler.parameter(this, "projection");
@@ -965,7 +961,7 @@ void SkyTextureNode::compile(OSLCompiler &compiler)
   compiler.parameter_array("nishita_data", sunsky.nishita_data, 10);
   /* nishita texture */
   if (sky_type == NODE_SKY_NISHITA) {
-    compiler.parameter_texture("filename", handle.svm_slot());
+    compiler.parameter_texture("filename", handle);
   }
   compiler.add(this, "node_sky_texture");
 }
@@ -1860,7 +1856,7 @@ void PointDensityTextureNode::compile(OSLCompiler &compiler)
       handle = image_manager->add_image(filename.string(), image_params());
     }
 
-    compiler.parameter_texture("filename", handle.svm_slot());
+    compiler.parameter_texture("filename", handle);
     if (space == NODE_TEX_VOXEL_SPACE_WORLD) {
       compiler.parameter("mapping", tfm);
       compiler.parameter("use_mapping", 1);
@@ -5010,6 +5006,63 @@ void MixNode::constant_fold(const ConstantFolder &folder)
   }
 }
 
+/* Combine Color */
+
+NODE_DEFINE(CombineColorNode)
+{
+  NodeType *type = NodeType::add("combine_color", create, NodeType::SHADER);
+
+  static NodeEnum type_enum;
+  type_enum.insert("rgb", NODE_COMBSEP_COLOR_RGB);
+  type_enum.insert("hsv", NODE_COMBSEP_COLOR_HSV);
+  type_enum.insert("hsl", NODE_COMBSEP_COLOR_HSL);
+  SOCKET_ENUM(color_type, "Type", type_enum, NODE_COMBSEP_COLOR_RGB);
+
+  SOCKET_IN_FLOAT(r, "Red", 0.0f);
+  SOCKET_IN_FLOAT(g, "Green", 0.0f);
+  SOCKET_IN_FLOAT(b, "Blue", 0.0f);
+
+  SOCKET_OUT_COLOR(color, "Color");
+
+  return type;
+}
+
+CombineColorNode::CombineColorNode() : ShaderNode(get_node_type())
+{
+}
+
+void CombineColorNode::constant_fold(const ConstantFolder &folder)
+{
+  if (folder.all_inputs_constant()) {
+    folder.make_constant(svm_combine_color(color_type, make_float3(r, g, b)));
+  }
+}
+
+void CombineColorNode::compile(SVMCompiler &compiler)
+{
+  ShaderInput *red_in = input("Red");
+  ShaderInput *green_in = input("Green");
+  ShaderInput *blue_in = input("Blue");
+  ShaderOutput *color_out = output("Color");
+
+  int red_stack_offset = compiler.stack_assign(red_in);
+  int green_stack_offset = compiler.stack_assign(green_in);
+  int blue_stack_offset = compiler.stack_assign(blue_in);
+  int color_stack_offset = compiler.stack_assign(color_out);
+
+  compiler.add_node(
+      NODE_COMBINE_COLOR,
+      color_type,
+      compiler.encode_uchar4(red_stack_offset, green_stack_offset, blue_stack_offset),
+      color_stack_offset);
+}
+
+void CombineColorNode::compile(OSLCompiler &compiler)
+{
+  compiler.parameter(this, "color_type");
+  compiler.add(this, "node_combine_color");
+}
+
 /* Combine RGB */
 
 NODE_DEFINE(CombineRGBNode)
@@ -5250,6 +5303,70 @@ void BrightContrastNode::compile(OSLCompiler &compiler)
   compiler.add(this, "node_brightness");
 }
 
+/* Separate Color */
+
+NODE_DEFINE(SeparateColorNode)
+{
+  NodeType *type = NodeType::add("separate_color", create, NodeType::SHADER);
+
+  static NodeEnum type_enum;
+  type_enum.insert("rgb", NODE_COMBSEP_COLOR_RGB);
+  type_enum.insert("hsv", NODE_COMBSEP_COLOR_HSV);
+  type_enum.insert("hsl", NODE_COMBSEP_COLOR_HSL);
+  SOCKET_ENUM(color_type, "Type", type_enum, NODE_COMBSEP_COLOR_RGB);
+
+  SOCKET_IN_COLOR(color, "Color", zero_float3());
+
+  SOCKET_OUT_FLOAT(r, "Red");
+  SOCKET_OUT_FLOAT(g, "Green");
+  SOCKET_OUT_FLOAT(b, "Blue");
+
+  return type;
+}
+
+SeparateColorNode::SeparateColorNode() : ShaderNode(get_node_type())
+{
+}
+
+void SeparateColorNode::constant_fold(const ConstantFolder &folder)
+{
+  if (folder.all_inputs_constant()) {
+    float3 col = svm_separate_color(color_type, color);
+
+    for (int channel = 0; channel < 3; channel++) {
+      if (outputs[channel] == folder.output) {
+        folder.make_constant(col[channel]);
+        return;
+      }
+    }
+  }
+}
+
+void SeparateColorNode::compile(SVMCompiler &compiler)
+{
+  ShaderInput *color_in = input("Color");
+  ShaderOutput *red_out = output("Red");
+  ShaderOutput *green_out = output("Green");
+  ShaderOutput *blue_out = output("Blue");
+
+  int color_stack_offset = compiler.stack_assign(color_in);
+  int red_stack_offset = compiler.stack_assign(red_out);
+  int green_stack_offset = compiler.stack_assign(green_out);
+  int blue_stack_offset = compiler.stack_assign(blue_out);
+
+  compiler.add_node(
+      NODE_SEPARATE_COLOR,
+      color_type,
+      color_stack_offset,
+      compiler.encode_uchar4(red_stack_offset, green_stack_offset, blue_stack_offset));
+}
+
+void SeparateColorNode::compile(OSLCompiler &compiler)
+{
+  compiler.parameter(this, "color_type");
+  compiler.add(this, "node_separate_color");
+}
+
 /* Separate RGB */
 
 NODE_DEFINE(SeparateRGBNode)
diff --git a/intern/cycles/scene/shader_nodes.h b/intern/cycles/scene/shader_nodes.h
index 9aef5d3151f..ac40a397c1e 100644
--- a/intern/cycles/scene/shader_nodes.h
+++ b/intern/cycles/scene/shader_nodes.h
@@ -1101,6 +1101,17 @@ class MixNode : public ShaderNode {
   NODE_SOCKET_API(float, fac)
 };
 
+class CombineColorNode : public ShaderNode {
+ public:
+  SHADER_NODE_CLASS(CombineColorNode)
+  void constant_fold(const ConstantFolder &folder);
+
+  NODE_SOCKET_API(NodeCombSepColorType, color_type)
+  NODE_SOCKET_API(float, r)
+  NODE_SOCKET_API(float, g)
+  NODE_SOCKET_API(float, b)
+};
+
 class CombineRGBNode : public ShaderNode {
  public:
   SHADER_NODE_CLASS(CombineRGBNode)
@@ -1150,6 +1161,15 @@ class BrightContrastNode : public ShaderNode {
   NODE_SOCKET_API(float, contrast)
 };
 
+class SeparateColorNode : public ShaderNode {
+ public:
+  SHADER_NODE_CLASS(SeparateColorNode)
+  void constant_fold(const ConstantFolder &folder);
+
+  NODE_SOCKET_API(NodeCombSepColorType, color_type)
+  NODE_SOCKET_API(float3, color)
+};
+
 class SeparateRGBNode : public ShaderNode {
  public:
   SHADER_NODE_CLASS(SeparateRGBNode)
diff --git a/intern/cycles/util/color.h b/intern/cycles/util/color.h
index cccccde3ba6..795c3754976 100644
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -152,6 +152,56 @@ ccl_device float3 hsv_to_rgb(float3 hsv)
   return rgb;
 }
 
+ccl_device float3 rgb_to_hsl(float3 rgb)
+{
+  float cmax, cmin, h, s, l;
+
+  cmax = fmaxf(rgb.x, fmaxf(rgb.y, rgb.z));
+  cmin = min(rgb.x, min(rgb.y, rgb.z));
+  l = min(1.0f, (cmax + cmin) / 2.0f);
+
+  if (cmax == cmin) {
+    h = s = 0.0f; /* achromatic */
+  }
+  else {
+    float cdelta = cmax - cmin;
+    s = l > 0.5f ? cdelta / (2.0f - cmax - cmin) : cdelta / (cmax + cmin);
+    if (cmax == rgb.x) {
+      h = (rgb.y - rgb.z) / cdelta + (rgb.y < rgb.z ? 6.0f : 0.0f);
+    }
+    else if (cmax == rgb.y) {
+      h = (rgb.z - rgb.x) / cdelta + 2.0f;
+    }
+    else {
+      h = (rgb.x - rgb.y) / cdelta + 4.0f;
+    }
+  }
+  h /= 6.0f;
+
+  return make_float3(h, s, l);
+}
+
+ccl_device float3 hsl_to_rgb(float3 hsl)
+{
+  float nr, ng, nb, chroma, h, s, l;
+
+  h = hsl.x;
+  s = hsl.y;
+  l = hsl.z;
+
+  nr = fabsf(h * 6.0f - 3.0f) - 1.0f;
+  ng = 2.0f - fabsf(h * 6.0f - 2.0f);
+  nb = 2.0f - fabsf(h * 6.0f - 4.0f);
+
+  nr = clamp(nr, 0.0f, 1.0f);
+  nb = clamp(nb, 0.0f, 1.0f);
+  ng = clamp(ng, 0.0f, 1.0f);
+
+  chroma = (1.0f - fabsf(2.0f * l - 1.0f)) * s;
+
+  return make_float3((nr - 0.5f) * chroma + l, (ng - 0.5f) * chroma + l, (nb - 0.5f) * chroma + l);
+}
+
 ccl_device float3 xyY_to_xyz(float x, float y, float Y)
 {
   float X, Z;