12 files changed, 621 insertions, 160 deletions
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
index 01c021551f3..c9764d1c21b 100644
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -232,7 +232,7 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
   return cflags;
 }
 
-string CUDADevice::compile_kernel(const uint kernel_features,
+string CUDADevice::compile_kernel(const string &common_cflags,
                                   const char *name,
                                   const char *base,
                                   bool force_ptx)
@@ -281,7 +281,6 @@ string CUDADevice::compile_kernel(const uint kernel_features,
   /* We include cflags into md5 so changing cuda toolkit or changing other
    * compiler command line arguments makes sure cubin gets re-built.
    */
-  string common_cflags = compile_kernel_get_common_cflags(kernel_features);
   const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
 
   const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
@@ -417,7 +416,8 @@ bool CUDADevice::load_kernels(const uint kernel_features)
 
   /* get kernel */
   const char *kernel_name = "kernel";
-  string cubin = compile_kernel(kernel_features, kernel_name);
+  string cflags = compile_kernel_get_common_cflags(kernel_features);
+  string cubin = compile_kernel(cflags, kernel_name);
   if (cubin.empty())
     return false;
 
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
index a754c33f79d..c18f2811161 100644
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -77,9 +77,9 @@ class CUDADevice : public Device {
 
   bool use_adaptive_compilation();
 
-  virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+  string compile_kernel_get_common_cflags(const uint kernel_features);
 
-  string compile_kernel(const uint kernel_features,
+  string compile_kernel(const string &cflags,
                         const char *name,
                         const char *base = "cuda",
                         bool force_ptx = false);
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2e4d18241cf..06a2f5c7b01 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -160,6 +160,11 @@ class Device {
     return true;
   }
 
+  virtual bool load_osl_kernels()
+  {
+    return true;
+  }
+
   /* GPU device only functions.
    * These may not be used on CPU or multi-devices. */
 
diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h
index 9afef3789af..efdc15dca79 100644
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -74,7 +74,7 @@ class HIPDevice : public Device {
 
   bool use_adaptive_compilation();
 
-  virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+  string compile_kernel_get_common_cflags(const uint kernel_features);
 
   string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip");
 
diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp
index 96a99cd62cd..27ca0d81817 100644
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -7,6 +7,30 @@
 
 CCL_NAMESPACE_BEGIN
 
+bool device_kernel_has_shading(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY);
+}
+
+bool device_kernel_has_intersection(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
+}
+
 const char *device_kernel_as_string(DeviceKernel kernel)
 {
   switch (kernel) {
diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h
index 4ae461f1f67..b829a891260 100644
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -11,6 +11,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+bool device_kernel_has_shading(DeviceKernel kernel);
+bool device_kernel_has_intersection(DeviceKernel kernel);
+
 const char *device_kernel_as_string(DeviceKernel kernel);
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
 
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 55938d1a03a..35cf832c537 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
 struct ShaderCache {
   ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
   {
+    /* Initialize occupancy tuning LUT. */
+    if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
+      switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
+        default:
+        case APPLE_M2:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
+          break;
+        case APPLE_M1:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
+          break;
+      }
+    }
   }
   ~ShaderCache();
 
@@ -73,6 +103,11 @@ struct ShaderCache {
     std::function<void(MetalKernelPipeline *)> completionHandler;
   };
 
+  struct OccupancyTuningParameters {
+    int threads_per_threadgroup = 0;
+    int num_threads_per_block = 0;
+  } occupancy_tuning[DEVICE_KERNEL_NUM];
+
   std::mutex cache_mutex;
 
   PipelineCollection pipelines[DEVICE_KERNEL_NUM];
@@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
   request.pipeline->device_kernel = device_kernel;
   request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
 
+  if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
+    request.pipeline->threads_per_threadgroup =
+        occupancy_tuning[device_kernel].threads_per_threadgroup;
+    request.pipeline->num_threads_per_block =
+        occupancy_tuning[device_kernel].num_threads_per_block;
+  }
+
   /* metalrt options */
   request.pipeline->use_metalrt = device->use_metalrt;
   request.pipeline->metalrt_hair = device->use_metalrt &&
@@ -374,13 +416,6 @@ void MetalKernelPipeline::compile()
   const std::string function_name = std::string("cycles_metal_") +
                                     device_kernel_as_string(device_kernel);
 
-  int threads_per_threadgroup = this->threads_per_threadgroup;
-  if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
-      device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
-    /* Always use 512 for the sorting kernels */
-    threads_per_threadgroup = 512;
-  }
-
   NSString *entryPoint = [@(function_name.c_str()) copy];
 
   NSError *error = NULL;
@@ -583,7 +618,9 @@ void MetalKernelPipeline::compile()
     metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
     path_create_directories(metalbin_path);
 
-    if (path_exists(metalbin_path) && use_binary_archive) {
+    /* Retrieve shader binary from disk, and update the file timestamp for LRU purging to work as
+     * intended. */
+    if (use_binary_archive && path_cache_kernel_exists_and_mark_used(metalbin_path)) {
       if (@available(macOS 11.0, *)) {
         MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
         archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
@@ -644,12 +681,14 @@ void MetalKernelPipeline::compile()
       return;
     }
 
-    int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
-                                           computePipelineState.threadExecutionWidth);
-    num_threads_per_block = std::max(num_threads_per_block,
-                                     (int)computePipelineState.threadExecutionWidth);
+    if (!num_threads_per_block) {
+      num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+                                         computePipelineState.threadExecutionWidth);
+      num_threads_per_block = std::max(num_threads_per_block,
+                                       (int)computePipelineState.threadExecutionWidth);
+    }
+
     this->pipeline = computePipelineState;
-    this->num_threads_per_block = num_threads_per_block;
 
     if (@available(macOS 11.0, *)) {
       if (creating_new_archive || recreate_archive) {
@@ -658,6 +697,9 @@ void MetalKernelPipeline::compile()
           metal_printf("Failed to save binary archive, error:\n%s\n",
                        [[error localizedDescription] UTF8String]);
         }
+        else {
+          path_cache_kernel_mark_added_and_clear_old(metalbin_path);
+        }
       }
     }
   };
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 6904d2c2dc6..9605c6a7538 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -138,6 +138,15 @@ class MultiDevice : public Device {
     return true;
   }
 
+  bool load_osl_kernels() override
+  {
+    foreach (SubDevice &sub, devices)
+      if (!sub.device->load_osl_kernels())
+        return false;
+
+    return true;
+  }
+
   void build_bvh(BVH *bvh, Progress &progress, bool refit) override
   {
     /* Try to build and share a single acceleration structure, if possible */
@@ -204,10 +213,12 @@ class MultiDevice : public Device {
 
   virtual void *get_cpu_osl_memory() override
   {
-    if (devices.size() > 1) {
+    /* Always return the OSL memory of the CPU device (this works since the constructor above
+     * guarantees that CPU devices are always added to the back). */
+    if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) {
       return NULL;
     }
-    return devices.front().device->get_cpu_osl_memory();
+    return devices.back().device->get_cpu_osl_memory();
   }
 
   bool is_resident(device_ptr key, Device *sub_device) override
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
index 68ca21374fd..58b72374a7d 100644
--- a/intern/cycles/device/optix/device.cpp
+++ b/intern/cycles/device/optix/device.cpp
@@ -9,6 +9,10 @@
 
 #include "util/log.h"
 
+#ifdef WITH_OSL
+#  include <OSL/oslversion.h>
+#endif
+
 #ifdef WITH_OPTIX
 #  include <optix_function_table_definition.h>
 #endif
@@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo
 
     info.type = DEVICE_OPTIX;
     info.id += "_OptiX";
+#  if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1)
+    info.has_osl = true;
+#  endif
     info.denoisers |= DENOISER_OPTIX;
 
     devices.push_back(info);
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 6c64e7106d5..02f34bf3bd0 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -312,16 +312,34 @@ OptiXDevice::~OptiXDevice()
   if (optix_module != NULL) {
     optixModuleDestroy(optix_module);
   }
-  for (unsigned int i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     if (builtin_modules[i] != NULL) {
       optixModuleDestroy(builtin_modules[i]);
     }
   }
-  for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+  for (int i = 0; i < NUM_PIPELINES; ++i) {
     if (pipelines[i] != NULL) {
       optixPipelineDestroy(pipelines[i]);
     }
   }
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    if (groups[i] != NULL) {
+      optixProgramGroupDestroy(groups[i]);
+    }
+  }
+
+#  ifdef WITH_OSL
+  for (const OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+    }
+  }
+  for (const OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+    }
+  }
+#  endif
 
   /* Make sure denoiser is destroyed before device context! */
   if (denoiser_.optix_denoiser != nullptr) {
@@ -381,13 +399,51 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     return false;
   }
 
+#  ifdef WITH_OSL
+  const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL);
+#  else
+  const bool use_osl = false;
+#  endif
+
+  /* Skip creating OptiX module if only doing denoising. */
+  const bool need_optix_kernels = (kernel_features &
+                                   (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING));
+
+  /* Detect existence of OptiX kernel and SDK here early, so we can error out
+   * before compiling the CUDA kernels, to avoid failing right afterwards when
+   * compiling the OptiX kernel. */
+  string suffix = use_osl ? "_osl" :
+                  (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
+                            "_shader_raytrace" :
+                            "";
+  string ptx_filename;
+  if (need_optix_kernels) {
+    ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx");
+    if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+      std::string optix_include_dir = get_optix_include_dir();
+      if (optix_include_dir.empty()) {
+        set_error(
+            "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
+            "to a directory containing the OptiX SDK.");
+        return false;
+      }
+      else if (!path_is_directory(optix_include_dir)) {
+        set_error(string_printf(
+            "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
+            "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
+            "directory containing the OptiX SDK.",
+            optix_include_dir.c_str()));
+        return false;
+      }
+    }
+  }
+
   /* Load CUDA modules because we need some of the utility kernels. */
   if (!CUDADevice::load_kernels(kernel_features)) {
     return false;
   }
 
-  /* Skip creating OptiX module if only doing denoising. */
-  if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+  if (!need_optix_kernels) {
     return true;
   }
 
@@ -398,18 +454,41 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     optixModuleDestroy(optix_module);
     optix_module = NULL;
   }
-  for (unsigned int i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     if (builtin_modules[i] != NULL) {
       optixModuleDestroy(builtin_modules[i]);
       builtin_modules[i] = NULL;
     }
   }
-  for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+  for (int i = 0; i < NUM_PIPELINES; ++i) {
     if (pipelines[i] != NULL) {
       optixPipelineDestroy(pipelines[i]);
       pipelines[i] = NULL;
     }
   }
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    if (groups[i] != NULL) {
+      optixProgramGroupDestroy(groups[i]);
+      groups[i] = NULL;
+    }
+  }
+
+#  ifdef WITH_OSL
+  /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
+  for (const OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+    }
+  }
+  osl_modules.clear();
+
+  for (const OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+    }
+  }
+  osl_groups.clear();
+#  endif
 
   OptixModuleCompileOptions module_options = {};
   module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
@@ -430,7 +509,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   module_options.numPayloadTypes = 0;
 #  endif
 
-  OptixPipelineCompileOptions pipeline_options = {};
   /* Default to no motion blur and two-level graph, since it is the fastest option. */
   pipeline_options.usesMotionBlur = false;
   pipeline_options.traversableGraphFlags =
@@ -459,9 +537,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
    * This is necessary since objects may be reported to have motion if the Vector pass is
    * active, but may still need to be rendered without motion blur if that isn't active as well. */
-  motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
-
-  if (motion_blur) {
+  if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
     pipeline_options.usesMotionBlur = true;
     /* Motion blur can insert motion transforms into the traversal graph.
      * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
@@ -469,33 +545,10 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   }
 
   { /* Load and compile PTX module with OptiX kernels. */
-    string ptx_data, ptx_filename = path_get(
-                         (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
-                             "lib/kernel_optix_shader_raytrace.ptx" :
-                             "lib/kernel_optix.ptx");
+    string ptx_data;
     if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
-      std::string optix_include_dir = get_optix_include_dir();
-      if (optix_include_dir.empty()) {
-        set_error(
-            "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
-            "to a directory containing the OptiX SDK.");
-        return false;
-      }
-      else if (!path_is_directory(optix_include_dir)) {
-        set_error(string_printf(
-            "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
-            "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
-            "directory containing the OptiX SDK.",
-            optix_include_dir.c_str()));
-        return false;
-      }
-      ptx_filename = compile_kernel(
-          kernel_features,
-          (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
-              "kernel_shader_raytrace" :
-              "kernel",
-          "optix",
-          true);
+      string cflags = compile_kernel_get_common_cflags(kernel_features);
+      ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
     }
     if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
       set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
@@ -537,7 +590,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   }
 
   /* Create program groups. */
-  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
   group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
@@ -595,7 +647,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
       group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
 
-      if (motion_blur) {
+      if (pipeline_options.usesMotionBlur) {
         builtin_options.usesMotionBlur = true;
 
         optix_assert(optixBuiltinISModuleGet(
@@ -616,7 +668,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     }
   }
 
-  /* Pointclouds */
   if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
     group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
     group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
@@ -628,8 +679,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
   }
 
+  /* Add hit group for local intersections. */
   if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
-    /* Add hit group for local intersections. */
     group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
     group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
     group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
@@ -641,16 +692,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
     group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
         "__raygen__kernel_optix_integrator_shade_surface_raytrace";
-    group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
-    group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
-    group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
-    group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
-        "__direct_callable__svm_node_bevel";
+
+    /* Kernels with OSL support are built without SVM, so those direct callables are not needed. */
+    if (!use_osl) {
+      group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+      group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+      group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+      group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+          "__direct_callable__svm_node_bevel";
+    }
   }
 
-  /* MNEE. */
   if (kernel_features & KERNEL_FEATURE_MNEE) {
     group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
     group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
@@ -658,6 +712,42 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
         "__raygen__kernel_optix_integrator_shade_surface_mnee";
   }
 
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  if (use_osl) {
+    group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_background";
+    group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_light";
+    group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_surface";
+    group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_volume";
+    group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_shadow";
+    group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_displace";
+    group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_background";
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
+  }
+
   optix_assert(optixProgramGroupCreate(
       context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
 
@@ -666,7 +756,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Set up SBT, which in this case is used only to select between different programs. */
   sbt_data.alloc(NUM_PROGRAM_GROUPS);
   memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
-  for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
     optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
     optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
   }
@@ -690,25 +780,26 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
 
   OptixPipelineLinkOptions link_options = {};
   link_options.maxTraceDepth = 1;
+  link_options.debugLevel = module_options.debugLevel;
 
-  if (DebugFlags().optix.use_debug) {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
-  }
-  else {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
-  }
-
-  if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
-    /* Create shader raytracing pipeline. */
+  if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE) && !use_osl) {
+    /* Create shader raytracing and MNEE pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+    if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+      pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+      pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+    }
+    if (kernel_features & KERNEL_FEATURE_MNEE) {
+      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    }
     pipeline_groups.push_back(groups[PG_MISS]);
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
     pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
@@ -716,8 +807,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
       pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
     }
-    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
-    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
 
     optix_assert(optixPipelineCreate(context,
                                      &pipeline_options,
@@ -726,30 +815,33 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_SHADE_RAYTRACE]));
+                                     &pipelines[PIP_SHADE]));
 
     /* Combine ray generation and trace continuation stack size. */
-    const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
+                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
                              link_options.maxTraceDepth * trace_css;
     const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
                                       stack_size[PG_CALL_SVM_BEVEL].dssDC);
 
     /* Set stack size depending on pipeline options. */
     optix_assert(optixPipelineSetStackSize(
-        pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  if (kernel_features & KERNEL_FEATURE_MNEE) {
-    /* Create MNEE pipeline. */
+  { /* Create intersection-only pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
     pipeline_groups.push_back(groups[PG_MISS]);
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
     pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
@@ -757,8 +849,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
       pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
     }
-    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
-    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
 
     optix_assert(optixPipelineCreate(context,
                                      &pipeline_options,
@@ -767,37 +857,234 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_SHADE_MNEE]));
+                                     &pipelines[PIP_INTERSECT]));
 
-    /* Combine ray generation and trace continuation stack size. */
-    const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG +
-                             link_options.maxTraceDepth * trace_css;
-    const unsigned int dss = 0;
+    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+    const unsigned int css =
+        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+        link_options.maxTraceDepth * trace_css;
 
-    /* Set stack size depending on pipeline options. */
-    optix_assert(
-        optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2));
+    optix_assert(optixPipelineSetStackSize(
+        pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  { /* Create intersection-only pipeline. */
+  return !have_error();
+}
+
+bool OptiXDevice::load_osl_kernels()
+{
+#  ifdef WITH_OSL
+  if (have_error()) {
+    return false;
+  }
+
+  struct OSLKernel {
+    string ptx;        /* "ptx_compiled_version" attribute of the shader group. */
+    string init_entry; /* "group_init_name" attribute of the shader group. */
+    string exec_entry; /* "group_entry_name" attribute of the shader group. */
+  };
+
+  /* Entries are appended per shader type in ShaderType enum order (one per shader group, empty
+   * if the group does not exist), which the index calculation in osl_eval_nodes relies on. */
+  vector<OSLKernel> osl_kernels;
+
+  for (ShaderType type = SHADER_TYPE_SURFACE; type <= SHADER_TYPE_BUMP;
+       type = static_cast<ShaderType>(type + 1)) {
+    const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ?
+                                                     osl_globals.surface_state :
+                                                 type == SHADER_TYPE_VOLUME ?
+                                                     osl_globals.volume_state :
+                                                 type == SHADER_TYPE_DISPLACEMENT ?
+                                                     osl_globals.displacement_state :
+                                                     osl_globals.bump_state);
+    for (const OSL::ShaderGroupRef &group : groups) {
+      if (group) {
+        string osl_ptx, init_name, entry_name;
+        osl_globals.ss->getattribute(group.get(), "group_init_name", init_name);
+        osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name);
+        osl_globals.ss->getattribute(
+            group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
+
+        int groupdata_size = 0;
+        osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
+        if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
+          set_error(
+              string_printf("Requested OSL group data size (%d) is greater than the maximum "
+                            "supported with OptiX (2048)",
+                            groupdata_size));
+          return false;
+        }
+
+        osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)});
+      }
+      else {
+        /* Add empty entry for non-existent shader groups, so that the index stays stable. */
+        osl_kernels.emplace_back();
+      }
+    }
+  }
+
+  const CUDAContextScope scope(this);
+
+  if (pipelines[PIP_SHADE]) { /* Drop the previous shading pipeline, it is re-created below. */
+    optixPipelineDestroy(pipelines[PIP_SHADE]);
+  } /* NOTE(review): handle is not cleared here, so the early returns below leave it stale. */
+
+  for (OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+      module = NULL;
+    }
+  }
+  for (OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+      group = NULL;
+    }
+  }
+
+  OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+  OptixModuleCompileOptions module_options = {};
+  module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+  module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+  osl_groups.resize(osl_kernels.size() * 2 + 1); /* Init + exec group per kernel, services last. */
+  osl_modules.resize(osl_kernels.size() + 1);    /* One module per kernel, services last. */
+
+  { /* Load and compile PTX module with OSL services. */
+    string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx");
+    if (!path_read_text(ptx_filename, ptx_data)) {
+      set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
+                              ptx_filename.c_str()));
+      return false;
+    }
+
+    const OptixResult result = optixModuleCreateFromPTX(context,
+                                                        &module_options,
+                                                        &pipeline_options,
+                                                        ptx_data.data(),
+                                                        ptx_data.size(),
+                                                        nullptr,
+                                                        0,
+                                                        &osl_modules.back());
+    if (result != OPTIX_SUCCESS) {
+      set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
+                              ptx_filename.c_str(),
+                              optixGetErrorName(result)));
+      return false;
+    }
+
+    OptixProgramGroupDesc group_desc = {};
+    group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
+    group_desc.callables.moduleDC = osl_modules.back();
+
+    optix_assert(optixProgramGroupCreate(
+        context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back()));
+  }
+
+  TaskPool pool;
+  vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
+
+  for (size_t i = 0; i < osl_kernels.size(); ++i) {
+    if (osl_kernels[i].ptx.empty()) { /* Nothing to compile (e.g. placeholder entry). */
+      continue;
+    }
+
+#    if OPTIX_ABI_VERSION >= 55
+    OptixTask task = nullptr;
+    results[i] = optixModuleCreateFromPTXWithTasks(context,
+                                                   &module_options,
+                                                   &pipeline_options,
+                                                   osl_kernels[i].ptx.data(),
+                                                   osl_kernels[i].ptx.size(),
+                                                   nullptr,
+                                                   nullptr,
+                                                   &osl_modules[i],
+                                                   &task);
+    if (results[i] == OPTIX_SUCCESS) {
+      execute_optix_task(pool, task, results[i]);
+    }
+#    else
+    pool.push([this, &results, i, &module_options, &osl_kernels]() {
+      results[i] = optixModuleCreateFromPTX(context,
+                                            &module_options,
+                                            &pipeline_options,
+                                            osl_kernels[i].ptx.data(),
+                                            osl_kernels[i].ptx.size(),
+                                            nullptr,
+                                            0,
+                                            &osl_modules[i]);
+    });
+#    endif
+  }
+
+  pool.wait_work(); /* All module compilations must finish before results are checked. */
+
+  for (size_t i = 0; i < osl_kernels.size(); ++i) {
+    if (osl_kernels[i].ptx.empty()) {
+      continue;
+    }
+
+    if (results[i] != OPTIX_SUCCESS) {
+      set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
+                              osl_kernels[i].init_entry.c_str(),
+                              optixGetErrorName(results[i])));
+      return false;
+    }
+
+    OptixProgramGroupDesc group_descs[2] = {};
+    group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str();
+    group_descs[0].callables.moduleDC = osl_modules[i];
+    group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str();
+    group_descs[1].callables.moduleDC = osl_modules[i];
+
+    optix_assert(optixProgramGroupCreate(
+        context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
+  }
+
+  vector<OptixStackSizes> osl_stack_size(osl_groups.size());
+
+  /* Update SBT with new entries. */
+  sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+  }
+  for (size_t i = 0; i < osl_groups.size(); ++i) {
+    if (osl_groups[i] != NULL) {
+      optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
+      optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i]));
+    }
+  }
+  sbt_data.copy_to_device(); /* Upload updated SBT to device. */
+
+  OptixPipelineLinkOptions link_options = {};
+  link_options.maxTraceDepth = 0;
+  link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+  { /* Create shading pipeline from the ray generation and OSL callable groups. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
-    pipeline_groups.push_back(groups[PG_MISS]);
-    pipeline_groups.push_back(groups[PG_HITD]);
-    pipeline_groups.push_back(groups[PG_HITS]);
-    pipeline_groups.push_back(groups[PG_HITL]);
-    pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
-      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
-      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
-    }
-    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
-      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
-      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
+
+    for (const OptixProgramGroup &group : osl_groups) {
+      if (group != NULL) {
+        pipeline_groups.push_back(group);
+      }
     }
 
     optix_assert(optixPipelineCreate(context,
@@ -807,26 +1094,30 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_INTERSECT]));
+                                     &pipelines[PIP_SHADE]));
 
-    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
-    const unsigned int css =
-        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
-                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
-                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
-                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
-        link_options.maxTraceDepth * trace_css;
+    unsigned int dss = 0; /* Direct callable stack size: maximum over all OSL groups. */
+    for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
+      dss = std::max(dss, osl_stack_size[i].dssDC);
+    }
 
-    optix_assert(
-        optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+    optix_assert(optixPipelineSetStackSize(
+        pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  /* Clean up program group objects. */
-  for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
-    optixProgramGroupDestroy(groups[i]);
-  }
+  return !have_error();
+#  else
+  return false;
+#  endif
+}
 
-  return true;
+void *OptiXDevice::get_cpu_osl_memory()
+{
+#  ifdef WITH_OSL
+  return &osl_globals; /* OSLGlobals member owned by this device. */
+#  else
+  return NULL; /* Built without OSL support. */
+#  endif
 }
 
 /* --------------------------------------------------------------------
@@ -1553,7 +1844,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+      if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
         num_motion_steps = hair->get_motion_steps();
       }
 
@@ -1707,7 +1998,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+      if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
         num_motion_steps = mesh->get_motion_steps();
       }
 
@@ -1774,7 +2065,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) {
+      if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
         num_motion_steps = pointcloud->get_motion_steps();
       }
 
@@ -1871,7 +2162,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
     /* Calculate total motion transform size and allocate memory for them. */
     size_t motion_transform_offset = 0;
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       size_t total_motion_transform_size = 0;
       for (Object *const ob : bvh->objects) {
         if (ob->is_traceable() && ob->use_motion()) {
@@ -1922,7 +2213,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
           static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
-        if (motion_blur && ob->get_geometry()->has_motion_blur()) {
+        if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
           /* Select between motion blur and non-motion blur built-in intersection module. */
           instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
         }
@@ -1950,7 +2241,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       }
 
       /* Insert motion traversable if object has motion. */
-      if (motion_blur && ob->use_motion()) {
+      if (pipeline_options.usesMotionBlur && ob->use_motion()) {
         size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
         size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
                                        motion_keys * sizeof(OptixSRTData);
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 817afdc8384..ad0e7b93454 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -9,6 +9,7 @@
 #  include "device/cuda/device_impl.h"
 #  include "device/optix/queue.h"
 #  include "device/optix/util.h"
+#  include "kernel/osl/globals.h"
 #  include "kernel/types.h"
 #  include "util/unique_ptr.h"
 
@@ -23,8 +24,16 @@ enum {
   PG_RGEN_INTERSECT_SHADOW,
   PG_RGEN_INTERSECT_SUBSURFACE,
   PG_RGEN_INTERSECT_VOLUME_STACK,
+  PG_RGEN_SHADE_BACKGROUND,
+  PG_RGEN_SHADE_LIGHT,
+  PG_RGEN_SHADE_SURFACE,
   PG_RGEN_SHADE_SURFACE_RAYTRACE,
   PG_RGEN_SHADE_SURFACE_MNEE,
+  PG_RGEN_SHADE_VOLUME,
+  PG_RGEN_SHADE_SHADOW,
+  PG_RGEN_EVAL_DISPLACE,
+  PG_RGEN_EVAL_BACKGROUND,
+  PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY,
   PG_MISS,
   PG_HITD, /* Default hit group. */
   PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@@ -40,14 +49,14 @@ enum {
 };
 
 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
-static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
 static const int NUM_HIT_PROGRAM_GROUPS = 8;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;
 
 /* List of OptiX pipelines. */
-enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES };
 
 /* A single shader binding table entry. */
 struct SbtRecord {
@@ -61,12 +70,20 @@ class OptiXDevice : public CUDADevice {
   OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
   OptixModule builtin_modules[2] = {};
   OptixPipeline pipelines[NUM_PIPELINES] = {};
+  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {}; /* Also read by load_osl_kernels(). */
+  OptixPipelineCompileOptions pipeline_options = {}; /* usesMotionBlur replaces motion_blur. */
 
-  bool motion_blur = false;
   device_vector<SbtRecord> sbt_data;
   device_only_memory<KernelParamsOptiX> launch_params;
-  OptixTraversableHandle tlas_handle = 0;
 
+#  ifdef WITH_OSL
+  OSLGlobals osl_globals;
+  vector<OptixModule> osl_modules;      /* One per OSL kernel, services module last. */
+  vector<OptixProgramGroup> osl_groups; /* Init + exec per OSL kernel, services group last. */
+#  endif
+
+ private:
+  OptixTraversableHandle tlas_handle = 0;
   vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
   thread_mutex delayed_free_bvh_mutex;
 
@@ -100,13 +117,14 @@ class OptiXDevice : public CUDADevice {
   OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
   ~OptiXDevice();
 
- private:
   BVHLayoutMask get_bvh_layout_mask() const override;
 
-  string compile_kernel_get_common_cflags(const uint kernel_features) override;
+  string compile_kernel_get_common_cflags(const uint kernel_features);
 
   bool load_kernels(const uint kernel_features) override;
 
+  bool load_osl_kernels() override;
+
   bool build_optix_bvh(BVHOptiX *bvh,
                        OptixBuildOperation operation,
                        const OptixBuildInput &build_input,
@@ -123,6 +141,8 @@ class OptiXDevice : public CUDADevice {
 
   virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
 
+  void *get_cpu_osl_memory() override;
+
   /* --------------------------------------------------------------------
    * Denoising.
    */
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 3bc547ed11d..1bfd154d449 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution()
   CUDADeviceQueue::init_execution();
 }
 
-static bool is_optix_specific_kernel(DeviceKernel kernel)
+static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl)
 {
-  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+#  ifdef WITH_OSL
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  if (use_osl && device_kernel_has_shading(kernel)) {
+    return true;
+  }
+#  else
+  (void)use_osl; /* Unused when built without OSL. */
+#  endif
+
+  return device_kernel_has_intersection(kernel); /* Intersection kernels always use OptiX. */
 }
 
 bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                                const int work_size,
                                DeviceKernelArguments const &args)
 {
-  if (!is_optix_specific_kernel(kernel)) {
+  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+#  ifdef WITH_OSL
+  const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use;
+#  else
+  const bool use_osl = false;
+#  endif
+
+  if (!is_optix_specific_kernel(kernel, use_osl)) {
     return CUDADeviceQueue::enqueue(kernel, work_size, args);
   }
 
@@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
 
   const CUDAContextScope scope(cuda_device_);
 
-  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
-
   const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
   const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
 
@@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                         sizeof(device_ptr),
                         cuda_stream_));
 
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) {
     cuda_device_assert(
         cuda_device_,
         cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                           sizeof(device_ptr),
                           cuda_stream_));
   }
+  if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) {
+    cuda_device_assert(cuda_device_,
+                       cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset),
+                                         args.values[2],  // &d_offset
+                                         sizeof(int32_t),
+                                         cuda_stream_));
+  }
 
   cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
 
@@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
   OptixShaderBindingTable sbt_params = {};
 
   switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord);
+      break;
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
-      pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
       break;
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
-      pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
       break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord);
+      break;
+
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
       pipeline = optix_device->pipelines[PIP_INTERSECT];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
       break;
 
+    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr +
+                                PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord);
+      break;
+
     default:
       LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
                  << " is attempted to be enqueued.";
@@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
 
   sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
   sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
-  sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+  sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
   sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
   sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
   sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
@@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
   sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
   sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
 
+#  ifdef WITH_OSL
+  if (use_osl) {
+    sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size());
+  }
+#  endif
+
   /* Launch the ray generation program. */
   optix_device_assert(optix_device,
                       optixLaunch(pipeline,