43 files changed, 3478 insertions, 564 deletions
diff --git a/build_files/cmake/platform/platform_win32.cmake b/build_files/cmake/platform/platform_win32.cmake
index 7a2d3ad948a..47673794652 100644
--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -419,7 +419,7 @@ if(WITH_IMAGE_OPENEXR)
     warn_hardcoded_paths(OpenEXR)
     set(OPENEXR ${LIBDIR}/openexr)
     set(OPENEXR_INCLUDE_DIR ${OPENEXR}/include)
-    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR}/include/OpenEXR)
+    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR_INCLUDE_DIR}/OpenEXR)
     set(OPENEXR_LIBPATH ${OPENEXR}/lib)
     # Check if the 3.x library name exists
     # if not assume this is a 2.x library folder
@@ -568,7 +568,8 @@ if(WITH_OPENIMAGEIO)
   if(NOT OpenImageIO_FOUND)
     set(OPENIMAGEIO ${LIBDIR}/OpenImageIO)
     set(OPENIMAGEIO_LIBPATH ${OPENIMAGEIO}/lib)
-    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR})
     set(OIIO_OPTIMIZED optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO.lib optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util.lib)
     set(OIIO_DEBUG debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_d.lib debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util_d.lib)
     set(OPENIMAGEIO_LIBRARIES ${OIIO_OPTIMIZED} ${OIIO_DEBUG})
@@ -785,6 +786,21 @@ if(WITH_CYCLES AND WITH_CYCLES_OSL)
   endif()
   find_path(OSL_INCLUDE_DIR OSL/oslclosure.h PATHS ${CYCLES_OSL}/include)
   find_program(OSL_COMPILER NAMES oslc PATHS ${CYCLES_OSL}/bin)
+  # Read the OSL major/minor version numbers from the defines in oslversion.h.
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MAJOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+[0-9]+.*$")
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MINOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+[0-9]+.*$")
+  # The inputs are quoted: an unquoted empty expansion would vanish from the argument
+  # list and string(REGEX REPLACE) would abort with an unrelated argument-count error.
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MAJOR "${OSL_LIBRARY_VERSION_MAJOR}")
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MINOR "${OSL_LIBRARY_VERSION_MINOR}")
+  if("${OSL_LIBRARY_VERSION_MAJOR}" STREQUAL "" OR "${OSL_LIBRARY_VERSION_MINOR}" STREQUAL "")
+    message(FATAL_ERROR
+      "Unable to parse the OSL version from \"${OSL_INCLUDE_DIR}/OSL/oslversion.h\"")
+  endif()
 endif()
 
 if(WITH_CYCLES AND WITH_CYCLES_EMBREE)
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 05f27bdbd4d..354c9c23a53 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -58,7 +58,7 @@ class CyclesRender(bpy.types.RenderEngine):
         if not self.session:
             if self.is_preview:
                 cscene = bpy.context.scene.cycles
-                use_osl = cscene.shading_system and cscene.device == 'CPU'
+                use_osl = cscene.shading_system
 
                 engine.create(self, data, preview_osl=use_osl)
             else:
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e33891fa7a2..83dc6332f47 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -155,6 +155,12 @@ def with_osl():
     import _cycles
     return _cycles.with_osl
 
+
+def osl_version():
+    """Return the ``osl_version`` attribute exposed by the ``_cycles`` module."""
+    import _cycles
+    return _cycles.osl_version
+
 
 def with_path_guiding():
     import _cycles
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index f5cd88f6b6a..9d7c71417f2 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -290,7 +290,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     )
     shading_system: BoolProperty(
         name="Open Shading Language",
-        description="Use Open Shading Language (CPU rendering only)",
+        description="Use Open Shading Language",
     )
 
     preview_pause: BoolProperty(
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 305accc8f1a..11fa2bc62fb 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -2305,7 +2305,7 @@ def draw_device(self, context):
         col.prop(cscene, "device")
 
         from . import engine
-        if engine.with_osl() and use_cpu(context):
+        if engine.with_osl() and (use_cpu(context) or (use_optix(context) and (engine.osl_version()[1] >= 13 or engine.osl_version()[0] > 1))):
             col.prop(cscene, "shading_system")
 
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2e4d18241cf..06a2f5c7b01 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -160,6 +160,11 @@ class Device {
     return true;
   }
 
+  virtual bool load_osl_kernels()
+  {
+    return true;
+  }
+
   /* GPU device only functions.
    * These may not be used on CPU or multi-devices. */
 
diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp
index 96a99cd62cd..27ca0d81817 100644
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -7,6 +7,44 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Return true when `kernel` is one of the integrator shade kernels or one of the shader
+ * evaluation kernels listed below, false for every other kernel. */
+bool device_kernel_has_shading(DeviceKernel kernel)
+{
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+      return true;
+    default:
+      return false;
+  }
+}
+
+/* Return true when `kernel` is one of the integrator intersect kernels, or the
+ * SHADE_SURFACE_RAYTRACE / SHADE_SURFACE_MNEE kernel, false for every other kernel. */
+bool device_kernel_has_intersection(DeviceKernel kernel)
+{
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+      return true;
+    default:
+      return false;
+  }
+}
+
 const char *device_kernel_as_string(DeviceKernel kernel)
 {
   switch (kernel) {
diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h
index 4ae461f1f67..b829a891260 100644
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -11,6 +11,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+bool device_kernel_has_shading(DeviceKernel kernel);
+bool device_kernel_has_intersection(DeviceKernel kernel);
+
 const char *device_kernel_as_string(DeviceKernel kernel);
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
 
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 6904d2c2dc6..9605c6a7538 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -138,6 +138,19 @@ class MultiDevice : public Device {
     return true;
   }
 
+  /* Load the OSL kernels of every sub-device. Stops and reports failure as soon as one
+   * sub-device fails to load them. */
+  bool load_osl_kernels() override
+  {
+    foreach (SubDevice &sub, devices) {
+      if (!sub.device->load_osl_kernels()) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   void build_bvh(BVH *bvh, Progress &progress, bool refit) override
   {
     /* Try to build and share a single acceleration structure, if possible */
@@ -204,10 +217,12 @@ class MultiDevice : public Device {
 
   virtual void *get_cpu_osl_memory() override
   {
-    if (devices.size() > 1) {
+    /* Always return the OSL memory of the CPU device (this works since the constructor above
+     * guarantees that CPU devices are always added to the back). */
+    if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) {
       return NULL;
     }
-    return devices.front().device->get_cpu_osl_memory();
+    return devices.back().device->get_cpu_osl_memory();
   }
 
   bool is_resident(device_ptr key, Device *sub_device) override
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
index 68ca21374fd..58b72374a7d 100644
--- a/intern/cycles/device/optix/device.cpp
+++ b/intern/cycles/device/optix/device.cpp
@@ -9,6 +9,10 @@
 
 #include "util/log.h"
 
+#ifdef WITH_OSL
+#  include <OSL/oslversion.h>
+#endif
+
 #ifdef WITH_OPTIX
 #  include <optix_function_table_definition.h>
 #endif
@@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo
 
     info.type = DEVICE_OPTIX;
     info.id += "_OptiX";
+#  if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1)
+    info.has_osl = true;
+#  endif
     info.denoisers |= DENOISER_OPTIX;
 
     devices.push_back(info);
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index fabf4d7b69d..02f34bf3bd0 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -312,16 +312,34 @@ OptiXDevice::~OptiXDevice()
   if (optix_module != NULL) {
     optixModuleDestroy(optix_module);
   }
-  for (unsigned int i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     if (builtin_modules[i] != NULL) {
       optixModuleDestroy(builtin_modules[i]);
     }
   }
-  for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+  for (int i = 0; i < NUM_PIPELINES; ++i) {
     if (pipelines[i] != NULL) {
       optixPipelineDestroy(pipelines[i]);
     }
   }
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    if (groups[i] != NULL) {
+      optixProgramGroupDestroy(groups[i]);
+    }
+  }
+
+#  ifdef WITH_OSL
+  for (const OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+    }
+  }
+  for (const OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+    }
+  }
+#  endif
 
   /* Make sure denoiser is destroyed before device context! */
   if (denoiser_.optix_denoiser != nullptr) {
@@ -381,6 +399,12 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     return false;
   }
 
+#  ifdef WITH_OSL
+  const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL);
+#  else
+  const bool use_osl = false;
+#  endif
+
   /* Skip creating OptiX module if only doing denoising. */
   const bool need_optix_kernels = (kernel_features &
                                    (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING));
@@ -388,12 +412,13 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Detect existence of OptiX kernel and SDK here early. So we can error out
    * before compiling the CUDA kernels, to avoid failing right after when
    * compiling the OptiX kernel. */
+  string suffix = use_osl ? "_osl" :
+                  (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
+                            "_shader_raytrace" :
+                            "";
   string ptx_filename;
   if (need_optix_kernels) {
-    ptx_filename = path_get(
-        (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
-            "lib/kernel_optix_shader_raytrace.ptx" :
-            "lib/kernel_optix.ptx");
+    ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx");
     if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
       std::string optix_include_dir = get_optix_include_dir();
       if (optix_include_dir.empty()) {
@@ -429,18 +454,41 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     optixModuleDestroy(optix_module);
     optix_module = NULL;
   }
-  for (unsigned int i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     if (builtin_modules[i] != NULL) {
       optixModuleDestroy(builtin_modules[i]);
       builtin_modules[i] = NULL;
     }
   }
-  for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+  for (int i = 0; i < NUM_PIPELINES; ++i) {
     if (pipelines[i] != NULL) {
       optixPipelineDestroy(pipelines[i]);
       pipelines[i] = NULL;
     }
   }
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    if (groups[i] != NULL) {
+      optixProgramGroupDestroy(groups[i]);
+      groups[i] = NULL;
+    }
+  }
+
+#  ifdef WITH_OSL
+  /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
+  for (const OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+    }
+  }
+  osl_modules.clear();
+
+  for (const OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+    }
+  }
+  osl_groups.clear();
+#  endif
 
   OptixModuleCompileOptions module_options = {};
   module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
@@ -461,7 +509,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   module_options.numPayloadTypes = 0;
 #  endif
 
-  OptixPipelineCompileOptions pipeline_options = {};
   /* Default to no motion blur and two-level graph, since it is the fastest option. */
   pipeline_options.usesMotionBlur = false;
   pipeline_options.traversableGraphFlags =
@@ -490,9 +537,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
    * This is necessary since objects may be reported to have motion if the Vector pass is
    * active, but may still need to be rendered without motion blur if that isn't active as well. */
-  motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
-
-  if (motion_blur) {
+  if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
     pipeline_options.usesMotionBlur = true;
     /* Motion blur can insert motion transforms into the traversal graph.
      * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
@@ -503,13 +548,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     string ptx_data;
     if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
       string cflags = compile_kernel_get_common_cflags(kernel_features);
-      ptx_filename = compile_kernel(
-          cflags,
-          (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
-              "kernel_shader_raytrace" :
-              "kernel",
-          "optix",
-          true);
+      ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
     }
     if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
       set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
@@ -551,7 +590,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   }
 
   /* Create program groups. */
-  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
   group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
@@ -609,7 +647,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
       group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
 
-      if (motion_blur) {
+      if (pipeline_options.usesMotionBlur) {
         builtin_options.usesMotionBlur = true;
 
         optix_assert(optixBuiltinISModuleGet(
@@ -630,7 +668,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     }
   }
 
-  /* Pointclouds */
   if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
     group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
     group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
@@ -642,8 +679,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
   }
 
+  /* Add hit group for local intersections. */
   if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
-    /* Add hit group for local intersections. */
     group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
     group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
     group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
@@ -655,16 +692,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
     group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
         "__raygen__kernel_optix_integrator_shade_surface_raytrace";
-    group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
-    group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
-    group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
-    group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
-        "__direct_callable__svm_node_bevel";
+
+    /* Kernels with OSL support are built without SVM, so can skip those direct callables there. */
+    if (!use_osl) {
+      group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+      group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+      group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+      group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+          "__direct_callable__svm_node_bevel";
+    }
   }
 
-  /* MNEE. */
   if (kernel_features & KERNEL_FEATURE_MNEE) {
     group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
     group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
@@ -672,6 +712,42 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
         "__raygen__kernel_optix_integrator_shade_surface_mnee";
   }
 
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  if (use_osl) {
+    group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_background";
+    group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_light";
+    group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_surface";
+    group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_volume";
+    group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_shadow";
+    group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_displace";
+    group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_background";
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
+  }
+
   optix_assert(optixProgramGroupCreate(
       context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
 
@@ -680,7 +756,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Set up SBT, which in this case is used only to select between different programs. */
   sbt_data.alloc(NUM_PROGRAM_GROUPS);
   memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
-  for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
     optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
     optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
   }
@@ -704,25 +780,26 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
 
   OptixPipelineLinkOptions link_options = {};
   link_options.maxTraceDepth = 1;
+  link_options.debugLevel = module_options.debugLevel;
 
-  if (DebugFlags().optix.use_debug) {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
-  }
-  else {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
-  }
-
-  if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
-    /* Create shader raytracing pipeline. */
+  if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE) && !use_osl) {
+    /* Create shader raytracing and MNEE pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+    if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+      pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+      pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+    }
+    if (kernel_features & KERNEL_FEATURE_MNEE) {
+      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    }
     pipeline_groups.push_back(groups[PG_MISS]);
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
     pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
@@ -730,8 +807,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
       pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
     }
-    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
-    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
 
     optix_assert(optixPipelineCreate(context,
                                      &pipeline_options,
@@ -740,30 +815,33 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_SHADE_RAYTRACE]));
+                                     &pipelines[PIP_SHADE]));
 
     /* Combine ray generation and trace continuation stack size. */
-    const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
+                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
                              link_options.maxTraceDepth * trace_css;
     const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
                                       stack_size[PG_CALL_SVM_BEVEL].dssDC);
 
     /* Set stack size depending on pipeline options. */
     optix_assert(optixPipelineSetStackSize(
-        pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  if (kernel_features & KERNEL_FEATURE_MNEE) {
-    /* Create MNEE pipeline. */
+  { /* Create intersection-only pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
     pipeline_groups.push_back(groups[PG_MISS]);
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
     pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
@@ -771,8 +849,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
       pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
     }
-    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
-    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
 
     optix_assert(optixPipelineCreate(context,
                                      &pipeline_options,
@@ -781,37 +857,234 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_SHADE_MNEE]));
+                                     &pipelines[PIP_INTERSECT]));
 
-    /* Combine ray generation and trace continuation stack size. */
-    const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG +
-                             link_options.maxTraceDepth * trace_css;
-    const unsigned int dss = 0;
+    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+    const unsigned int css =
+        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+        link_options.maxTraceDepth * trace_css;
 
-    /* Set stack size depending on pipeline options. */
-    optix_assert(
-        optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2));
+    optix_assert(optixPipelineSetStackSize(
+        pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  { /* Create intersection-only pipeline. */
+  return !have_error();
+}
+
+bool OptiXDevice::load_osl_kernels()
+{
+#  ifdef WITH_OSL
+  if (have_error()) {
+    return false;
+  }
+
+  struct OSLKernel {
+    string ptx;
+    string init_entry;
+    string exec_entry;
+  };
+
+  /* This has to be in the same order as the ShaderType enum, so that the index calculation in
+   * osl_eval_nodes checks out */
+  vector<OSLKernel> osl_kernels;
+
+  for (ShaderType type = SHADER_TYPE_SURFACE; type <= SHADER_TYPE_BUMP;
+       type = static_cast<ShaderType>(type + 1)) {
+    const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ?
+                                                     osl_globals.surface_state :
+                                                 type == SHADER_TYPE_VOLUME ?
+                                                     osl_globals.volume_state :
+                                                 type == SHADER_TYPE_DISPLACEMENT ?
+                                                     osl_globals.displacement_state :
+                                                     osl_globals.bump_state);
+    for (const OSL::ShaderGroupRef &group : groups) {
+      if (group) {
+        string osl_ptx, init_name, entry_name;
+        osl_globals.ss->getattribute(group.get(), "group_init_name", init_name);
+        osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name);
+        osl_globals.ss->getattribute(
+            group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
+
+        int groupdata_size = 0;
+        osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
+        if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
+          set_error(
+              string_printf("Requested OSL group data size (%d) is greater than the maximum "
+                            "supported with OptiX (2048)",
+                            groupdata_size));
+          return false;
+        }
+
+        osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)});
+      }
+      else {
+        /* Add empty entry for non-existent shader groups, so that the index stays stable. */
+        osl_kernels.emplace_back();
+      }
+    }
+  }
+
+  const CUDAContextScope scope(this);
+
+  if (pipelines[PIP_SHADE]) {
+    optixPipelineDestroy(pipelines[PIP_SHADE]);
+  }
+
+  for (OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+      module = NULL;
+    }
+  }
+  for (OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+      group = NULL;
+    }
+  }
+
+  OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+  OptixModuleCompileOptions module_options = {};
+  module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+  module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+  osl_groups.resize(osl_kernels.size() * 2 + 1);
+  osl_modules.resize(osl_kernels.size() + 1);
+
+  { /* Load and compile PTX module with OSL services. */
+    string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx");
+    if (!path_read_text(ptx_filename, ptx_data)) {
+      set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
+                              ptx_filename.c_str()));
+      return false;
+    }
+
+    const OptixResult result = optixModuleCreateFromPTX(context,
+                                                        &module_options,
+                                                        &pipeline_options,
+                                                        ptx_data.data(),
+                                                        ptx_data.size(),
+                                                        nullptr,
+                                                        0,
+                                                        &osl_modules.back());
+    if (result != OPTIX_SUCCESS) {
+      set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
+                              ptx_filename.c_str(),
+                              optixGetErrorName(result)));
+      return false;
+    }
+
+    OptixProgramGroupDesc group_desc = {};
+    group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
+    group_desc.callables.moduleDC = osl_modules.back();
+
+    optix_assert(optixProgramGroupCreate(
+        context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back()));
+  }
+
+  TaskPool pool;
+  vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
+
+  for (size_t i = 0; i < osl_kernels.size(); ++i) {
+    if (osl_kernels[i].ptx.empty()) {
+      continue;
+    }
+
+#    if OPTIX_ABI_VERSION >= 55
+    OptixTask task = nullptr;
+    results[i] = optixModuleCreateFromPTXWithTasks(context,
+                                                   &module_options,
+                                                   &pipeline_options,
+                                                   osl_kernels[i].ptx.data(),
+                                                   osl_kernels[i].ptx.size(),
+                                                   nullptr,
+                                                   nullptr,
+                                                   &osl_modules[i],
+                                                   &task);
+    if (results[i] == OPTIX_SUCCESS) {
+      execute_optix_task(pool, task, results[i]);
+    }
+#    else
+    pool.push([this, &results, i, &module_options, &osl_kernels]() {
+      results[i] = optixModuleCreateFromPTX(context,
+                                            &module_options,
+                                            &pipeline_options,
+                                            osl_kernels[i].ptx.data(),
+                                            osl_kernels[i].ptx.size(),
+                                            nullptr,
+                                            0,
+                                            &osl_modules[i]);
+    });
+#    endif
+  }
+
+  pool.wait_work();
+
+  for (size_t i = 0; i < osl_kernels.size(); ++i) {
+    if (osl_kernels[i].ptx.empty()) {
+      continue;
+    }
+
+    if (results[i] != OPTIX_SUCCESS) {
+      set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
+                              osl_kernels[i].init_entry.c_str(),
+                              optixGetErrorName(results[i])));
+      return false;
+    }
+
+    OptixProgramGroupDesc group_descs[2] = {};
+    group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str();
+    group_descs[0].callables.moduleDC = osl_modules[i];
+    group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str();
+    group_descs[1].callables.moduleDC = osl_modules[i];
+
+    optix_assert(optixProgramGroupCreate(
+        context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
+  }
+
+  vector<OptixStackSizes> osl_stack_size(osl_groups.size());
+
+  /* Update SBT with new entries. */
+  sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+  }
+  for (size_t i = 0; i < osl_groups.size(); ++i) {
+    if (osl_groups[i] != NULL) {
+      optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
+      optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i]));
+    }
+  }
+  sbt_data.copy_to_device(); /* Upload updated SBT to device. */
+
+  OptixPipelineLinkOptions link_options = {};
+  link_options.maxTraceDepth = 0;
+  link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+  {
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
-    pipeline_groups.push_back(groups[PG_MISS]);
-    pipeline_groups.push_back(groups[PG_HITD]);
-    pipeline_groups.push_back(groups[PG_HITS]);
-    pipeline_groups.push_back(groups[PG_HITL]);
-    pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
-      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
-      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
-    }
-    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
-      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
-      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
+
+    for (const OptixProgramGroup &group : osl_groups) {
+      if (group != NULL) {
+        pipeline_groups.push_back(group);
+      }
     }
 
     optix_assert(optixPipelineCreate(context,
@@ -821,26 +1094,30 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_INTERSECT]));
+                                     &pipelines[PIP_SHADE]));
 
-    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
-    const unsigned int css =
-        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
-                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
-                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
-                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
-        link_options.maxTraceDepth * trace_css;
+    unsigned int dss = 0;
+    for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
+      dss = std::max(dss, osl_stack_size[i].dssDC);
+    }
 
-    optix_assert(
-        optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+    optix_assert(optixPipelineSetStackSize(
+        pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  /* Clean up program group objects. */
-  for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
-    optixProgramGroupDestroy(groups[i]);
-  }
+  return !have_error();
+#  else
+  return false;
+#  endif
+}
 
-  return true;
+void *OptiXDevice::get_cpu_osl_memory()
+{
+#  ifndef WITH_OSL
+  return nullptr;
+#  else
+  return &osl_globals;
+#  endif
 }
 
 /* --------------------------------------------------------------------
@@ -1567,7 +1844,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+      if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
         num_motion_steps = hair->get_motion_steps();
       }
 
@@ -1721,7 +1998,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+      if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
         num_motion_steps = mesh->get_motion_steps();
       }
 
@@ -1788,7 +2065,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) {
+      if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
         num_motion_steps = pointcloud->get_motion_steps();
       }
 
@@ -1885,7 +2162,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
     /* Calculate total motion transform size and allocate memory for them. */
     size_t motion_transform_offset = 0;
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       size_t total_motion_transform_size = 0;
       for (Object *const ob : bvh->objects) {
         if (ob->is_traceable() && ob->use_motion()) {
@@ -1936,7 +2213,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
           static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
-        if (motion_blur && ob->get_geometry()->has_motion_blur()) {
+        if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
           /* Select between motion blur and non-motion blur built-in intersection module. */
           instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
         }
@@ -1964,7 +2241,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       }
 
       /* Insert motion traversable if object has motion. */
-      if (motion_blur && ob->use_motion()) {
+      if (pipeline_options.usesMotionBlur && ob->use_motion()) {
         size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
         size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
                                        motion_keys * sizeof(OptixSRTData);
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 76c8af9bc3f..ad0e7b93454 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -9,6 +9,7 @@
 #  include "device/cuda/device_impl.h"
 #  include "device/optix/queue.h"
 #  include "device/optix/util.h"
+#  include "kernel/osl/globals.h"
 #  include "kernel/types.h"
 #  include "util/unique_ptr.h"
 
@@ -23,8 +24,16 @@ enum {
   PG_RGEN_INTERSECT_SHADOW,
   PG_RGEN_INTERSECT_SUBSURFACE,
   PG_RGEN_INTERSECT_VOLUME_STACK,
+  PG_RGEN_SHADE_BACKGROUND,
+  PG_RGEN_SHADE_LIGHT,
+  PG_RGEN_SHADE_SURFACE,
   PG_RGEN_SHADE_SURFACE_RAYTRACE,
   PG_RGEN_SHADE_SURFACE_MNEE,
+  PG_RGEN_SHADE_VOLUME,
+  PG_RGEN_SHADE_SHADOW,
+  PG_RGEN_EVAL_DISPLACE,
+  PG_RGEN_EVAL_BACKGROUND,
+  PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY,
   PG_MISS,
   PG_HITD, /* Default hit group. */
   PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@@ -40,14 +49,14 @@ enum {
 };
 
 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
-static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
 static const int NUM_HIT_PROGRAM_GROUPS = 8;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;
 
 /* List of OptiX pipelines. */
-enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES };
 
 /* A single shader binding table entry. */
 struct SbtRecord {
@@ -61,12 +70,20 @@ class OptiXDevice : public CUDADevice {
   OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
   OptixModule builtin_modules[2] = {};
   OptixPipeline pipelines[NUM_PIPELINES] = {};
+  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+  OptixPipelineCompileOptions pipeline_options = {};
 
-  bool motion_blur = false;
   device_vector<SbtRecord> sbt_data;
   device_only_memory<KernelParamsOptiX> launch_params;
-  OptixTraversableHandle tlas_handle = 0;
 
+#  ifdef WITH_OSL
+  OSLGlobals osl_globals;
+  vector<OptixModule> osl_modules;
+  vector<OptixProgramGroup> osl_groups;
+#  endif
+
+ private:
+  OptixTraversableHandle tlas_handle = 0;
   vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
   thread_mutex delayed_free_bvh_mutex;
 
@@ -100,13 +117,14 @@ class OptiXDevice : public CUDADevice {
   OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
   ~OptiXDevice();
 
- private:
   BVHLayoutMask get_bvh_layout_mask() const override;
 
   string compile_kernel_get_common_cflags(const uint kernel_features);
 
   bool load_kernels(const uint kernel_features) override;
 
+  bool load_osl_kernels() override;
+
   bool build_optix_bvh(BVHOptiX *bvh,
                        OptixBuildOperation operation,
                        const OptixBuildInput &build_input,
@@ -123,6 +141,8 @@ class OptiXDevice : public CUDADevice {
 
   virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
 
+  void *get_cpu_osl_memory() override;
+
   /* --------------------------------------------------------------------
    * Denoising.
    */
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 3bc547ed11d..1bfd154d449 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution()
   CUDADeviceQueue::init_execution();
 }
 
-static bool is_optix_specific_kernel(DeviceKernel kernel)
+/* Whether the kernel has to be launched through OptiX rather than plain CUDA. */
+static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl)
 {
-  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+#  ifdef WITH_OSL
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  const bool osl_shading = use_osl && device_kernel_has_shading(kernel);
+#  else
+  (void)use_osl;
+  const bool osl_shading = false;
+#  endif
+
+  return osl_shading || device_kernel_has_intersection(kernel);
 }
 
 bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                                const int work_size,
                                DeviceKernelArguments const &args)
 {
-  if (!is_optix_specific_kernel(kernel)) {
+  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+#  ifdef WITH_OSL
+  const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use;
+#  else
+  const bool use_osl = false;
+#  endif
+
+  if (!is_optix_specific_kernel(kernel, use_osl)) {
     return CUDADeviceQueue::enqueue(kernel, work_size, args);
   }
 
@@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
 
   const CUDAContextScope scope(cuda_device_);
 
-  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
-
   const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
   const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
 
@@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                         sizeof(device_ptr),
                         cuda_stream_));
 
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) {
     cuda_device_assert(
         cuda_device_,
         cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                           sizeof(device_ptr),
                           cuda_stream_));
   }
+  if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) {
+    cuda_device_assert(cuda_device_,
+                       cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset),
+                                         args.values[2],  // &d_offset
+                                         sizeof(int32_t),
+                                         cuda_stream_));
+  }
 
   cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
 
@@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
   OptixShaderBindingTable sbt_params = {};
 
   switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord);
+      break;
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
-      pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
       break;
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
-      pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
       break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord);
+      break;
+
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
       pipeline = optix_device->pipelines[PIP_INTERSECT];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
       break;
 
+    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr +
+                                PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord);
+      break;
+
     default:
       LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
                  << " is attempted to be enqueued.";
@@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
 
   sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
   sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
-  sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+  sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
   sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
   sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
   sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
@@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
   sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
   sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
 
+#  ifdef WITH_OSL
+  if (use_osl) {
+    sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size());
+  }
+#  endif
+
   /* Launch the ray generation program. */
   optix_device_assert(optix_device,
                       optixLaunch(pipeline,
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3fbb346e94f..99f9e536977 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -37,6 +37,14 @@ set(SRC_KERNEL_DEVICE_OPTIX
   device/optix/kernel_shader_raytrace.cu
 )
 
+# The OSL kernel sources for OptiX are only added for OSL 1.13 or newer.
+if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MAJOR GREATER 1 OR OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13))
+  list(APPEND SRC_KERNEL_DEVICE_OPTIX
+    osl/services_optix.cu
+    device/optix/kernel_osl.cu
+  )
+endif()
+
 set(SRC_KERNEL_DEVICE_ONEAPI
   device/oneapi/kernel.cpp
 )
@@ -181,6 +189,16 @@ set(SRC_KERNEL_SVM_HEADERS
   svm/vertex_color.h
 )
 
+if(WITH_CYCLES_OSL)
+  set(SRC_KERNEL_OSL_HEADERS
+    osl/osl.h
+    osl/closures_setup.h
+    osl/closures_template.h
+    osl/services_gpu.h
+    osl/types.h
+  )
+endif()
+
 set(SRC_KERNEL_GEOM_HEADERS
   geom/geom.h
   geom/attribute.h
@@ -306,6 +324,7 @@ set(SRC_KERNEL_HEADERS
   ${SRC_KERNEL_GEOM_HEADERS}
   ${SRC_KERNEL_INTEGRATOR_HEADERS}
   ${SRC_KERNEL_LIGHT_HEADERS}
+  ${SRC_KERNEL_OSL_HEADERS}
   ${SRC_KERNEL_SAMPLE_HEADERS}
   ${SRC_KERNEL_SVM_HEADERS}
   ${SRC_KERNEL_TYPES_HEADERS}
@@ -708,6 +727,16 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
     kernel_optix_shader_raytrace
     "device/optix/kernel_shader_raytrace.cu"
     "--keep-device-functions")
+  # With OSL 1.13 or newer, also add the OSL kernel and the OSL services kernel,
+  # both compiled with relocatable device code enabled.
+  if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MAJOR GREATER 1 OR OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13))
+    CYCLES_OPTIX_KERNEL_ADD(kernel_optix_osl
+      "device/optix/kernel_osl.cu"
+      "--relocatable-device-code=true")
+    CYCLES_OPTIX_KERNEL_ADD(kernel_optix_osl_services
+      "osl/services_optix.cu"
+      "--relocatable-device-code=true")
+  endif()
 
   add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
   cycles_set_solution_folder(cycles_kernel_optix)
@@ -995,6 +1024,7 @@ source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
 source_group("kernel" FILES ${SRC_KERNEL_TYPES_HEADERS})
 source_group("light" FILES ${SRC_KERNEL_LIGHT_HEADERS})
+source_group("osl" FILES ${SRC_KERNEL_OSL_HEADERS})
 source_group("sample" FILES ${SRC_KERNEL_SAMPLE_HEADERS})
 source_group("svm" FILES ${SRC_KERNEL_SVM_HEADERS})
 source_group("util" FILES ${SRC_KERNEL_UTIL_HEADERS})
@@ -1031,6 +1061,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLE
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_LIGHT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/light)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_OSL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/osl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SAMPLE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/sample)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_TYPES_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 71af68aa80e..2f5c5d7bd0c 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -297,8 +297,10 @@ ccl_device_inline void bsdf_roughness_eta(const KernelGlobals kg,
                                           ccl_private float2 *roughness,
                                           ccl_private float *eta)
 {
+#ifdef __SVM__
   bool refractive = false;
   float alpha = 1.0f;
+#endif
   switch (sc->type) {
     case CLOSURE_BSDF_DIFFUSE_ID:
       *roughness = one_float2();
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 51e1381d552..3a950779c11 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -30,6 +30,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */
 
 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #if __CUDA_ARCH__ < 500
 #  define ccl_device_inline __device__ __forceinline__
 #  define ccl_device_forceinline __device__ __forceinline__
@@ -109,14 +110,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
 
 typedef unsigned short half;
 
-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
   half val;
   asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
   return val;
 }
 
-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
   float val;
   asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 648988c31b6..8755395c82c 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -28,6 +28,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */
 
 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline __device__ __inline__
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index f689e93e5a2..2dd6cc98b59 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -38,6 +38,7 @@ using namespace metal::raytracing;
 #  define ccl_device_noinline ccl_device __attribute__((noinline))
 #endif
 
+#define ccl_device_extern extern "C"
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index dfaec65130c..b83512180d7 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -28,6 +28,7 @@
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device
+#define ccl_device_extern extern "C"
 #define ccl_global
 #define ccl_always_inline __attribute__((always_inline))
 #define ccl_device_inline inline
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index 1a11a533b7e..e13101f57b8 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -33,14 +33,16 @@ typedef unsigned long long uint64_t;
 #endif
 
 #define ccl_device \
-  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+  static __device__ \
+      __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
-#define ccl_device_inline_method ccl_device
-#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_inline_method __device__ __forceinline__
+#define ccl_device_noinline static __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
-#define ccl_inline_constant __constant__
+#define ccl_inline_constant static __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -57,23 +59,6 @@ typedef unsigned long long uint64_t;
 
 #define kernel_assert(cond)
 
-/* GPU thread, block, grid size and index */
-
-#define ccl_gpu_thread_idx_x (threadIdx.x)
-#define ccl_gpu_block_dim_x (blockDim.x)
-#define ccl_gpu_block_idx_x (blockIdx.x)
-#define ccl_gpu_grid_dim_x (gridDim.x)
-#define ccl_gpu_warp_size (warpSize)
-#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
-#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-
-/* GPU warp synchronization. */
-
-#define ccl_gpu_syncthreads() __syncthreads()
-#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
-
 /* GPU texture objects */
 
 typedef unsigned long long CUtexObject;
@@ -101,14 +86,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
 
 typedef unsigned short half;
 
-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
   half val;
   asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
   return val;
 }
 
-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
   float val;
   asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
index 7af2e421378..126df74bc8c 100644
--- a/intern/cycles/kernel/device/optix/globals.h
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -25,6 +25,7 @@ struct KernelParamsOptiX {
   /* Kernel arguments */
   const int *path_index_array;
   float *render_buffer;
+  int offset;
 
   /* Global scene data and textures */
   KernelData data;
@@ -36,7 +37,11 @@ struct KernelParamsOptiX {
 };
 
 #ifdef __NVCC__
-extern "C" static __constant__ KernelParamsOptiX kernel_params;
+extern "C"
+#  ifndef __CUDACC_RDC__
+    static
+#  endif
+    __constant__ KernelParamsOptiX kernel_params;
 #endif
 
 /* Abstraction macros */
diff --git a/intern/cycles/kernel/device/optix/kernel_osl.cu b/intern/cycles/kernel/device/optix/kernel_osl.cu
new file mode 100644
index 00000000000..0f3f477935b
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_osl.cu
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+/* Copy of the regular OptiX kernels with additional OSL support. */
+
+#include "kernel/device/optix/kernel_shader_raytrace.cu"
+
+#include "kernel/bake/bake.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_volume.h"
+
+/* Shading ray-generation programs.
+ *
+ * Each program reads the x component of the launch index and, when
+ * `kernel_params.path_index_array` is non-null, maps it through that array to obtain
+ * the path index; otherwise the launch index itself is the path index. The first
+ * argument of the integrator functions is passed as `nullptr`. */
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_background()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_background(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_light()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_light(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_surface(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_volume()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_volume(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_shadow()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_shadow(nullptr, path_index, kernel_params.render_buffer);
+}
+
+/* Shader evaluation ray-generation programs.
+ *
+ * These reuse the launch parameter fields: `path_index_array` is cast to the
+ * `KernelShaderEvalInput` array and `render_buffer` is the float output. The work
+ * index is the launch index shifted by `kernel_params.offset`. */
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_displace()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_displace_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_background()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_background_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_curve_shadow_transparency()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_curve_shadow_transparency_evaluate(nullptr, input, output, global_index);
+}
diff --git a/intern/cycles/kernel/integrator/displacement_shader.h b/intern/cycles/kernel/integrator/displacement_shader.h
index 839dfe244ac..a6e9d674396 100644
--- a/intern/cycles/kernel/integrator/displacement_shader.h
+++ b/intern/cycles/kernel/integrator/displacement_shader.h
@@ -24,8 +24,8 @@ ccl_device void displacement_shader_eval(KernelGlobals kg,
 
   /* this will modify sd->P */
 #ifdef __OSL__
-  if (kg->osl) {
-    OSLShader::eval_displacement(kg, state, sd);
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(kg, state, sd, 0);
   }
   else
 #endif
diff --git a/intern/cycles/kernel/integrator/surface_shader.h b/intern/cycles/kernel/integrator/surface_shader.h
index 6c0097b11bd..5e47a34f77e 100644
--- a/intern/cycles/kernel/integrator/surface_shader.h
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -827,13 +827,8 @@ ccl_device void surface_shader_eval(KernelGlobals kg,
   sd->num_closure_left = max_closures;
 
 #ifdef __OSL__
-  if (kg->osl) {
-    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
-      OSLShader::eval_background(kg, state, sd, path_flag);
-    }
-    else {
-      OSLShader::eval_surface(kg, state, sd, path_flag);
-    }
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_SURFACE>(kg, state, sd, path_flag);
   }
   else
 #endif
diff --git a/intern/cycles/kernel/integrator/volume_shader.h b/intern/cycles/kernel/integrator/volume_shader.h
index 0ff968723a1..f9050647c6d 100644
--- a/intern/cycles/kernel/integrator/volume_shader.h
+++ b/intern/cycles/kernel/integrator/volume_shader.h
@@ -493,8 +493,8 @@ ccl_device_inline void volume_shader_eval(KernelGlobals kg,
 
     /* evaluate shader */
 #  ifdef __OSL__
-    if (kg->osl) {
-      OSLShader::eval_volume(kg, state, sd, path_flag);
+    if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+      osl_eval_nodes<SHADER_TYPE_VOLUME>(kg, state, sd, path_flag);
     }
     else
 #  endif
diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp
index d56e0551a91..6800c765345 100644
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -25,13 +25,18 @@
 
 #include "kernel/osl/osl.h"
 
-#include "kernel/osl/closures_setup.h"
-
 #define TO_VEC3(v) OSL::Vec3(v.x, v.y, v.z)
 #define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])
 
 CCL_NAMESPACE_BEGIN
 
+static_assert(sizeof(OSLClosure) == sizeof(OSL::ClosureColor) &&
+              sizeof(OSLClosureAdd) == sizeof(OSL::ClosureAdd) &&
+              sizeof(OSLClosureMul) == sizeof(OSL::ClosureMul) &&
+              sizeof(OSLClosureComponent) == sizeof(OSL::ClosureComponent));
+static_assert(sizeof(ShaderGlobals) == sizeof(OSL::ShaderGlobals) &&
+              offsetof(ShaderGlobals, Ci) == offsetof(OSL::ShaderGlobals, Ci));
+
 /* Registration */
 
 #define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
@@ -60,53 +65,18 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
 #include "closures_template.h"
 }
 
-/* Globals */
+/* Surface & Background */
 
-static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
-                                        ShaderData *sd,
-                                        const void *state,
-                                        uint32_t path_flag,
-                                        OSLThreadData *tdata)
+template<>
+void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
+                                         const void *state,
+                                         ShaderData *sd,
+                                         uint32_t path_flag)
 {
-  OSL::ShaderGlobals *globals = &tdata->globals;
-
-  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
-  const differential3 dI = differential_from_compact(sd->I, sd->dI);
-
-  /* copy from shader data to shader globals */
-  globals->P = TO_VEC3(sd->P);
-  globals->dPdx = TO_VEC3(dP.dx);
-  globals->dPdy = TO_VEC3(dP.dy);
-  globals->I = TO_VEC3(sd->I);
-  globals->dIdx = TO_VEC3(dI.dx);
-  globals->dIdy = TO_VEC3(dI.dy);
-  globals->N = TO_VEC3(sd->N);
-  globals->Ng = TO_VEC3(sd->Ng);
-  globals->u = sd->u;
-  globals->dudx = sd->du.dx;
-  globals->dudy = sd->du.dy;
-  globals->v = sd->v;
-  globals->dvdx = sd->dv.dx;
-  globals->dvdy = sd->dv.dy;
-  globals->dPdu = TO_VEC3(sd->dPdu);
-  globals->dPdv = TO_VEC3(sd->dPdv);
-  globals->surfacearea = 1.0f;
-  globals->time = sd->time;
-
-  /* booleans */
-  globals->raytype = path_flag;
-  globals->flipHandedness = 0;
-  globals->backfacing = (sd->flag & SD_BACKFACING);
-
-  /* shader data to be used in services callbacks */
-  globals->renderstate = sd;
-
-  /* hacky, we leave it to services to fetch actual object matrix */
-  globals->shader2common = sd;
-  globals->object2common = sd;
-
-  /* must be set to NULL before execute */
-  globals->Ci = NULL;
+  /* setup shader globals from shader data */
+  OSLThreadData *tdata = kg->osl_tdata;
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
 
   /* clear trace data */
   tdata->tracedata.init = false;
@@ -121,53 +91,6 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
     sd->osl_path_state = (const IntegratorStateCPU *)state;
     sd->osl_shadow_path_state = nullptr;
   }
-}
-
-static void flatten_closure_tree(const KernelGlobalsCPU *kg,
-                                 ShaderData *sd,
-                                 uint32_t path_flag,
-                                 const OSL::ClosureColor *closure,
-                                 float3 weight = make_float3(1.0f, 1.0f, 1.0f))
-{
-  /* OSL gives us a closure tree, we flatten it into arrays per
-   * closure type, for evaluation, sampling, etc later on. */
-
-  switch (closure->id) {
-    case OSL::ClosureColor::MUL: {
-      OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-      flatten_closure_tree(kg, sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
-      break;
-    }
-    case OSL::ClosureColor::ADD: {
-      OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-      flatten_closure_tree(kg, sd, path_flag, add->closureA, weight);
-      flatten_closure_tree(kg, sd, path_flag, add->closureB, weight);
-      break;
-    }
-#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
-  case OSL_CLOSURE_##Upper##_ID: { \
-    const OSL::ClosureComponent *comp = reinterpret_cast<const OSL::ClosureComponent *>(closure); \
-    weight *= TO_FLOAT3(comp->w); \
-    osl_closure_##lower##_setup( \
-        kg, sd, path_flag, weight, reinterpret_cast<const Upper##Closure *>(comp + 1)); \
-    break; \
-  }
-#include "closures_template.h"
-    default:
-      break;
-  }
-}
-
-/* Surface */
-
-void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
-                             const void *state,
-                             ShaderData *sd,
-                             uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
 
   /* execute shader for this point */
   OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -175,101 +98,99 @@ void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
   OSL::ShadingContext *octx = tdata->context;
   int shader = sd->shader & SHADER_MASK;
 
-  /* automatic bump shader */
-  if (kg->osl->bump_state[shader]) {
-    /* save state */
-    const float3 P = sd->P;
-    const float dP = sd->dP;
-    const OSL::Vec3 dPdx = globals->dPdx;
-    const OSL::Vec3 dPdy = globals->dPdy;
-
-    /* set state as if undisplaced */
-    if (sd->flag & SD_HAS_DISPLACEMENT) {
-      float data[9];
-      bool found = kg->osl->services->get_attribute(sd,
-                                                    true,
-                                                    OSLRenderServices::u_empty,
-                                                    TypeDesc::TypeVector,
-                                                    OSLRenderServices::u_geom_undisplaced,
-                                                    data);
-      (void)found;
-      assert(found);
-
-      differential3 tmp_dP;
-      memcpy(&sd->P, data, sizeof(float) * 3);
-      memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
-      memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
-
-      object_position_transform(kg, sd, &sd->P);
-      object_dir_transform(kg, sd, &tmp_dP.dx);
-      object_dir_transform(kg, sd, &tmp_dP.dy);
-
-      sd->dP = differential_make_compact(tmp_dP);
-
-      globals->P = TO_VEC3(sd->P);
-      globals->dPdx = TO_VEC3(tmp_dP.dx);
-      globals->dPdy = TO_VEC3(tmp_dP.dy);
+  if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
+    /* background */
+    if (kg->osl->background_state) {
+      ss->execute(octx, *(kg->osl->background_state), *globals);
     }
-
-    /* execute bump shader */
-    ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
-
-    /* reset state */
-    sd->P = P;
-    sd->dP = dP;
-
-    globals->P = TO_VEC3(P);
-    globals->dPdx = TO_VEC3(dPdx);
-    globals->dPdy = TO_VEC3(dPdy);
   }
+  else {
+    /* automatic bump shader */
+    if (kg->osl->bump_state[shader]) {
+      /* save state */
+      const float3 P = sd->P;
+      const float dP = sd->dP;
+      const OSL::Vec3 dPdx = globals->dPdx;
+      const OSL::Vec3 dPdy = globals->dPdy;
+
+      /* set state as if undisplaced */
+      if (sd->flag & SD_HAS_DISPLACEMENT) {
+        float data[9];
+        bool found = kg->osl->services->get_attribute(sd,
+                                                      true,
+                                                      OSLRenderServices::u_empty,
+                                                      TypeDesc::TypeVector,
+                                                      OSLRenderServices::u_geom_undisplaced,
+                                                      data);
+        (void)found;
+        assert(found);
+
+        differential3 tmp_dP;
+        memcpy(&sd->P, data, sizeof(float) * 3);
+        memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
+        memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
+
+        object_position_transform(kg, sd, &sd->P);
+        object_dir_transform(kg, sd, &tmp_dP.dx);
+        object_dir_transform(kg, sd, &tmp_dP.dy);
+
+        sd->dP = differential_make_compact(tmp_dP);
+
+        globals->P = TO_VEC3(sd->P);
+        globals->dPdx = TO_VEC3(tmp_dP.dx);
+        globals->dPdy = TO_VEC3(tmp_dP.dy);
+      }
+
+      /* execute bump shader */
+      ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
+
+      /* reset state */
+      sd->P = P;
+      sd->dP = dP;
+
+      globals->P = TO_VEC3(P);
+      globals->dPdx = TO_VEC3(dPdx);
+      globals->dPdy = TO_VEC3(dPdy);
+    }
 
-  /* surface shader */
-  if (kg->osl->surface_state[shader]) {
-    ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+    /* surface shader */
+    if (kg->osl->surface_state[shader]) {
+      ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+    }
   }
 
   /* flatten closure tree */
   if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+    flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
   }
 }
 
-/* Background */
+/* Volume */
 
-void OSLShader::eval_background(const KernelGlobalsCPU *kg,
-                                const void *state,
-                                ShaderData *sd,
-                                uint32_t path_flag)
+template<>
+void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
+                                        const void *state,
+                                        ShaderData *sd,
+                                        uint32_t path_flag)
 {
   /* setup shader globals from shader data */
   OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
 
-  /* execute shader for this point */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  /* clear trace data */
+  tdata->tracedata.init = false;
 
-  if (kg->osl->background_state) {
-    ss->execute(octx, *(kg->osl->background_state), *globals);
+  /* Used by render-services. */
+  sd->osl_globals = kg;
+  if (path_flag & PATH_RAY_SHADOW) {
+    sd->osl_path_state = nullptr;
+    sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
   }
-
-  /* return background color immediately */
-  if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+  else {
+    sd->osl_path_state = (const IntegratorStateCPU *)state;
+    sd->osl_shadow_path_state = nullptr;
   }
-}
-
-/* Volume */
-
-void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
-                            const void *state,
-                            ShaderData *sd,
-                            uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
 
   /* execute shader */
   OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -283,17 +204,30 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
 
   /* flatten closure tree */
   if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+    flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
   }
 }
 
 /* Displacement */
 
-void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
+template<>
+void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
+                                              const void *state,
+                                              ShaderData *sd,
+                                              uint32_t path_flag)
 {
   /* setup shader globals from shader data */
   OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+
+  /* clear trace data */
+  tdata->tracedata.init = false;
+
+  /* Used by render-services. */
+  sd->osl_globals = kg;
+  sd->osl_path_state = (const IntegratorStateCPU *)state;
+  sd->osl_shadow_path_state = nullptr;
 
   /* execute shader */
   OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
diff --git a/intern/cycles/kernel/osl/closures_setup.h b/intern/cycles/kernel/osl/closures_setup.h
index 96c551b9951..ceaf56ccba6 100644
--- a/intern/cycles/kernel/osl/closures_setup.h
+++ b/intern/cycles/kernel/osl/closures_setup.h
@@ -40,12 +40,7 @@ CCL_NAMESPACE_BEGIN
     const char *label;
 #define OSL_CLOSURE_STRUCT_END(Upper, lower) \
   } \
-  ; \
-  ccl_device void osl_closure_##lower##_setup(KernelGlobals kg, \
-                                              ccl_private ShaderData *sd, \
-                                              uint32_t path_flag, \
-                                              float3 weight, \
-                                              ccl_private Upper##Closure *closure);
+  ;
 #define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) type name;
 #define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) type name[size];
 
@@ -210,11 +205,9 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
   bsdf->ior = closure->ior;
   bsdf->T = closure->T;
 
-  static OSL::ustring u_ggx("ggx");
-  static OSL::ustring u_default("default");
-
   /* GGX */
-  if (closure->distribution == u_ggx || closure->distribution == u_default) {
+  if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
+      closure->distribution == make_string("default", 4430693559278735917ull)) {
     if (!closure->refract) {
       if (closure->alpha_x == closure->alpha_y) {
         /* Isotropic */
@@ -1000,18 +993,14 @@ ccl_device void osl_closure_bssrdf_setup(KernelGlobals kg,
                                          float3 weight,
                                          ccl_private const BSSRDFClosure *closure)
 {
-  static ustring u_burley("burley");
-  static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
-  static ustring u_random_walk("random_walk");
-
   ClosureType type;
-  if (closure->method == u_burley) {
+  if (closure->method == make_string("burley", 186330084368958868ull)) {
     type = CLOSURE_BSSRDF_BURLEY_ID;
   }
-  else if (closure->method == u_random_walk_fixed_radius) {
+  else if (closure->method == make_string("random_walk_fixed_radius", 5695810351010063150ull)) {
     type = CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID;
   }
-  else if (closure->method == u_random_walk) {
+  else if (closure->method == make_string("random_walk", 11360609267673527222ull)) {
     type = CLOSURE_BSSRDF_RANDOM_WALK_ID;
   }
   else {
diff --git a/intern/cycles/kernel/osl/closures_template.h b/intern/cycles/kernel/osl/closures_template.h
index c808b275966..b9e9b52dcf8 100644
--- a/intern/cycles/kernel/osl/closures_template.h
+++ b/intern/cycles/kernel/osl/closures_template.h
@@ -40,7 +40,7 @@ OSL_CLOSURE_STRUCT_BEGIN(Transparent, transparent)
 OSL_CLOSURE_STRUCT_END(Transparent, transparent)
 
 OSL_CLOSURE_STRUCT_BEGIN(Microfacet, microfacet)
-  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, ustring, distribution, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, DeviceString, distribution, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, N, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, T, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_x, NULL)
@@ -210,7 +210,7 @@ OSL_CLOSURE_STRUCT_BEGIN(PhongRamp, phong_ramp)
 OSL_CLOSURE_STRUCT_END(PhongRamp, phong_ramp)
 
 OSL_CLOSURE_STRUCT_BEGIN(BSSRDF, bssrdf)
-  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, ustring, method, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, DeviceString, method, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, N, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, radius, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, albedo, NULL)
diff --git a/intern/cycles/kernel/osl/osl.h b/intern/cycles/kernel/osl/osl.h
index bef23f3eea1..cc5c81ad027 100644
--- a/intern/cycles/kernel/osl/osl.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -1,38 +1,187 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */
 
 #pragma once
 
 /* OSL Shader Engine
  *
- * Holds all variables to execute and use OSL shaders from the kernel. These
- * are initialized externally by OSLShaderManager before rendering starts.
- *
- * Before/after a thread starts rendering, thread_init/thread_free must be
- * called, which will store any per thread OSL state in thread local storage.
- * This means no thread state must be passed along in the kernel itself.
+ * Holds all variables to execute and use OSL shaders from the kernel.
  */
 
 #include "kernel/osl/types.h"
 
+#include "kernel/osl/closures_setup.h"
+
 CCL_NAMESPACE_BEGIN
 
-class OSLShader {
- public:
-  /* eval */
-  static void eval_surface(const KernelGlobalsCPU *kg,
-                           const void *state,
-                           ShaderData *sd,
-                           uint32_t path_flag);
-  static void eval_background(const KernelGlobalsCPU *kg,
-                              const void *state,
-                              ShaderData *sd,
-                              uint32_t path_flag);
-  static void eval_volume(const KernelGlobalsCPU *kg,
-                          const void *state,
-                          ShaderData *sd,
-                          uint32_t path_flag);
-  static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
-};
+/* Fill the OSL shader globals from the Cycles shader data.
+ *
+ * The `dP` and `dI` differentials are expanded from their compact form, `dtime` and
+ * `surfacearea` are fixed to 1, `flipHandedness` to 0, `raytype` is set to `path_flag`
+ * and `Ci` is reset to null. */
+ccl_device_inline void shaderdata_to_shaderglobals(KernelGlobals kg,
+                                                   ccl_private ShaderData *sd,
+                                                   uint32_t path_flag,
+                                                   ccl_private ShaderGlobals *globals)
+{
+  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+  const differential3 dI = differential_from_compact(sd->I, sd->dI);
+
+  /* copy from shader data to shader globals */
+  globals->P = sd->P;
+  globals->dPdx = dP.dx;
+  globals->dPdy = dP.dy;
+  globals->I = sd->I;
+  globals->dIdx = dI.dx;
+  globals->dIdy = dI.dy;
+  globals->N = sd->N;
+  globals->Ng = sd->Ng;
+  globals->u = sd->u;
+  globals->dudx = sd->du.dx;
+  globals->dudy = sd->du.dy;
+  globals->v = sd->v;
+  globals->dvdx = sd->dv.dx;
+  globals->dvdy = sd->dv.dy;
+  globals->dPdu = sd->dPdu;
+  globals->dPdv = sd->dPdv;
+  globals->time = sd->time;
+  globals->dtime = 1.0f;
+  globals->surfacearea = 1.0f;
+  globals->raytype = path_flag;
+  globals->flipHandedness = 0;
+  globals->backfacing = (sd->flag & SD_BACKFACING);
+
+  /* shader data to be used in services callbacks */
+  globals->renderstate = sd;
+
+  /* hacky, we leave it to services to fetch actual object matrix */
+  globals->shader2common = sd;
+  globals->object2common = sd;
+
+  /* must be set to NULL before execute */
+  globals->Ci = nullptr;
+}
+
+/* Walk a closure tree and call the matching `osl_closure_*_setup` for every component.
+ *
+ * The walk is iterative: MUL nodes fold their weight into the running weight, ADD nodes
+ * continue with `closureA` and push `closureB` (with the current weight) on a fixed stack.
+ * The stack holds 16 entries; the two array sizes and the overflow check below must be
+ * kept in sync. On overflow the ADD node is skipped (after the assert) and the walk
+ * resumes from the stack. Unknown closure ids are ignored. */
+ccl_device void flatten_closure_tree(KernelGlobals kg,
+                                     ccl_private ShaderData *sd,
+                                     uint32_t path_flag,
+                                     ccl_private const OSLClosure *closure)
+{
+  int stack_size = 0;
+  float3 weight = one_float3();
+  float3 weight_stack[16];
+  ccl_private const OSLClosure *closure_stack[16];
+
+  while (closure) {
+    switch (closure->id) {
+      case OSL_CLOSURE_MUL_ID: {
+        ccl_private const OSLClosureMul *mul = static_cast<ccl_private const OSLClosureMul *>(
+            closure);
+        weight *= mul->weight;
+        closure = mul->closure;
+        continue;
+      }
+      case OSL_CLOSURE_ADD_ID: {
+        if (stack_size >= 16) {
+          kernel_assert(!"Exhausted OSL closure stack");
+          break;
+        }
+        ccl_private const OSLClosureAdd *add = static_cast<ccl_private const OSLClosureAdd *>(
+            closure);
+        closure = add->closureA;
+        weight_stack[stack_size] = weight;
+        closure_stack[stack_size++] = add->closureB;
+        continue;
+      }
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+  case OSL_CLOSURE_##Upper##_ID: { \
+    ccl_private const OSLClosureComponent *comp = \
+        static_cast<ccl_private const OSLClosureComponent *>(closure); \
+    osl_closure_##lower##_setup(kg, \
+                                sd, \
+                                path_flag, \
+                                weight * comp->weight, \
+                                reinterpret_cast<ccl_private const Upper##Closure *>(comp + 1)); \
+    break; \
+  }
+#include "closures_template.h"
+      default:
+        break;
+    }
+
+    if (stack_size > 0) {
+      weight = weight_stack[--stack_size];
+      closure = closure_stack[stack_size];
+    }
+    else {
+      closure = nullptr;
+    }
+  }
+}
+
+#ifndef __KERNEL_GPU__
+
+/* CPU: declaration only, explicitly specialized per shader type in closures.cpp. */
+template<ShaderType type>
+void osl_eval_nodes(const KernelGlobalsCPU *kg,
+                    const void *state,
+                    ShaderData *sd,
+                    uint32_t path_flag);
+
+#else
+
+/* GPU: build the shader globals locally, invoke the shader and flatten the resulting
+ * closure tree. With OptiX, two consecutive direct callables are invoked per shader and
+ * shader type, both receiving the same `group_data` buffer. */
+template<ShaderType type, typename ConstIntegratorGenericState>
+ccl_device_inline void osl_eval_nodes(KernelGlobals kg,
+                                      ConstIntegratorGenericState state,
+                                      ccl_private ShaderData *sd,
+                                      uint32_t path_flag)
+{
+  ShaderGlobals globals;
+  shaderdata_to_shaderglobals(kg, sd, path_flag, &globals);
+
+  const int shader = sd->shader & SHADER_MASK;
+
+#  ifdef __KERNEL_OPTIX__
+  uint8_t group_data[2048];
+  uint8_t closure_pool[1024];
+  sd->osl_closure_pool = closure_pool;
+
+  unsigned int optix_dc_index = 2 /* NUM_CALLABLE_PROGRAM_GROUPS */ +
+                                (shader + type * kernel_data.max_shaders) * 2;
+  optixDirectCall<void>(optix_dc_index + 0,
+                        /* shaderglobals_ptr = */ &globals,
+                        /* groupdata_ptr = */ (void *)group_data,
+                        /* userdata_base_ptr = */ (void *)nullptr,
+                        /* output_base_ptr = */ (void *)nullptr,
+                        /* shadeindex = */ 0);
+  optixDirectCall<void>(optix_dc_index + 1,
+                        /* shaderglobals_ptr = */ &globals,
+                        /* groupdata_ptr = */ (void *)group_data,
+                        /* userdata_base_ptr = */ (void *)nullptr,
+                        /* output_base_ptr = */ (void *)nullptr,
+                        /* shadeindex = */ 0);
+#  endif
+
+  if (globals.Ci) {
+    flatten_closure_tree(kg, sd, path_flag, globals.Ci);
+  }
+}
+
+#endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index b744422ee78..454b75ea4d9 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -119,8 +119,8 @@ ustring OSLRenderServices::u_u("u");
 ustring OSLRenderServices::u_v("v");
 ustring OSLRenderServices::u_empty;
 
-OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system)
-    : OSL::RendererServices(texture_system)
+OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system, int device_type)
+    : OSL::RendererServices(texture_system), device_type_(device_type)
 {
 }
 
@@ -131,6 +131,17 @@ OSLRenderServices::~OSLRenderServices()
   }
 }
 
+int OSLRenderServices::supports(string_view feature) const
+{
+#ifdef WITH_OPTIX
+  if (feature == "OptiX") {
+    return device_type_ == DEVICE_OPTIX;
+  }
+#endif
+
+  return false;
+}
+
 bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
                                    OSL::Matrix44 &result,
                                    OSL::TransformationPtr xform,
@@ -1139,29 +1150,43 @@ TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring file
 {
   OSLTextureHandleMap::iterator it = textures.find(filename);
 
-  /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
-  if (it != textures.end()) {
-    if (it->second->type != OSLTextureHandle::OIIO) {
-      return (TextureSystem::TextureHandle *)it->second.get();
+  if (device_type_ == DEVICE_CPU) {
+    /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
+    if (it != textures.end()) {
+      if (it->second->type != OSLTextureHandle::OIIO) {
+        return (TextureSystem::TextureHandle *)it->second.get();
+      }
     }
-  }
 
-  /* Get handle from OpenImageIO. */
-  OSL::TextureSystem *ts = m_texturesys;
-  TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
-  if (handle == NULL) {
-    return NULL;
-  }
+    /* Get handle from OpenImageIO. */
+    OSL::TextureSystem *ts = m_texturesys;
+    TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
+    if (handle == NULL) {
+      return NULL;
+    }
+
+    /* Insert new OSLTextureHandle if needed. */
+    if (it == textures.end()) {
+      textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
+      it = textures.find(filename);
+    }
 
-  /* Insert new OSLTextureHandle if needed. */
-  if (it == textures.end()) {
-    textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
-    it = textures.find(filename);
+    /* Assign OIIO texture handle and return. */
+    it->second->oiio_handle = handle;
+    return (TextureSystem::TextureHandle *)it->second.get();
   }
+  else {
+    /* Non-CPU devices: for an SVM handle whose first slot has `w == -1`, return
+     * `svm_slots[0].y + 1` cast to a pointer instead of a real handle object.
+     * NOTE(review): the +1 presumably keeps a zero value distinct from NULL. */
+    if (it != textures.end() && it->second->type == OSLTextureHandle::SVM &&
+        it->second->svm_slots[0].w == -1) {
+      return reinterpret_cast<TextureSystem::TextureHandle *>(
+          static_cast<uintptr_t>(it->second->svm_slots[0].y + 1));
+    }
 
-  /* Assign OIIO texture handle and return. */
-  it->second->oiio_handle = handle;
-  return (TextureSystem::TextureHandle *)it->second.get();
+    return NULL;
+  }
 }
 
 bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h
index 334b6682e34..9d875ae8e94 100644
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -22,11 +22,8 @@ class PtexCache;
 
 CCL_NAMESPACE_BEGIN
 
-class Object;
 class Scene;
-class Shader;
 struct ShaderData;
-struct float3;
 struct KernelGlobalsCPU;
 
 /* OSL Texture Handle
@@ -73,11 +70,13 @@ typedef OIIO::unordered_map_concurrent<ustring, OSLTextureHandleRef, ustringHash
 
 class OSLRenderServices : public OSL::RendererServices {
  public:
-  OSLRenderServices(OSL::TextureSystem *texture_system);
+  OSLRenderServices(OSL::TextureSystem *texture_system, int device_type);
   ~OSLRenderServices();
 
   static void register_closures(OSL::ShadingSystem *ss);
 
+  int supports(string_view feature) const override;
+
   bool get_matrix(OSL::ShaderGlobals *sg,
                   OSL::Matrix44 &result,
                   OSL::TransformationPtr xform,
@@ -324,6 +323,9 @@ class OSLRenderServices : public OSL::RendererServices {
    * and is required because texture handles are cached as part of the shared
    * shading system. */
   OSLTextureHandleMap textures;
+
+ private:
+  int device_type_;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/services_gpu.h b/intern/cycles/kernel/osl/services_gpu.h
new file mode 100644
index 00000000000..e6e19b8c484
--- /dev/null
+++ b/intern/cycles/kernel/osl/services_gpu.h
@@ -0,0 +1,2149 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */
+
+#include "kernel/tables.h"
+#include "kernel/util/differential.h"
+
+#include "kernel/osl/osl.h"
+
+namespace DeviceStrings {
+
+/* "" */
+ccl_device_constant DeviceString _emptystring_ = {0ull};
+/* "NDC" */
+ccl_device_constant DeviceString u_ndc = {5148305047403260775ull};
+/* "screen" */
+ccl_device_constant DeviceString u_screen = {14159088609039777114ull};
+/* "camera" */
+ccl_device_constant DeviceString u_camera = {2159505832145726196ull};
+/* "raster" */
+ccl_device_constant DeviceString u_raster = {7759263238610201778ull};
+/* "world" */
+ccl_device_constant DeviceString u_world = {16436542438370751598ull};
+/* "common" */
+ccl_device_constant DeviceString u_common = {14645198576927606093ull};
+/* "hsv" */
+ccl_device_constant DeviceString u_hsv = {2177035556331879497ull};
+/* "hsl" */
+ccl_device_constant DeviceString u_hsl = {7749766809258288148ull};
+/* "XYZ" */
+ccl_device_constant DeviceString u_xyz = {4957977063494975483ull};
+/* "xyY" */
+ccl_device_constant DeviceString u_xyy = {5138822319725660255ull};
+/* "sRGB" */
+ccl_device_constant DeviceString u_srgb = {15368599878474175032ull};
+/* "object:location" */
+ccl_device_constant DeviceString u_object_location = {7846190347358762897ull};
+/* "object:color" */
+ccl_device_constant DeviceString u_object_color = {12695623857059169556ull};
+/* "object:alpha" */
+ccl_device_constant DeviceString u_object_alpha = {11165053919428293151ull};
+/* "object:index" */
+ccl_device_constant DeviceString u_object_index = {6588325838217472556ull};
+/* "geom:dupli_generated" */
+ccl_device_constant DeviceString u_geom_dupli_generated = {6715607178003388908ull};
+/* "geom:dupli_uv" */
+ccl_device_constant DeviceString u_geom_dupli_uv = {1294253317490155849ull};
+/* "material:index" */
+ccl_device_constant DeviceString u_material_index = {741770758159634623ull};
+/* "object:random" */
+ccl_device_constant DeviceString u_object_random = {15789063994977955884ull};
+/* "particle:index" */
+ccl_device_constant DeviceString u_particle_index = {9489711748229903784ull};
+/* "particle:random" */
+ccl_device_constant DeviceString u_particle_random = {17993722202766855761ull};
+/* "particle:age" */
+ccl_device_constant DeviceString u_particle_age = {7380730644710951109ull};
+/* "particle:lifetime" */
+ccl_device_constant DeviceString u_particle_lifetime = {16576828923156200061ull};
+/* "particle:location" */
+ccl_device_constant DeviceString u_particle_location = {10309536211423573010ull};
+/* "particle:rotation" */
+ccl_device_constant DeviceString u_particle_rotation = {17858543768041168459ull};
+/* "particle:size" */
+ccl_device_constant DeviceString u_particle_size = {16461524249715420389ull};
+/* "particle:velocity" */
+ccl_device_constant DeviceString u_particle_velocity = {13199101248768308863ull};
+/* "particle:angular_velocity" */
+ccl_device_constant DeviceString u_particle_angular_velocity = {16327930120486517910ull};
+/* "geom:numpolyvertices" */
+ccl_device_constant DeviceString u_geom_numpolyvertices = {382043551489988826ull};
+/* "geom:trianglevertices" */
+ccl_device_constant DeviceString u_geom_trianglevertices = {17839267571524187074ull};
+/* "geom:polyvertices" */
+ccl_device_constant DeviceString u_geom_polyvertices = {1345577201967881769ull};
+/* "geom:name" */
+ccl_device_constant DeviceString u_geom_name = {13606338128269760050ull};
+/* "geom:undisplaced" */
+ccl_device_constant DeviceString u_geom_undisplaced = {12431586303019276305ull};
+/* "geom:is_smooth" */
+ccl_device_constant DeviceString u_is_smooth = {857544214094480123ull};
+/* "geom:is_curve" */
+ccl_device_constant DeviceString u_is_curve = {129742495633653138ull};
+/* "geom:curve_thickness" */
+ccl_device_constant DeviceString u_curve_thickness = {10605802038397633852ull};
+/* "geom:curve_length" */
+ccl_device_constant DeviceString u_curve_length = {11423459517663715453ull};
+/* "geom:curve_tangent_normal" */
+ccl_device_constant DeviceString u_curve_tangent_normal = {12301397394034985633ull};
+/* "geom:curve_random" */
+ccl_device_constant DeviceString u_curve_random = {15293085049960492358ull};
+/* "geom:is_point" */
+ccl_device_constant DeviceString u_is_point = {2511357849436175953ull};
+/* "geom:point_radius" */
+ccl_device_constant DeviceString u_point_radius = {9956381140398668479ull};
+/* "geom:point_position" */
+ccl_device_constant DeviceString u_point_position = {15684484280742966916ull};
+/* "geom:point_random" */
+ccl_device_constant DeviceString u_point_random = {5632627207092325544ull};
+/* "geom:normal_map_normal" */
+ccl_device_constant DeviceString u_normal_map_normal = {10718948685686827073ull};
+/* "path:ray_length" */
+ccl_device_constant DeviceString u_path_ray_length = {16391985802412544524ull};
+/* "path:ray_depth" */
+ccl_device_constant DeviceString u_path_ray_depth = {16643933224879500399ull};
+/* "path:diffuse_depth" */
+ccl_device_constant DeviceString u_path_diffuse_depth = {13191651286699118408ull};
+/* "path:glossy_depth" */
+ccl_device_constant DeviceString u_path_glossy_depth = {15717768399057252940ull};
+/* "path:transparent_depth" */
+ccl_device_constant DeviceString u_path_transparent_depth = {7821650266475578543ull};
+/* "path:transmission_depth" */
+ccl_device_constant DeviceString u_path_transmission_depth = {15113408892323917624ull};
+
+}  // namespace DeviceStrings
+
+/* Closure */
+
+/* Multiply a closure by a color weight.
+ *
+ * A zero weight or a null closure yields null, and a weight of one yields `a` unchanged. In
+ * every other case an OSLClosureMul node referencing `a` is reserved from the closure pool of
+ * the ShaderData behind `sg->renderstate` and returned. */
+ccl_device_extern ccl_private OSLClosure *osl_mul_closure_color(ccl_private ShaderGlobals *sg,
+                                                                ccl_private OSLClosure *a,
+                                                                ccl_private const float3 *weight)
+{
+  if (*weight == zero_float3() || !a) {
+    return nullptr;
+  }
+  if (*weight == one_float3()) {
+    return a;
+  }
+
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  /* Round the pool cursor up to the alignment the node type requires. */
+  const size_t node_align = alignof(OSLClosureMul);
+  const size_t node_addr = (reinterpret_cast<size_t>(sd->osl_closure_pool) + node_align - 1) &
+                           (-node_align);
+  ccl_private uint8_t *const node_mem = reinterpret_cast<uint8_t *>(node_addr);
+
+  /* Bump the cursor past the node that was just reserved. */
+  sd->osl_closure_pool = node_mem + sizeof(OSLClosureMul);
+
+  ccl_private OSLClosureMul *const node = reinterpret_cast<ccl_private OSLClosureMul *>(
+      node_mem);
+  node->id = OSL_CLOSURE_MUL_ID;
+  node->weight = *weight;
+  node->closure = a;
+
+  return node;
+}
+
+/* Multiply a closure by a scalar weight.
+ *
+ * A zero weight or a null closure yields null, and a weight of one yields `a` unchanged. In
+ * every other case an OSLClosureMul node referencing `a` is reserved from the closure pool of
+ * the ShaderData behind `sg->renderstate`; the scalar is broadcast to all three channels. */
+ccl_device_extern ccl_private OSLClosure *osl_mul_closure_float(ccl_private ShaderGlobals *sg,
+                                                                ccl_private OSLClosure *a,
+                                                                float weight)
+{
+  if (weight == 0.0f || !a) {
+    return nullptr;
+  }
+  else if (weight == 1.0f) {
+    return a;
+  }
+
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  /* Same address-space qualifier as the other closure allocators in this file. */
+  ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+  /* Align pointer to closure struct requirement */
+  closure_pool = reinterpret_cast<uint8_t *>(
+      (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureMul) - 1) &
+      (-alignof(OSLClosureMul)));
+  /* Advance the pool cursor past the node reserved here. */
+  sd->osl_closure_pool = closure_pool + sizeof(OSLClosureMul);
+
+  ccl_private OSLClosureMul *const closure = reinterpret_cast<ccl_private OSLClosureMul *>(
+      closure_pool);
+  closure->id = OSL_CLOSURE_MUL_ID;
+  closure->weight = make_float3(weight, weight, weight);
+  closure->closure = a;
+
+  return closure;
+}
+
+/* Sum two closures.
+ *
+ * When either operand is null the other one is returned as-is (which may itself be null).
+ * Otherwise an OSLClosureAdd node pointing at both operands is reserved from the closure pool
+ * of the ShaderData behind `sg->renderstate`. */
+ccl_device_extern ccl_private OSLClosure *osl_add_closure_closure(ccl_private ShaderGlobals *sg,
+                                                                  ccl_private OSLClosure *a,
+                                                                  ccl_private OSLClosure *b)
+{
+  if (!a) {
+    return b;
+  }
+  if (!b) {
+    return a;
+  }
+
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  /* Round the pool cursor up to the alignment the node type requires. */
+  const size_t node_align = alignof(OSLClosureAdd);
+  const size_t node_addr = (reinterpret_cast<size_t>(sd->osl_closure_pool) + node_align - 1) &
+                           (-node_align);
+  ccl_private uint8_t *const node_mem = reinterpret_cast<uint8_t *>(node_addr);
+
+  /* Bump the cursor past the node that was just reserved. */
+  sd->osl_closure_pool = node_mem + sizeof(OSLClosureAdd);
+
+  ccl_private OSLClosureAdd *const node = reinterpret_cast<ccl_private OSLClosureAdd *>(
+      node_mem);
+  node->id = OSL_CLOSURE_ADD_ID;
+  node->closureA = a;
+  node->closureB = b;
+
+  return node;
+}
+
+/* Reserve a closure component with `size` bytes of trailing storage.
+ *
+ * The node is taken from the closure pool of the ShaderData behind `sg->renderstate`; its id is
+ * set from `id` and its weight is initialized to one. The trailing `size` bytes are reserved
+ * but not written here. */
+ccl_device_extern ccl_private OSLClosure *osl_allocate_closure_component(
+    ccl_private ShaderGlobals *sg, int id, int size)
+{
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  /* Round the pool cursor up to the alignment the node type requires. */
+  const size_t node_align = alignof(OSLClosureComponent);
+  const size_t node_addr = (reinterpret_cast<size_t>(sd->osl_closure_pool) + node_align - 1) &
+                           (-node_align);
+  ccl_private uint8_t *const node_mem = reinterpret_cast<uint8_t *>(node_addr);
+
+  /* Bump the cursor past the header plus the requested trailing storage. */
+  sd->osl_closure_pool = node_mem + sizeof(OSLClosureComponent) + size;
+
+  ccl_private OSLClosureComponent *const node =
+      reinterpret_cast<ccl_private OSLClosureComponent *>(node_mem);
+  node->id = static_cast<OSLClosureType>(id);
+  node->weight = one_float3();
+
+  return node;
+}
+
+/* Reserve a closure component with `size` bytes of trailing storage and an explicit weight.
+ *
+ * The node is taken from the closure pool of the ShaderData behind `sg->renderstate`; its id is
+ * set from `id` and its weight is copied from `*weight`. The trailing `size` bytes are reserved
+ * but not written here. */
+ccl_device_extern ccl_private OSLClosure *osl_allocate_weighted_closure_component(
+    ccl_private ShaderGlobals *sg, int id, int size, ccl_private const float3 *weight)
+{
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  /* Round the pool cursor up to the alignment the node type requires. */
+  const size_t node_align = alignof(OSLClosureComponent);
+  const size_t node_addr = (reinterpret_cast<size_t>(sd->osl_closure_pool) + node_align - 1) &
+                           (-node_align);
+  ccl_private uint8_t *const node_mem = reinterpret_cast<uint8_t *>(node_addr);
+
+  /* Bump the cursor past the header plus the requested trailing storage. */
+  sd->osl_closure_pool = node_mem + sizeof(OSLClosureComponent) + size;
+
+  ccl_private OSLClosureComponent *const node =
+      reinterpret_cast<ccl_private OSLClosureComponent *>(node_mem);
+  node->id = static_cast<OSLClosureType>(id);
+  node->weight = *weight;
+
+  return node;
+}
+
+/* Utilities */
+
+#include "kernel/svm/math_util.h"
+#include "kernel/util/color.h"
+
+/* Shader `error()` hook. The body is empty, so the message is discarded on the device. */
+ccl_device_extern void osl_error(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+/* Shader `printf()` hook. The body is empty, so nothing is printed on the device. */
+ccl_device_extern void osl_printf(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+/* Shader `warning()` hook. The body is empty, so the message is discarded on the device. */
+ccl_device_extern void osl_warning(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+/* Clamp an array index into `[0, length - 1]`.
+ *
+ * Negative indices become 0 and indices at or past `length` become `length - 1`. All other
+ * arguments (symbol, source location, group/layer/shader names) are accepted but unused; the
+ * diagnostic that would report an out-of-range index is compiled out. */
+ccl_device_extern uint osl_range_check(int indexvalue,
+                                       int length,
+                                       DeviceString symname,
+                                       ccl_private ShaderGlobals *sg,
+                                       DeviceString sourcefile,
+                                       int sourceline,
+                                       DeviceString groupname,
+                                       int layer,
+                                       DeviceString layername,
+                                       DeviceString shadername)
+{
+  int clamped = indexvalue;
+  if (indexvalue < 0) {
+    clamped = 0;
+  }
+  else if (indexvalue >= length) {
+    clamped = length - 1;
+  }
+#if 0
+  if (clamped != indexvalue) {
+    printf("Index [%d] out of range\n", indexvalue);
+  }
+#endif
+  return clamped;
+}
+
+/* Variant of osl_range_check with the same signature. It forwards every argument unchanged and
+ * returns the clamped index computed there. */
+ccl_device_extern uint osl_range_check_err(int indexvalue,
+                                           int length,
+                                           DeviceString symname,
+                                           ccl_private ShaderGlobals *sg,
+                                           DeviceString sourcefile,
+                                           int sourceline,
+                                           DeviceString groupname,
+                                           int layer,
+                                           DeviceString layername,
+                                           DeviceString shadername)
+{
+  return osl_range_check(indexvalue,
+                         length,
+                         symname,
+                         sg,
+                         sourcefile,
+                         sourceline,
+                         groupname,
+                         layer,
+                         layername,
+                         shadername);
+}
+
+/* Color Utilities */
+
+/* Write the blackbody color for `temperature` to `*result`.
+ *
+ * The Rec.709 color from svm_math_blackbody_color_rec709() is passed through rec709_to_rgb()
+ * (called with a null first argument) and negative channels are clamped to zero. */
+ccl_device_extern void osl_blackbody_vf(ccl_private ShaderGlobals *sg,
+                                        ccl_private float3 *result,
+                                        float temperature)
+{
+  const float3 rec709 = svm_math_blackbody_color_rec709(temperature);
+  const float3 scene_rgb = rec709_to_rgb(nullptr, rec709);
+  *result = max(scene_rgb, zero_float3());
+}
+
+#if 0
+ccl_device_extern void osl_wavelength_color_vf(ccl_private ShaderGlobals *sg,
+                                                   ccl_private float3 *result,
+                                                   float wavelength)
+{
+}
+#endif
+
+/* Write the luminance of `*color` to `*result`, as computed by linear_rgb_to_gray() called
+ * with a null first argument. */
+ccl_device_extern void osl_luminance_fv(ccl_private ShaderGlobals *sg,
+                                        ccl_private float *result,
+                                        ccl_private float3 *color)
+{
+  *result = linear_rgb_to_gray(nullptr, *color);
+}
+
+/* Luminance of a color with derivatives.
+ *
+ * `color` and `result` each hold three consecutive entries (value followed by two
+ * derivatives); osl_luminance_fv() is applied to every entry independently. */
+ccl_device_extern void osl_luminance_dfdv(ccl_private float *result,
+                                          ccl_private float3 *color)
+{
+  osl_luminance_fv(sg, &result[0], &color[0]);
+  osl_luminance_fv(sg, &result[1], &color[1]);
+  osl_luminance_fv(sg, &result[2], &color[2]);
+}
+
+/* Convert `*res` in place from the color space named by `from` to RGB.
+ *
+ * Handles "hsv", "hsl", "XYZ" and "xyY"; for any other name the color is left untouched. */
+ccl_device_extern void osl_prepend_color_from(ccl_private ShaderGlobals *sg,
+                                              ccl_private float3 *res,
+                                              DeviceString from)
+{
+  if (from == DeviceStrings::u_hsv) {
+    *res = hsv_to_rgb(*res);
+    return;
+  }
+  if (from == DeviceStrings::u_hsl) {
+    *res = hsl_to_rgb(*res);
+    return;
+  }
+  if (from == DeviceStrings::u_xyz) {
+    *res = xyz_to_rgb(nullptr, *res);
+    return;
+  }
+  if (from == DeviceStrings::u_xyy) {
+    /* xyY is first expanded to XYZ, then converted like the XYZ case. */
+    const float3 xyz = xyY_to_xyz(res->x, res->y, res->z);
+    *res = xyz_to_rgb(nullptr, xyz);
+  }
+}
+
+/* Convert a color (optionally with derivatives) between named color spaces.
+ *
+ * `c_in` / `c_out` hold one entry, or three (value plus two derivatives) when the matching
+ * `*_derivs` flag is set. Each processed entry is converted from `from` to RGB and then from
+ * RGB to `to`; unrecognized names on either side are treated as RGB. When the output carries
+ * derivatives but the input does not, the output derivatives are zeroed.
+ *
+ * Always returns true: every input produces a defined output, unknown spaces passing through
+ * unchanged. */
+ccl_device_extern bool osl_transformc(ccl_private ShaderGlobals *sg,
+                                      ccl_private float3 *c_in,
+                                      int c_in_derivs,
+                                      ccl_private float3 *c_out,
+                                      int c_out_derivs,
+                                      DeviceString from,
+                                      DeviceString to)
+{
+  if (!c_out_derivs) {
+    /* Nobody reads output derivatives, so do not bother converting input ones. */
+    c_in_derivs = false;
+  }
+  else if (!c_in_derivs) {
+    c_out[1] = zero_float3();
+    c_out[2] = zero_float3();
+  }
+
+  float3 rgb;
+
+  for (int i = 0; i < (c_in_derivs ? 3 : 1); ++i) {
+    /* Source space -> RGB. */
+    if (from == DeviceStrings::u_hsv) {
+      rgb = hsv_to_rgb(c_in[i]);
+    }
+    else if (from == DeviceStrings::u_hsl) {
+      rgb = hsl_to_rgb(c_in[i]);
+    }
+    else if (from == DeviceStrings::u_xyz) {
+      rgb = xyz_to_rgb(nullptr, c_in[i]);
+    }
+    else if (from == DeviceStrings::u_xyy) {
+      rgb = xyz_to_rgb(nullptr, xyY_to_xyz(c_in[i].x, c_in[i].y, c_in[i].z));
+    }
+    else if (from == DeviceStrings::u_srgb) {
+      rgb = color_srgb_to_linear_v3(c_in[i]);
+    }
+    else {
+      rgb = c_in[i];
+    }
+
+    /* RGB -> destination space. */
+    if (to == DeviceStrings::u_hsv) {
+      c_out[i] = rgb_to_hsv(rgb);
+    }
+    else if (to == DeviceStrings::u_hsl) {
+      c_out[i] = rgb_to_hsl(rgb);
+    }
+#if 0
+    else if (to == DeviceStrings::u_xyz) {
+      c_out[i] = rgb_to_xyz(nullptr, rgb);
+    }
+    else if (to == DeviceStrings::u_xyy) {
+      c_out[i] = xyz_to_xyY(rgb_to_xyz(nullptr, rgb));
+    }
+#endif
+    else if (to == DeviceStrings::u_srgb) {
+      c_out[i] = color_linear_to_srgb_v3(rgb);
+    }
+    else {
+      c_out[i] = rgb;
+    }
+  }
+
+  /* The function is declared `bool`; falling off the end without a value is undefined
+   * behavior, so report success explicitly. */
+  return true;
+}
+
+/* Matrix Utilities */
+
+#include "util/transform.h"
+
+/* Write a Transform into a 16-float matrix, transposing on the way: row `x`/`y`/`z` of `tfm`
+ * lands in elements 0/1/2 of each group of four, and the fourth element of every group is
+ * filled with (0, 0, 0, 1). */
+ccl_device_forceinline void copy_matrix(ccl_private float *res, const Transform &tfm)
+{
+  /* Row x of the transform. */
+  res[0] = tfm.x.x;
+  res[4] = tfm.x.y;
+  res[8] = tfm.x.z;
+  res[12] = tfm.x.w;
+  /* Row y of the transform. */
+  res[1] = tfm.y.x;
+  res[5] = tfm.y.y;
+  res[9] = tfm.y.z;
+  res[13] = tfm.y.w;
+  /* Row z of the transform. */
+  res[2] = tfm.z.x;
+  res[6] = tfm.z.y;
+  res[10] = tfm.z.z;
+  res[14] = tfm.z.w;
+  /* Implicit last row of an affine transform. */
+  res[3] = 0.0f;
+  res[7] = 0.0f;
+  res[11] = 0.0f;
+  res[15] = 1.0f;
+}
+/* Write a ProjectionTransform into a 16-float matrix, transposing on the way: row
+ * `x`/`y`/`z`/`w` of `tfm` lands in elements 0/1/2/3 of each group of four. */
+ccl_device_forceinline void copy_matrix(ccl_private float *res, const ProjectionTransform &tfm)
+{
+  /* Row x of the transform. */
+  res[0] = tfm.x.x;
+  res[4] = tfm.x.y;
+  res[8] = tfm.x.z;
+  res[12] = tfm.x.w;
+  /* Row y of the transform. */
+  res[1] = tfm.y.x;
+  res[5] = tfm.y.y;
+  res[9] = tfm.y.z;
+  res[13] = tfm.y.w;
+  /* Row z of the transform. */
+  res[2] = tfm.z.x;
+  res[6] = tfm.z.y;
+  res[10] = tfm.z.z;
+  res[14] = tfm.z.w;
+  /* Row w of the transform. */
+  res[3] = tfm.w.x;
+  res[7] = tfm.w.y;
+  res[11] = tfm.w.z;
+  res[15] = tfm.w.w;
+}
+/* Fill a 16-float matrix with the 4x4 identity. */
+ccl_device_forceinline void copy_identity_matrix(ccl_private float *res)
+{
+  for (int idx = 0; idx < 16; ++idx) {
+    /* Diagonal elements of a flattened 4x4 sit at indices 0, 5, 10 and 15. */
+    res[idx] = (idx % 5 == 0) ? 1.0f : 0.0f;
+  }
+}
+/* Build a Transform from a 16-float matrix. Elements are read with a stride of four (0, 4, 8,
+ * 12 for the first row, and so on), i.e. the inverse of the layout written by copy_matrix();
+ * elements 3, 7, 11 and 15 are not read. */
+ccl_device_forceinline Transform convert_transform(ccl_private const float *m)
+{
+  return make_transform(
+      m[0], m[4], m[8], m[12], m[1], m[5], m[9], m[13], m[2], m[6], m[10], m[14]);
+}
+
+/* Matrix product `res = a * b` on 16-float matrices.
+ *
+ * Both operands are converted to Transform values before `res` is written, so `res` may alias
+ * either input. Only the elements read by convert_transform() take part in the product. */
+ccl_device_extern void osl_mul_mmm(ccl_private float *res,
+                                   ccl_private const float *a,
+                                   ccl_private const float *b)
+{
+  const Transform lhs = convert_transform(a);
+  const Transform rhs = convert_transform(b);
+  const Transform product = lhs * rhs;
+  copy_matrix(res, product);
+}
+
+/* Scale every element of the 16-float matrix `a` by `b` and store the result in `res`. */
+ccl_device_extern void osl_mul_mmf(ccl_private float *res, ccl_private const float *a, float b)
+{
+  int elem = 0;
+  while (elem < 16) {
+    res[elem] = a[elem] * b;
+    ++elem;
+  }
+}
+
+/* Matrix "division" `res = a * inverse(b)` on 16-float matrices.
+ *
+ * Both operands are converted to Transform values before `res` is written, so `res` may alias
+ * either input. */
+ccl_device_extern void osl_div_mmm(ccl_private float *res,
+                                   ccl_private const float *a,
+                                   ccl_private const float *b)
+{
+  const Transform numerator = convert_transform(a);
+  const Transform denominator = convert_transform(b);
+  const Transform quotient = numerator * transform_inverse(denominator);
+  copy_matrix(res, quotient);
+}
+
+/* Divide every element of the 16-float matrix `a` by `b` and store the result in `res`.
+ * No check is made for `b` being zero. */
+ccl_device_extern void osl_div_mmf(ccl_private float *res, ccl_private const float *a, float b)
+{
+  int elem = 0;
+  while (elem < 16) {
+    res[elem] = a[elem] / b;
+    ++elem;
+  }
+}
+
+/* Scalar divided by matrix: `res = a * inverse(b)`, with the scale applied to all 16 elements
+ * of the inverted matrix. */
+ccl_device_extern void osl_div_mfm(ccl_private float *res, float a, ccl_private const float *b)
+{
+  const Transform inverted = transform_inverse(convert_transform(b));
+  copy_matrix(res, inverted);
+
+  int elem = 0;
+  while (elem < 16) {
+    res[elem] *= a;
+    ++elem;
+  }
+}
+
+/* Build a 16-float matrix with `a / b` on the diagonal and zero elsewhere.
+ * A zero divisor produces an all-zero matrix instead of dividing. */
+ccl_device_extern void osl_div_m_ff(ccl_private float *res, float a, float b)
+{
+  float diag;
+  if (b == 0) {
+    diag = 0.0f;
+  }
+  else {
+    diag = a / b;
+  }
+
+  for (int idx = 0; idx < 16; ++idx) {
+    /* Diagonal elements of a flattened 4x4 sit at indices 0, 5, 10 and 15. */
+    res[idx] = (idx % 5 == 0) ? diag : 0.0f;
+  }
+}
+
+/* Transform the point `*v` by the 16-float matrix `m` (converted with convert_transform())
+ * using transform_point(), and store the result in `*res`. */
+ccl_device_extern void osl_transform_vmv(ccl_private float3 *res,
+                                         ccl_private const float *m,
+                                         ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  *res = transform_point(&tfm_m, *v);
+}
+
+/* Transform a point with derivatives by the 16-float matrix `m`.
+ *
+ * `v` and `res` hold three entries: the value followed by two derivatives. `m` is a single
+ * matrix (osl_transform_triple() passes a `float[16]`), so it must not be indexed per entry.
+ * The value is transformed as a point; the derivatives are differences of points and are
+ * therefore transformed as directions, which leaves out the translation. */
+ccl_device_extern void osl_transform_dvmdv(ccl_private float3 *res,
+                                           ccl_private const float *m,
+                                           ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  res[0] = transform_point(&tfm_m, v[0]);
+  res[1] = transform_direction(&tfm_m, v[1]);
+  res[2] = transform_direction(&tfm_m, v[2]);
+}
+
+/* Transform the vector `*v` by the 16-float matrix `m` (converted with convert_transform())
+ * using transform_direction(), and store the result in `*res`. */
+ccl_device_extern void osl_transformv_vmv(ccl_private float3 *res,
+                                          ccl_private const float *m,
+                                          ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  *res = transform_direction(&tfm_m, *v);
+}
+
+/* Transform a vector with derivatives by the 16-float matrix `m`.
+ *
+ * `v` and `res` hold three entries: the value followed by two derivatives. `m` is a single
+ * matrix (osl_transform_triple() passes a `float[16]`), so the same transform is applied to
+ * every entry instead of reading a separate matrix per entry. */
+ccl_device_extern void osl_transformv_dvmdv(ccl_private float3 *res,
+                                            ccl_private const float *m,
+                                            ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  for (int i = 0; i < 3; ++i) {
+    res[i] = transform_direction(&tfm_m, v[i]);
+  }
+}
+
+/* Transform the normal `*v` by the 16-float matrix `m` (converted with convert_transform())
+ * and store the result in `*res`.
+ *
+ * NOTE(review): this uses transform_direction(), the same operation as osl_transformv_vmv().
+ * Normals are usually transformed by the inverse transpose; confirm whether that difference
+ * is intended here. */
+ccl_device_extern void osl_transformn_vmv(ccl_private float3 *res,
+                                          ccl_private const float *m,
+                                          ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  *res = transform_direction(&tfm_m, *v);
+}
+
+/* Transform a normal with derivatives by the 16-float matrix `m`.
+ *
+ * `v` and `res` hold three entries: the value followed by two derivatives. `m` is a single
+ * matrix (osl_transform_triple() passes a `float[16]`), so the same transform is applied to
+ * every entry instead of reading a separate matrix per entry.
+ *
+ * NOTE(review): like osl_transformn_vmv() this uses transform_direction() rather than an
+ * inverse-transpose transform; confirm whether that is intended for normals. */
+ccl_device_extern void osl_transformn_dvmdv(ccl_private float3 *res,
+                                            ccl_private const float *m,
+                                            ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  for (int i = 0; i < 3; ++i) {
+    res[i] = transform_direction(&tfm_m, v[i]);
+  }
+}
+
+/* Fetch the matrix that maps the named space `from` to world space.
+ *
+ * Recognizes "NDC", "raster", "screen" and "camera" (read from `kernel_data.cam`) and "world"
+ * (identity). Returns true when `result` was written, false for any other name. */
+ccl_device_extern bool osl_get_matrix(ccl_private ShaderGlobals *sg,
+                                      ccl_private float *result,
+                                      DeviceString from)
+{
+  if (from == DeviceStrings::u_ndc) {
+    copy_matrix(result, kernel_data.cam.ndctoworld);
+  }
+  else if (from == DeviceStrings::u_raster) {
+    copy_matrix(result, kernel_data.cam.rastertoworld);
+  }
+  else if (from == DeviceStrings::u_screen) {
+    copy_matrix(result, kernel_data.cam.screentoworld);
+  }
+  else if (from == DeviceStrings::u_camera) {
+    copy_matrix(result, kernel_data.cam.cameratoworld);
+  }
+  else if (from == DeviceStrings::u_world) {
+    copy_identity_matrix(result);
+  }
+  else {
+    /* Unknown space: leave `result` untouched. */
+    return false;
+  }
+
+  return true;
+}
+
+/* Fetch the matrix that maps world space to the named space `to`.
+ *
+ * Recognizes "NDC", "raster", "screen" and "camera" (read from `kernel_data.cam`) and "world"
+ * (identity). Returns true when `res` was written, false for any other name. */
+ccl_device_extern bool osl_get_inverse_matrix(ccl_private ShaderGlobals *sg,
+                                              ccl_private float *res,
+                                              DeviceString to)
+{
+  if (to == DeviceStrings::u_ndc) {
+    copy_matrix(res, kernel_data.cam.worldtondc);
+  }
+  else if (to == DeviceStrings::u_raster) {
+    copy_matrix(res, kernel_data.cam.worldtoraster);
+  }
+  else if (to == DeviceStrings::u_screen) {
+    copy_matrix(res, kernel_data.cam.worldtoscreen);
+  }
+  else if (to == DeviceStrings::u_camera) {
+    copy_matrix(res, kernel_data.cam.worldtocamera);
+  }
+  else if (to == DeviceStrings::u_world) {
+    copy_identity_matrix(res);
+  }
+  else {
+    /* Unknown space: leave `res` untouched. */
+    return false;
+  }
+
+  return true;
+}
+
+/* Build the matrix that maps space `from` to space `to`, as the product (via osl_mul_mmm())
+ * of the from-to-world and world-to-to matrices.
+ *
+ * Returns false, without writing `res`, when either space name is not recognized. The second
+ * lookup is skipped if the first one fails. */
+ccl_device_extern bool osl_get_from_to_matrix(ccl_private ShaderGlobals *sg,
+                                              ccl_private float *res,
+                                              DeviceString from,
+                                              DeviceString to)
+{
+  float m_from[16], m_to[16];
+
+  if (!osl_get_matrix(sg, m_from, from)) {
+    return false;
+  }
+  if (!osl_get_inverse_matrix(sg, m_to, to)) {
+    return false;
+  }
+
+  osl_mul_mmm(res, m_from, m_to);
+  return true;
+}
+
+/* Replace `res` with `M * res`, where `M` is the from-to-world matrix of the space `from`.
+ * `res` is left untouched when the space name is not recognized. */
+ccl_device_extern void osl_prepend_matrix_from(ccl_private ShaderGlobals *sg,
+                                               ccl_private float *res,
+                                               DeviceString from)
+{
+  float from_to_world[16];
+  if (!osl_get_matrix(sg, from_to_world, from)) {
+    return;
+  }
+
+  /* osl_mul_mmm() reads both operands before writing, so passing `res` twice is fine. */
+  osl_mul_mmm(res, from_to_world, res);
+}
+
+/* Transform a point, vector or normal (optionally with derivatives) between named spaces.
+ *
+ * `p_in` / `p_out` hold one entry, or three (value plus two derivatives) when the matching
+ * `*_derivs` flag is set. `vectype` selects the transform flavor: 2 = point, 3 = vector,
+ * 4 = normal.
+ *
+ * Returns true when a matrix for the requested spaces was found and `vectype` was one of the
+ * values above. When no matrix is found the input is copied to the output and false is
+ * returned. When a matrix is found but `vectype` is not recognized, false is returned and
+ * `p_out[0]` is not written. */
+ccl_device_extern bool osl_transform_triple(ccl_private ShaderGlobals *sg,
+                                            ccl_private float3 *p_in,
+                                            int p_in_derivs,
+                                            ccl_private float3 *p_out,
+                                            int p_out_derivs,
+                                            DeviceString from,
+                                            DeviceString to,
+                                            int vectype)
+{
+  if (!p_out_derivs) {
+    /* Output derivatives are not wanted, so input derivatives are ignored too. */
+    p_in_derivs = false;
+  }
+  else if (!p_in_derivs) {
+    /* Output wants derivatives but the input has none: they are zero. */
+    p_out[1] = zero_float3();
+    p_out[2] = zero_float3();
+  }
+
+  bool res;
+  float m[16];
+
+  /* "common" needs no matrix on its own side, so only the other side is looked up. */
+  if (from == DeviceStrings::u_common) {
+    res = osl_get_inverse_matrix(sg, m, to);
+  }
+  else if (to == DeviceStrings::u_common) {
+    res = osl_get_matrix(sg, m, from);
+  }
+  else {
+    res = osl_get_from_to_matrix(sg, m, from, to);
+  }
+
+  if (res) {
+    /* `m` is a single 16-float matrix, shared by the value and its derivatives. */
+    if (vectype == 2 /* TypeDesc::POINT */) {
+      if (p_in_derivs)
+        osl_transform_dvmdv(p_out, m, p_in);
+      else
+        osl_transform_vmv(p_out, m, p_in);
+    }
+    else if (vectype == 3 /* TypeDesc::VECTOR */) {
+      if (p_in_derivs)
+        osl_transformv_dvmdv(p_out, m, p_in);
+      else
+        osl_transformv_vmv(p_out, m, p_in);
+    }
+    else if (vectype == 4 /* TypeDesc::NORMAL */) {
+      if (p_in_derivs)
+        osl_transformn_dvmdv(p_out, m, p_in);
+      else
+        osl_transformn_vmv(p_out, m, p_in);
+    }
+    else {
+      res = false;
+    }
+  }
+  else {
+    /* No matrix available: pass the input through unchanged. */
+    p_out[0] = p_in[0];
+    if (p_in_derivs) {
+      p_out[1] = p_in[1];
+      p_out[2] = p_in[2];
+    }
+  }
+
+  return res;
+}
+
+/* Entry point for transforms flagged as non-linear. It forwards every argument unchanged to
+ * osl_transform_triple() and returns its result. */
+ccl_device_extern bool osl_transform_triple_nonlinear(ccl_private ShaderGlobals *sg,
+                                                      ccl_private float3 *p_in,
+                                                      int p_in_derivs,
+                                                      ccl_private float3 *p_out,
+                                                      int p_out_derivs,
+                                                      DeviceString from,
+                                                      DeviceString to,
+                                                      int vectype)
+{
+  return osl_transform_triple(sg, p_in, p_in_derivs, p_out, p_out_derivs, from, to, vectype);
+}
+
+/* Transpose the 16-float matrix `m` into `res`: `res[c * 4 + r] = m[r * 4 + c]`.
+ *
+ * The input is copied to a local array first so that an in-place transpose (`res == m`) does
+ * not read elements that were already overwritten. Working on plain floats also avoids
+ * reinterpreting the float array as a ProjectionTransform. */
+ccl_device_extern void osl_transpose_mm(ccl_private float *res, ccl_private const float *m)
+{
+  float src[16];
+  for (int i = 0; i < 16; ++i) {
+    src[i] = m[i];
+  }
+
+  for (int r = 0; r < 4; ++r) {
+    for (int c = 0; c < 4; ++c) {
+      res[c * 4 + r] = src[r * 4 + c];
+    }
+  }
+}
+
+#if 0
+ccl_device_extern float osl_determinant_fm(ccl_private const float *m)
+{
+}
+#endif
+
+/* Attributes */
+
+#include "kernel/geom/geom.h"
+
+typedef long long TypeDesc;
+
+/* Store a float attribute (with optional derivatives) into the buffer `val`, converting to the
+ * requested type.
+ *
+ * `type` is a packed type descriptor decoded here as: base type in the low 4 bits, aggregate
+ * in bits 8..11, array length in the upper 32 bits. Only base type 11 (float) is handled.
+ * `fval` holds the value followed by two derivatives; three entries are written when
+ * `derivatives` is true, otherwise one.
+ *
+ * Destination handling: 2/3-component types get the float replicated to every component;
+ * 4-component types get it replicated to the first three with the fourth set to 1; a plain
+ * scalar gets the float itself. Scalar arrays of length 2, 3 or 4 are treated like the
+ * matching vector type. Returns false, writing nothing, for any other type. */
+ccl_device_inline bool set_attribute_float(ccl_private float fval[3],
+                                           TypeDesc type,
+                                           bool derivatives,
+                                           ccl_private void *val)
+{
+  const unsigned char type_basetype = type & 0xF;
+  const unsigned char type_aggregate = (type >> 8) & 0xF;
+  const int type_arraylen = type >> 32;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+    if ((type_aggregate == 2 /* TypeDesc::VEC2 */) ||
+        (type_aggregate == 1 && type_arraylen == 2)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i];
+      }
+      return true;
+    }
+    if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+        (type_aggregate == 1 && type_arraylen == 3)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i];
+      }
+      return true;
+    }
+    if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+        (type_aggregate == 1 && type_arraylen == 4)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i];
+        /* Fourth component is fixed at 1, for derivative entries as well. */
+        static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+      }
+      return true;
+    }
+    /* Checked last: scalar arrays of length 2..4 were already handled above. */
+    if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i] = fval[i];
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+/* Convenience overload for a float without derivatives: the value is paired with two zero
+ * derivatives and handed to the array overload of set_attribute_float(). */
+ccl_device_inline bool set_attribute_float(float f,
+                                           TypeDesc type,
+                                           bool derivatives,
+                                           ccl_private void *val)
+{
+  float value_and_derivs[3] = {f, 0.0f, 0.0f};
+  return set_attribute_float(value_and_derivs, type, derivatives, val);
+}
+/* Store a float2 attribute (with optional derivatives) into the buffer `val`, converting to
+ * the requested type.
+ *
+ * `type` is a packed type descriptor decoded here as: base type in the low 4 bits, aggregate
+ * in bits 8..11, array length in the upper 32 bits. Only base type 11 (float) is handled.
+ * `fval` holds the value followed by two derivatives; three entries are written when
+ * `derivatives` is true, otherwise one.
+ *
+ * Destination handling: 2-component types get (x, y); 3-component types get (x, y, 0);
+ * 4-component types get (x, y, 0, 1); a plain scalar gets x. Scalar arrays of length 2, 3 or
+ * 4 are treated like the matching vector type. Returns false, writing nothing, for any other
+ * type. */
+ccl_device_inline bool set_attribute_float2(ccl_private float2 fval[3],
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  const unsigned char type_basetype = type & 0xF;
+  const unsigned char type_aggregate = (type >> 8) & 0xF;
+  const int type_arraylen = type >> 32;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+    if ((type_aggregate == 2 /* TypeDesc::VEC2 */) ||
+        (type_aggregate == 1 && type_arraylen == 2)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i].y;
+      }
+      return true;
+    }
+    if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+        (type_aggregate == 1 && type_arraylen == 3)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y;
+        /* No third source component: pad with zero. */
+        static_cast<ccl_private float *>(val)[i * 3 + 2] = 0.0f;
+      }
+      return true;
+    }
+    if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+        (type_aggregate == 1 && type_arraylen == 4)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y;
+        static_cast<ccl_private float *>(val)[i * 4 + 2] = 0.0f;
+        /* Fourth component is fixed at 1, for derivative entries as well. */
+        static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+      }
+      return true;
+    }
+    /* Checked last: scalar arrays of length 2..4 were already handled above. */
+    if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i] = fval[i].x;
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+/* Copy a float3 attribute into `val`, laid out as requested by the packed `type` descriptor.
+ * fval[1] and fval[2] are only read when `derivatives` is set. Returns false when `type` is not
+ * a float scalar, vec3 or vec4 (or the equivalent float arrays of length 3 or 4). */
+ccl_device_inline bool set_attribute_float3(ccl_private float3 fval[3],
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  /* Unpack base type, aggregate kind and array length from the descriptor. */
+  const unsigned char basetype = type & 0xF;
+  const unsigned char aggregate = (type >> 8) & 0xF;
+  const int arraylen = type >> 32;
+
+  if (basetype != 11 /* TypeDesc::FLOAT */) {
+    return false;
+  }
+
+  ccl_private float *const out = static_cast<ccl_private float *>(val);
+  const int count = derivatives ? 3 : 1;
+  const bool is_scalar = (aggregate == 1 /* TypeDesc::SCALAR */);
+
+  if (aggregate == 3 /* TypeDesc::VEC3 */ || (is_scalar && arraylen == 3)) {
+    for (int k = 0; k < count; ++k) {
+      out[k * 3 + 0] = fval[k].x;
+      out[k * 3 + 1] = fval[k].y;
+      out[k * 3 + 2] = fval[k].z;
+    }
+    return true;
+  }
+
+  if (aggregate == 4 /* TypeDesc::VEC4 */ || (is_scalar && arraylen == 4)) {
+    /* Fourth component is padded with one. */
+    for (int k = 0; k < count; ++k) {
+      out[k * 4 + 0] = fval[k].x;
+      out[k * 4 + 1] = fval[k].y;
+      out[k * 4 + 2] = fval[k].z;
+      out[k * 4 + 3] = 1.0f;
+    }
+    return true;
+  }
+
+  if (is_scalar) {
+    /* Plain float destination: collapse the vector with average(). */
+    for (int k = 0; k < count; ++k) {
+      out[k] = average(fval[k]);
+    }
+    return true;
+  }
+
+  return false;
+}
+/* Convenience overload for a single float3 value: the two extra slots that the array overload
+ * reads when `derivatives` is set are filled with zero vectors. */
+ccl_device_inline bool set_attribute_float3(float3 f,
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  const float3 zero = make_float3(0.0f, 0.0f, 0.0f);
+
+  float3 padded[3];
+  padded[0] = f;
+  padded[1] = zero;
+  padded[2] = zero;
+
+  return set_attribute_float3(padded, type, derivatives, val);
+}
+/* Copy a float4 attribute into `val`, laid out as requested by the packed `type` descriptor.
+ * fval[1] and fval[2] are only read when `derivatives` is set. Returns false when `type` is not
+ * a float scalar, vec3 or vec4 (or the equivalent float arrays of length 3 or 4). */
+ccl_device_inline bool set_attribute_float4(ccl_private float4 fval[3],
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  /* Unpack base type, aggregate kind and array length from the descriptor. */
+  const unsigned char basetype = type & 0xF;
+  const unsigned char aggregate = (type >> 8) & 0xF;
+  const int arraylen = type >> 32;
+
+  if (basetype != 11 /* TypeDesc::FLOAT */) {
+    return false;
+  }
+
+  ccl_private float *const out = static_cast<ccl_private float *>(val);
+  const int count = derivatives ? 3 : 1;
+  const bool is_scalar = (aggregate == 1 /* TypeDesc::SCALAR */);
+
+  if (aggregate == 3 /* TypeDesc::VEC3 */ || (is_scalar && arraylen == 3)) {
+    /* The w component is dropped. */
+    for (int k = 0; k < count; ++k) {
+      out[k * 3 + 0] = fval[k].x;
+      out[k * 3 + 1] = fval[k].y;
+      out[k * 3 + 2] = fval[k].z;
+    }
+    return true;
+  }
+
+  if (aggregate == 4 /* TypeDesc::VEC4 */ || (is_scalar && arraylen == 4)) {
+    for (int k = 0; k < count; ++k) {
+      out[k * 4 + 0] = fval[k].x;
+      out[k * 4 + 1] = fval[k].y;
+      out[k * 4 + 2] = fval[k].z;
+      out[k * 4 + 3] = fval[k].w;
+    }
+    return true;
+  }
+
+  if (is_scalar) {
+    /* Plain float destination: average of the first three components. */
+    for (int k = 0; k < count; ++k) {
+      out[k] = average(float4_to_float3(fval[k]));
+    }
+    return true;
+  }
+
+  return false;
+}
+/* Copy a transform into `val` when the requested `type` is a float 4x4 matrix.
+ * Returns true when the matrix was written, false for any other requested type. */
+ccl_device_inline bool set_attribute_matrix(ccl_private const Transform &tfm,
+                                            TypeDesc type,
+                                            ccl_private void *val)
+{
+  /* Read both descriptor fields with an 8-bit mask. A 4-bit mask cannot represent
+   * TypeDesc::MATRIX44 (16): it would truncate to 0 and the comparison below could never
+   * succeed, so matrix attributes would always be rejected. */
+  const unsigned char type_basetype = type & 0xFF;
+  const unsigned char type_aggregate = (type >> 8) & 0xFF;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */ && type_aggregate == 16 /* TypeDesc::MATRIX44 */) {
+    copy_matrix(static_cast<ccl_private float *>(val), tfm);
+    return true;
+  }
+
+  return false;
+}
+
+/* Look up attributes that do not depend on an object. The only name handled here is the path
+ * ray length; any other name yields false. */
+ccl_device_inline bool get_background_attribute(KernelGlobals kg,
+                                                ccl_private ShaderData *sd,
+                                                DeviceString name,
+                                                TypeDesc type,
+                                                bool derivatives,
+                                                ccl_private void *val)
+{
+  if (name != DeviceStrings::u_path_ray_length) {
+    return false;
+  }
+
+  /* Ray Length */
+  const float ray_length = sd->ray_length;
+  return set_attribute_float(ray_length, type, derivatives, val);
+}
+
+/* Fetch the attribute described by `desc` for the current shading point and store it in `val`
+ * using the layout requested by `type`.
+ *
+ * The value goes in slot 0 of a three-element scratch array; slots 1 and 2 are handed to the
+ * surface attribute lookup as derivative outputs when `derivatives` is set. The volume lookups
+ * take no derivative outputs, so those slots are zeroed explicitly there: the set_attribute_*
+ * helpers copy all three slots when `derivatives` is set and must not read uninitialized data.
+ *
+ * Returns false for unsupported descriptor types, for float2 volume attributes, and whenever
+ * the requested `type` cannot hold the attribute. */
+ccl_device_inline bool get_object_attribute(KernelGlobals kg,
+                                            ccl_private ShaderData *sd,
+                                            const AttributeDescriptor &desc,
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  if (desc.type == NODE_ATTR_FLOAT) {
+    float fval[3];
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc)) {
+      fval[0] = primitive_volume_attribute_float(kg, sd, desc);
+      /* No derivatives available for volume attributes. */
+      fval[1] = 0.0f;
+      fval[2] = 0.0f;
+    }
+    else
+#endif
+    {
+      fval[0] = primitive_surface_attribute_float(
+          kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    }
+    return set_attribute_float(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_FLOAT2) {
+    float2 fval[3];
+#ifdef __VOLUME__
+    /* float2 volume attributes are not supported. */
+    if (primitive_is_volume_attribute(sd, desc)) {
+      return false;
+    }
+#endif
+    fval[0] = primitive_surface_attribute_float2(
+        kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    return set_attribute_float2(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_FLOAT3) {
+    float3 fval[3];
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc)) {
+      fval[0] = primitive_volume_attribute_float3(kg, sd, desc);
+      /* No derivatives available for volume attributes. */
+      fval[1] = make_float3(0.0f, 0.0f, 0.0f);
+      fval[2] = make_float3(0.0f, 0.0f, 0.0f);
+    }
+    else
+#endif
+    {
+      fval[0] = primitive_surface_attribute_float3(
+          kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    }
+    return set_attribute_float3(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_FLOAT4 || desc.type == NODE_ATTR_RGBA) {
+    float4 fval[3];
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc)) {
+      fval[0] = primitive_volume_attribute_float4(kg, sd, desc);
+      /* No derivatives available for volume attributes. */
+      fval[1] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+      fval[2] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    }
+    else
+#endif
+    {
+      fval[0] = primitive_surface_attribute_float4(
+          kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    }
+    return set_attribute_float4(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_MATRIX) {
+    Transform tfm = primitive_attribute_matrix(kg, desc);
+    return set_attribute_matrix(tfm, type, val);
+  }
+
+  return false;
+}
+
+/* Resolve a built-in attribute by name (object, particle, geometry, hair and point-cloud
+ * values) and store it in `val` using the layout requested by `type`. Names not handled here
+ * are forwarded to get_background_attribute(). */
+ccl_device_inline bool get_object_standard_attribute(KernelGlobals kg,
+                                                     ccl_private ShaderData *sd,
+                                                     DeviceString name,
+                                                     TypeDesc type,
+                                                     bool derivatives,
+                                                     ccl_private void *val)
+{
+  /* Every match returns immediately, so the lookups are written as independent guards. */
+
+  /* Object attributes */
+  if (name == DeviceStrings::u_object_location) {
+    const float3 location = object_location(kg, sd);
+    return set_attribute_float3(location, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_object_color) {
+    const float3 color = object_color(kg, sd->object);
+    return set_attribute_float3(color, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_object_alpha) {
+    const float alpha = object_alpha(kg, sd->object);
+    return set_attribute_float(alpha, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_object_index) {
+    const float pass_id = object_pass_id(kg, sd->object);
+    return set_attribute_float(pass_id, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_geom_dupli_generated) {
+    const float3 generated = object_dupli_generated(kg, sd->object);
+    return set_attribute_float3(generated, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_geom_dupli_uv) {
+    const float3 uv = object_dupli_uv(kg, sd->object);
+    return set_attribute_float3(uv, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_material_index) {
+    const float pass_id = shader_pass_id(kg, sd);
+    return set_attribute_float(pass_id, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_object_random) {
+    const float random = object_random_number(kg, sd->object);
+    return set_attribute_float(random, type, derivatives, val);
+  }
+
+  /* Particle attributes */
+  if (name == DeviceStrings::u_particle_index) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float index = particle_index(kg, particle);
+    return set_attribute_float(index, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_particle_random) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float random = hash_uint2_to_float(particle_index(kg, particle), 0);
+    return set_attribute_float(random, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_particle_age) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float age = particle_age(kg, particle);
+    return set_attribute_float(age, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_particle_lifetime) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float lifetime = particle_lifetime(kg, particle);
+    return set_attribute_float(lifetime, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_particle_location) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float3 location = particle_location(kg, particle);
+    return set_attribute_float3(location, type, derivatives, val);
+  }
+#if 0 /* unsupported */
+  if (name == DeviceStrings::u_particle_rotation) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float4 rotation = particle_rotation(kg, particle);
+    return set_attribute_float4(rotation, type, derivatives, val);
+  }
+#endif
+  if (name == DeviceStrings::u_particle_size) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float size = particle_size(kg, particle);
+    return set_attribute_float(size, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_particle_velocity) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float3 velocity = particle_velocity(kg, particle);
+    return set_attribute_float3(velocity, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_particle_angular_velocity) {
+    const int particle = object_particle_id(kg, sd->object);
+    const float3 angular_velocity = particle_angular_velocity(kg, particle);
+    return set_attribute_float3(angular_velocity, type, derivatives, val);
+  }
+
+  /* Geometry attributes */
+#if 0 /* TODO */
+  if (name == DeviceStrings::u_geom_numpolyvertices) {
+    return false;
+  }
+  if (name == DeviceStrings::u_geom_trianglevertices ||
+      name == DeviceStrings::u_geom_polyvertices) {
+    return false;
+  }
+  if (name == DeviceStrings::u_geom_name) {
+    return false;
+  }
+#endif
+  if (name == DeviceStrings::u_is_smooth) {
+    /* Boolean flag exposed as 0.0 / 1.0. */
+    const float is_smooth = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
+    return set_attribute_float(is_smooth, type, derivatives, val);
+  }
+
+#ifdef __HAIR__
+  /* Hair attributes */
+  if (name == DeviceStrings::u_is_curve) {
+    const float is_curve = (sd->type & PRIMITIVE_CURVE) != 0;
+    return set_attribute_float(is_curve, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_curve_thickness) {
+    const float thickness = curve_thickness(kg, sd);
+    return set_attribute_float(thickness, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_curve_tangent_normal) {
+    const float3 tangent_normal = curve_tangent_normal(kg, sd);
+    return set_attribute_float3(tangent_normal, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_curve_random) {
+    const float random = curve_random(kg, sd);
+    return set_attribute_float(random, type, derivatives, val);
+  }
+#endif
+
+#ifdef __POINTCLOUD__
+  /* Point attributes */
+  if (name == DeviceStrings::u_is_point) {
+    const float is_point = (sd->type & PRIMITIVE_POINT) != 0;
+    return set_attribute_float(is_point, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_point_radius) {
+    const float radius = point_radius(kg, sd);
+    return set_attribute_float(radius, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_point_position) {
+    const float3 position = point_position(kg, sd);
+    return set_attribute_float3(position, type, derivatives, val);
+  }
+  if (name == DeviceStrings::u_point_random) {
+    const float random = point_random(kg, sd);
+    return set_attribute_float(random, type, derivatives, val);
+  }
+#endif
+
+  if (name == DeviceStrings::u_normal_map_normal) {
+    /* Only available on triangles; other primitive types fail without falling through. */
+    if (!(sd->type & PRIMITIVE_TRIANGLE)) {
+      return false;
+    }
+    const float3 normal = triangle_smooth_normal_unnormalized(
+        kg, sd, sd->Ng, sd->prim, sd->u, sd->v);
+    return set_attribute_float3(normal, type, derivatives, val);
+  }
+
+  return get_background_attribute(kg, sd, name, type, derivatives, val);
+}
+
+/* Attribute lookup entry point. Lookups on a named object are not implemented and fail. For
+ * the current object, the attribute is first searched by the hash of `name`; when no such
+ * attribute is found, the built-in attribute names are tried. `array_lookup` and `index` are
+ * not used. */
+ccl_device_extern bool osl_get_attribute(ccl_private ShaderGlobals *sg,
+                                         int derivatives,
+                                         DeviceString object_name,
+                                         DeviceString name,
+                                         int array_lookup,
+                                         int index,
+                                         TypeDesc type,
+                                         ccl_private void *res)
+{
+  /* TODO: Get object index from name */
+  if (object_name != DeviceStrings::_emptystring_) {
+    return false;
+  }
+
+  KernelGlobals kg = nullptr;
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  const int object = sd->object;
+  const uint64_t name_hash = name.hash();
+  const AttributeDescriptor desc = find_attribute(kg, object, sd->prim, sd->type, name_hash);
+
+  return (desc.offset != ATTR_STD_NOT_FOUND) ?
+             get_object_attribute(kg, sd, desc, type, derivatives, res) :
+             get_object_standard_attribute(kg, sd, name, type, derivatives, res);
+}
+
+#if 0
+/* Compiled out. If enabled as written, this stub would always return false and would leave
+ * `symbol_data` and `userdata_initialized` untouched. */
+ccl_device_extern bool osl_bind_interpolated_param(ccl_private ShaderGlobals *sg,
+                                                       DeviceString name,
+                                                       long long type,
+                                                       int userdata_has_derivs,
+                                                       ccl_private void *userdata_data,
+                                                       int symbol_has_derivs,
+                                                       ccl_private void *symbol_data,
+                                                       int symbol_data_size,
+                                                       ccl_private void *userdata_initialized,
+                                                       int userdata_index)
+{
+  return false;
+}
+#endif
+
+/* Noise */
+
+#include "kernel/svm/noise.h"
+#include "util/hash.h"
+
+/* Hash of a single integer. */
+ccl_device_extern uint osl_hash_ii(int x)
+{
+  const uint hashed = hash_uint(x);
+  return hashed;
+}
+
+/* Hash of a single float, computed on its bit pattern. */
+ccl_device_extern uint osl_hash_if(float x)
+{
+  const uint bits = __float_as_uint(x);
+  return hash_uint(bits);
+}
+
+/* Hash of two floats, computed on their bit patterns. */
+ccl_device_extern uint osl_hash_iff(float x, float y)
+{
+  const uint bits_x = __float_as_uint(x);
+  const uint bits_y = __float_as_uint(y);
+  return hash_uint2(bits_x, bits_y);
+}
+
+/* Hash of a float3, computed on the bit patterns of its components. */
+ccl_device_extern uint osl_hash_iv(ccl_private const float3 *v)
+{
+  const uint bits_x = __float_as_uint(v->x);
+  const uint bits_y = __float_as_uint(v->y);
+  const uint bits_z = __float_as_uint(v->z);
+  return hash_uint3(bits_x, bits_y, bits_z);
+}
+
+/* Hash of a float3 plus an extra float, computed on their bit patterns. */
+ccl_device_extern uint osl_hash_ivf(ccl_private const float3 *v, float w)
+{
+  const uint bits_x = __float_as_uint(v->x);
+  const uint bits_y = __float_as_uint(v->y);
+  const uint bits_z = __float_as_uint(v->z);
+  const uint bits_w = __float_as_uint(w);
+  return hash_uint4(bits_x, bits_y, bits_z, bits_w);
+}
+
+/* Noise options are not implemented: no options object exists, so a null pointer is returned
+ * and every setter below ignores its arguments. Pointer types carry the `ccl_private`
+ * qualifier, matching the texture option functions in this file. */
+ccl_device_extern ccl_private OSLNoiseOptions *osl_get_noise_options(
+    ccl_private ShaderGlobals *sg)
+{
+  return nullptr;
+}
+
+/* No-op: the anisotropic flag is ignored. */
+ccl_device_extern void osl_noiseparams_set_anisotropic(ccl_private OSLNoiseOptions *opt,
+                                                       int anisotropic)
+{
+}
+
+/* No-op: the filtering flag is ignored. */
+ccl_device_extern void osl_noiseparams_set_do_filter(ccl_private OSLNoiseOptions *opt,
+                                                     int do_filter)
+{
+}
+
+/* No-op: the direction vector is ignored. */
+ccl_device_extern void osl_noiseparams_set_direction(ccl_private OSLNoiseOptions *opt,
+                                                     ccl_private float3 *direction)
+{
+}
+
+/* No-op: the bandwidth is ignored. */
+ccl_device_extern void osl_noiseparams_set_bandwidth(ccl_private OSLNoiseOptions *opt,
+                                                     float bandwidth)
+{
+}
+
+/* No-op: the impulse count is ignored. */
+ccl_device_extern void osl_noiseparams_set_impulses(ccl_private OSLNoiseOptions *opt,
+                                                    float impulses)
+{
+}
+
+/* Generate the family of noise entry points for one noise type.
+ *
+ * `name` is the exported function prefix and `op` the prefix of the scalar noise functions
+ * (`op##_1d` .. `op##_4d`). The suffix encodes the signature: the first letter is the result
+ * (`f` float, `v` float3 written through `res`), the remaining letters are the arguments
+ * (`f` float, `v` float3 pointer).
+ *
+ * The vector-valued variants currently replicate the scalar result into all three components
+ * (see the TODO inside the macro). */
+#define OSL_NOISE_IMPL(name, op) \
+  ccl_device_extern float name##_ff(float x) \
+  { \
+    return op##_1d(x); \
+  } \
+  ccl_device_extern float name##_fff(float x, float y) \
+  { \
+    return op##_2d(make_float2(x, y)); \
+  } \
+  ccl_device_extern float name##_fv(ccl_private const float3 *v) \
+  { \
+    return op##_3d(*v); \
+  } \
+  ccl_device_extern float name##_fvf(ccl_private const float3 *v, float w) \
+  { \
+    return op##_4d(make_float4(v->x, v->y, v->z, w)); \
+  } \
+  ccl_device_extern void name##_vf(ccl_private float3 *res, float x) \
+  { \
+    /* TODO: This is not correct. Really need to change the hash function inside the noise \
+     * function to spit out a vector instead of a scalar. */ \
+    const float n = name##_ff(x); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  } \
+  ccl_device_extern void name##_vff(ccl_private float3 *res, float x, float y) \
+  { \
+    const float n = name##_fff(x, y); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  } \
+  ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *v) \
+  { \
+    const float n = name##_fv(v); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  } \
+  ccl_device_extern void name##_vvf( \
+      ccl_private float3 *res, ccl_private const float3 *v, float w) \
+  { \
+    const float n = name##_fvf(v, w); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  }
+
+/* Hash noise: hash the bit pattern of the input and divide by the largest unsigned value. */
+ccl_device_forceinline float hashnoise_1d(float p)
+{
+  const float range = static_cast<float>(~0u);
+  return hash_uint(__float_as_uint(p)) / range;
+}
+/* Two-dimensional variant of hashnoise_1d(). */
+ccl_device_forceinline float hashnoise_2d(float2 p)
+{
+  const float range = static_cast<float>(~0u);
+  return hash_uint2(__float_as_uint(p.x), __float_as_uint(p.y)) / range;
+}
+/* Three-dimensional variant of hashnoise_1d(). */
+ccl_device_forceinline float hashnoise_3d(float3 p)
+{
+  const float range = static_cast<float>(~0u);
+  return hash_uint3(__float_as_uint(p.x), __float_as_uint(p.y), __float_as_uint(p.z)) / range;
+}
+/* Four-dimensional variant of hashnoise_1d(). */
+ccl_device_forceinline float hashnoise_4d(float4 p)
+{
+  const float range = static_cast<float>(~0u);
+  return hash_uint4(__float_as_uint(p.x),
+                    __float_as_uint(p.y),
+                    __float_as_uint(p.z),
+                    __float_as_uint(p.w)) /
+         range;
+}
+
+/* TODO: Implement all noise functions */
+OSL_NOISE_IMPL(osl_hashnoise, hashnoise)
+OSL_NOISE_IMPL(osl_noise, noise)
+OSL_NOISE_IMPL(osl_snoise, snoise)
+
+/* Texturing */
+
+/* Texture options are not implemented: no options object exists, so a null pointer is
+ * returned and every osl_texture_set_* function below has an empty body that ignores its
+ * arguments. */
+ccl_device_extern ccl_private OSLTextureOptions *osl_get_texture_options(
+    ccl_private ShaderGlobals *sg)
+{
+  return nullptr;
+}
+
+/* No-op: first channel selection is ignored. */
+ccl_device_extern void osl_texture_set_firstchannel(ccl_private OSLTextureOptions *opt,
+                                                    int firstchannel)
+{
+}
+
+/* No-op: wrap mode codes (s, t, r and combined st) are ignored. */
+ccl_device_extern void osl_texture_set_swrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_twrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_rwrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_stwrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+/* No-op: blur amounts (s, t, r and combined st) are ignored. */
+ccl_device_extern void osl_texture_set_sblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_tblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_rblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_stblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+/* No-op: filter widths (s, t, r and combined st) are ignored. */
+ccl_device_extern void osl_texture_set_swidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_twidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_rwidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_stwidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+/* No-op: the fill value is ignored. */
+ccl_device_extern void osl_texture_set_fill(ccl_private OSLTextureOptions *opt, float fill)
+{
+}
+
+/* No-op: the time value is ignored. */
+ccl_device_extern void osl_texture_set_time(ccl_private OSLTextureOptions *opt, float time)
+{
+}
+
+/* No-op: the interpolation mode code is ignored. */
+ccl_device_extern void osl_texture_set_interp_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+/* No-op: the subimage index is ignored. */
+ccl_device_extern void osl_texture_set_subimage(ccl_private OSLTextureOptions *opt, int subimage)
+{
+}
+
+/* No-op: the missing-texture color is ignored. */
+ccl_device_extern void osl_texture_set_missingcolor_arena(ccl_private OSLTextureOptions *opt,
+                                                          ccl_private float3 *color)
+{
+}
+
+/* No-op: the missing-texture alpha is ignored. */
+ccl_device_extern void osl_texture_set_missingcolor_alpha(ccl_private OSLTextureOptions *opt,
+                                                          int nchannels,
+                                                          float alpha)
+{
+}
+
+/* 2D texture lookup.
+ *
+ * `texture_handle` encodes the image slot as (slot + 1), so a null handle means "no texture"
+ * and makes the lookup fail. The image is sampled at (s, 1 - t). Up to `nchannels` components
+ * of the RGBA result are stored in `result`, and the fourth component is stored in `alpha`
+ * when the caller supplies that optional output.
+ *
+ * The options, the coordinate derivatives and `errormessage` are not used.
+ * NOTE(review): the derivative outputs (`dresultdx`, `dresultdy`, `dalphadx`, `dalphady`) are
+ * never written; confirm callers do not rely on them.
+ *
+ * Returns true when a lookup was performed. */
+ccl_device_extern bool osl_texture(ccl_private ShaderGlobals *sg,
+                                   DeviceString filename,
+                                   ccl_private void *texture_handle,
+                                   OSLTextureOptions *opt,
+                                   float s,
+                                   float t,
+                                   float dsdx,
+                                   float dtdx,
+                                   float dsdy,
+                                   float dtdy,
+                                   int nchannels,
+                                   ccl_private float *result,
+                                   ccl_private float *dresultdx,
+                                   ccl_private float *dresultdy,
+                                   ccl_private float *alpha,
+                                   ccl_private float *dalphadx,
+                                   ccl_private float *dalphady,
+                                   ccl_private void *errormessage)
+{
+  if (!texture_handle) {
+    return false;
+  }
+
+  /* Only SVM textures are supported. */
+  int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1);
+
+  const float4 rgba = kernel_tex_image_interp(nullptr, id, s, 1.0f - t);
+
+  result[0] = rgba.x;
+  if (nchannels > 1)
+    result[1] = rgba.y;
+  if (nchannels > 2)
+    result[2] = rgba.z;
+  if (nchannels > 3)
+    result[3] = rgba.w;
+
+  /* The alpha output is optional; without this store a shader that requests alpha would read
+   * a value that was never written. */
+  if (alpha) {
+    *alpha = rgba.w;
+  }
+
+  return true;
+}
+
+/* 3D texture lookup.
+ *
+ * `texture_handle` encodes the image slot as (slot + 1), so a null handle means "no texture"
+ * and makes the lookup fail. The image is sampled at `*P` with INTERPOLATION_NONE passed as
+ * the interpolation argument. Up to `nchannels` components of the RGBA result are stored in
+ * `result`, and the fourth component is stored in `alpha` when the caller supplies that
+ * optional output.
+ *
+ * The options, the position derivatives and `errormessage` are not used.
+ * NOTE(review): the derivative outputs (`dresultds`, `dresultdt`, `dalphadx`, `dalphady`) are
+ * never written; confirm callers do not rely on them.
+ *
+ * Returns true when a lookup was performed. */
+ccl_device_extern bool osl_texture3d(ccl_private ShaderGlobals *sg,
+                                     DeviceString filename,
+                                     ccl_private void *texture_handle,
+                                     OSLTextureOptions *opt,
+                                     ccl_private const float3 *P,
+                                     ccl_private const float3 *dPdx,
+                                     ccl_private const float3 *dPdy,
+                                     ccl_private const float3 *dPdz,
+                                     int nchannels,
+                                     ccl_private float *result,
+                                     ccl_private float *dresultds,
+                                     ccl_private float *dresultdt,
+                                     ccl_private float *alpha,
+                                     ccl_private float *dalphadx,
+                                     ccl_private float *dalphady,
+                                     ccl_private void *errormessage)
+{
+  if (!texture_handle) {
+    return false;
+  }
+
+  /* Only SVM textures are supported. */
+  int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1);
+
+  const float4 rgba = kernel_tex_image_interp_3d(nullptr, id, *P, INTERPOLATION_NONE);
+
+  result[0] = rgba.x;
+  if (nchannels > 1)
+    result[1] = rgba.y;
+  if (nchannels > 2)
+    result[2] = rgba.z;
+  if (nchannels > 3)
+    result[3] = rgba.w;
+
+  /* The alpha output is optional; without this store a shader that requests alpha would read
+   * a value that was never written. */
+  if (alpha) {
+    *alpha = rgba.w;
+  }
+
+  return true;
+}
+
+/* Environment lookups are not implemented: a fixed placeholder color (1, 0, 1, 1) is written
+ * to the first `nchannels` entries of `result` (at least one, at most four) and the call
+ * reports failure. All other arguments are ignored. */
+ccl_device_extern bool osl_environment(ccl_private ShaderGlobals *sg,
+                                       DeviceString filename,
+                                       ccl_private void *texture_handle,
+                                       OSLTextureOptions *opt,
+                                       ccl_private const float3 *R,
+                                       ccl_private const float3 *dRdx,
+                                       ccl_private const float3 *dRdy,
+                                       int nchannels,
+                                       ccl_private float *result,
+                                       ccl_private float *dresultds,
+                                       ccl_private float *dresultdt,
+                                       ccl_private float *alpha,
+                                       ccl_private float *dalphax,
+                                       ccl_private float *dalphay,
+                                       ccl_private void *errormessage)
+{
+  const float placeholder[4] = {1.0f, 0.0f, 1.0f, 1.0f};
+
+  /* The first channel is always written, the rest only as far as `nchannels` allows. */
+  result[0] = placeholder[0];
+  for (int c = 1; c < 4 && c < nchannels; ++c) {
+    result[c] = placeholder[c];
+  }
+
+  return false;
+}
+
+/* Texture metadata queries are not implemented: always returns false and leaves `data`
+ * untouched. */
+ccl_device_extern bool osl_get_textureinfo(ccl_private ShaderGlobals *sg,
+                                           DeviceString filename,
+                                           ccl_private void *texture_handle,
+                                           DeviceString dataname,
+                                           int basetype,
+                                           int arraylen,
+                                           int aggegrate,
+                                           ccl_private void *data,
+                                           ccl_private void *errormessage)
+{
+  return false;
+}
+
+/* Variant of osl_get_textureinfo() that also receives texture coordinates. The coordinates
+ * `s` and `t` are dropped and the query is forwarded unchanged, so it currently always fails
+ * as well. */
+ccl_device_extern bool osl_get_textureinfo_st(ccl_private ShaderGlobals *sg,
+                                              DeviceString filename,
+                                              ccl_private void *texture_handle,
+                                              float s,
+                                              float t,
+                                              DeviceString dataname,
+                                              int basetype,
+                                              int arraylen,
+                                              int aggegrate,
+                                              ccl_private void *data,
+                                              ccl_private void *errormessage)
+{
+  return osl_get_textureinfo(
+      sg, filename, texture_handle, dataname, basetype, arraylen, aggegrate, data, errormessage);
+}
+
+/* Standard library */
+
+/* Helper macros that wrap a function or macro `op` into an exported `name##_<suffix>` entry
+ * point taking one argument.
+ *
+ * The suffix spells the signature, result first: `i` is an int, `f` a float, `v` a float3.
+ * A leading `d` on a letter means that operand is an array of three consecutive elements
+ * (presumably the value followed by its two derivatives -- confirm against the OSL calling
+ * convention). Non-scalar results are written through the `res` pointer.
+ *
+ * Macros whose name ends in an underscore apply `op` to each of the x/y/z components
+ * separately instead of passing whole float3 values.
+ *
+ * NOTE(review): the `d` variants simply apply `op` to all three elements independently. That
+ * only yields correct derivatives for linear operations; check each instantiation. */
+
+/* int -> int. */
+#define OSL_OP_IMPL_II(name, op) \
+  ccl_device_extern int name##_ii(int a) \
+  { \
+    return op(a); \
+  }
+/* float -> int. */
+#define OSL_OP_IMPL_IF(name, op) \
+  ccl_device_extern int name##_if(float a) \
+  { \
+    return op(a); \
+  }
+/* float -> float. */
+#define OSL_OP_IMPL_FF(name, op) \
+  ccl_device_extern float name##_ff(float a) \
+  { \
+    return op(a); \
+  }
+/* float[3] -> float[3], element by element. */
+#define OSL_OP_IMPL_DFDF(name, op) \
+  ccl_device_extern void name##_dfdf(ccl_private float *res, ccl_private const float *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i]); \
+    } \
+  }
+/* float3[3] -> float[3], element by element. */
+#define OSL_OP_IMPL_DFDV(name, op) \
+  ccl_device_extern void name##_dfdv(ccl_private float *res, ccl_private const float3 *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i]); \
+    } \
+  }
+/* float3 -> float. */
+#define OSL_OP_IMPL_FV(name, op) \
+  ccl_device_extern float name##_fv(ccl_private const float3 *a) \
+  { \
+    return op(*a); \
+  }
+/* float3 -> float3, `op` takes the whole vector. */
+#define OSL_OP_IMPL_VV(name, op) \
+  ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    *res = op(*a); \
+  }
+/* float3 -> float3, `op` applied per component. */
+#define OSL_OP_IMPL_VV_(name, op) \
+  ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    res->x = op(a->x); \
+    res->y = op(a->y); \
+    res->z = op(a->z); \
+  }
+/* float3[3] -> float3[3], `op` takes whole vectors. */
+#define OSL_OP_IMPL_DVDV(name, op) \
+  ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i]); \
+    } \
+  }
+/* float3[3] -> float3[3], `op` applied per component. */
+#define OSL_OP_IMPL_DVDV_(name, op) \
+  ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x); \
+      res[i].y = op(a[i].y); \
+      res[i].z = op(a[i].z); \
+    } \
+  }
+
+/* Two-argument counterparts of the wrapper macros: `name##_<suffix>` forwards to `op(a, b)`.
+ *
+ * Suffix letters give the result first, then both arguments: `i` int, `f` float, `v` float3.
+ * A leading `d` marks an operand that is an array of three consecutive elements (presumably
+ * value plus two derivatives -- confirm against the OSL calling convention). When only one
+ * argument carries the `d`, element 0 of the other argument is reused for every iteration.
+ * Macros ending in an underscore apply `op` per x/y/z component.
+ *
+ * NOTE(review): as with the one-argument macros, `op` is applied to each of the three
+ * elements independently, which is only a correct derivative for linear operations. */
+
+/* (int, int) -> int. */
+#define OSL_OP_IMPL_III(name, op) \
+  ccl_device_extern int name##_iii(int a, int b) \
+  { \
+    return op(a, b); \
+  }
+/* (float, float) -> float. */
+#define OSL_OP_IMPL_FFF(name, op) \
+  ccl_device_extern float name##_fff(float a, float b) \
+  { \
+    return op(a, b); \
+  }
+/* (float3, float3) -> float. */
+#define OSL_OP_IMPL_FVV(name, op) \
+  ccl_device_extern float name##_fvv(ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    return op(*a, *b); \
+  }
+/* (float, float[3]) -> float[3]. */
+#define OSL_OP_IMPL_DFFDF(name, op) \
+  ccl_device_extern void name##_dffdf( \
+      ccl_private float *res, float a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b[i]); \
+    } \
+  }
+/* (float[3], float) -> float[3]. */
+#define OSL_OP_IMPL_DFDFF(name, op) \
+  ccl_device_extern void name##_dfdff( \
+      ccl_private float *res, ccl_private const float *a, float b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b); \
+    } \
+  }
+/* (float[3], float[3]) -> float[3]. */
+#define OSL_OP_IMPL_DFDFDF(name, op) \
+  ccl_device_extern void name##_dfdfdf( \
+      ccl_private float *res, ccl_private const float *a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i]); \
+    } \
+  }
+/* (float3, float3[3]) -> float[3]; only a[0] is read. */
+#define OSL_OP_IMPL_DFVDV(name, op) \
+  ccl_device_extern void name##_dfvdv( \
+      ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[0], b[i]); \
+    } \
+  }
+/* (float3[3], float3) -> float[3]; only b[0] is read. */
+#define OSL_OP_IMPL_DFDVV(name, op) \
+  ccl_device_extern void name##_dfdvv( \
+      ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[0]); \
+    } \
+  }
+/* (float3[3], float3[3]) -> float[3]. */
+#define OSL_OP_IMPL_DFDVDV(name, op) \
+  ccl_device_extern void name##_dfdvdv( \
+      ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i]); \
+    } \
+  }
+/* (float3, float) -> float3, `op` applied per component. */
+#define OSL_OP_IMPL_VVF_(name, op) \
+  ccl_device_extern void name##_vvf( \
+      ccl_private float3 *res, ccl_private const float3 *a, float b) \
+  { \
+    res->x = op(a->x, b); \
+    res->y = op(a->y, b); \
+    res->z = op(a->z, b); \
+  }
+/* (float3, float3) -> float3, `op` takes whole vectors. */
+#define OSL_OP_IMPL_VVV(name, op) \
+  ccl_device_extern void name##_vvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    *res = op(*a, *b); \
+  }
+/* (float3, float3) -> float3, `op` applied per component. */
+#define OSL_OP_IMPL_VVV_(name, op) \
+  ccl_device_extern void name##_vvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    res->x = op(a->x, b->x); \
+    res->y = op(a->y, b->y); \
+    res->z = op(a->z, b->z); \
+  }
+/* (float3, float[3]) -> float3[3], per component; only a[0] is read. */
+#define OSL_OP_IMPL_DVVDF_(name, op) \
+  ccl_device_extern void name##_dvvdf( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[0].x, b[i]); \
+      res[i].y = op(a[0].y, b[i]); \
+      res[i].z = op(a[0].z, b[i]); \
+    } \
+  }
+/* (float3[3], float) -> float3[3], per component. */
+#define OSL_OP_IMPL_DVDVF_(name, op) \
+  ccl_device_extern void name##_dvdvf( \
+      ccl_private float3 *res, ccl_private const float3 *a, float b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b); \
+      res[i].y = op(a[i].y, b); \
+      res[i].z = op(a[i].z, b); \
+    } \
+  }
+/* (float3, float3[3]) -> float3[3], whole vectors; only a[0] is read. */
+#define OSL_OP_IMPL_DVVDV(name, op) \
+  ccl_device_extern void name##_dvvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[0], b[i]); \
+    } \
+  }
+/* (float3, float3[3]) -> float3[3], per component; only a[0] is read. */
+#define OSL_OP_IMPL_DVVDV_(name, op) \
+  ccl_device_extern void name##_dvvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[0].x, b[i].x); \
+      res[i].y = op(a[0].y, b[i].y); \
+      res[i].z = op(a[0].z, b[i].z); \
+    } \
+  }
+/* (float3[3], float3) -> float3[3], whole vectors; only b[0] is read. */
+#define OSL_OP_IMPL_DVDVV(name, op) \
+  ccl_device_extern void name##_dvdvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[0]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVV_(name, op) \
+  ccl_device_extern void name##_dvdvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b[0].x); \
+      res[i].y = op(a[i].y, b[0].y); \
+      res[i].z = op(a[i].z, b[0].z); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVDF_(name, op) \
+  ccl_device_extern void name##_dvdvdf( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b[i]); \
+      res[i].y = op(a[i].y, b[i]); \
+      res[i].z = op(a[i].z, b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVDV(name, op) \
+  ccl_device_extern void name##_dvdvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVDV_(name, op) \
+  ccl_device_extern void name##_dvdvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b[i].x); \
+      res[i].y = op(a[i].y, b[i].y); \
+      res[i].z = op(a[i].z, b[i].z); \
+    } \
+  }
+
+#define OSL_OP_IMPL_FFFF(name, op) \
+  ccl_device_extern float name##_ffff(float a, float b, float c) \
+  { \
+    return op(a, b, c); \
+  }
+#define OSL_OP_IMPL_DFFFDF(name, op) \
+  ccl_device_extern void name##_dfffdf( \
+      ccl_private float *res, float a, float b, ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b, c[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFFDFF(name, op) \
+  ccl_device_extern void name##_dffdff( \
+      ccl_private float *res, float a, ccl_private const float *b, float c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b[i], c); \
+    } \
+  }
+#define OSL_OP_IMPL_DFFDFDF(name, op) \
+  ccl_device_extern void name##_dffdfdf( \
+      ccl_private float *res, float a, ccl_private const float *b, ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b[i], c[i]); \
+    } \
+  }
+
+#define OSL_OP_IMPL_DFDFFF(name, op) \
+  ccl_device_extern void name##_dfdfff( \
+      ccl_private float *res, ccl_private const float *a, float b, float c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b, c); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFFDF(name, op) \
+  ccl_device_extern void name##_dfdffdf( \
+      ccl_private float *res, ccl_private const float *a, float b, ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b, c[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFDFF(name, op) \
+  ccl_device_extern void name##_dfdfdff( \
+      ccl_private float *res, ccl_private const float *a, ccl_private const float *b, float c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i], c); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFDFDF(name, op) \
+  ccl_device_extern void name##_dfdfdfdf(ccl_private float *res, \
+                                         ccl_private const float *a, \
+                                         ccl_private const float *b, \
+                                         ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i], c[i]); \
+    } \
+  }
+
+#define OSL_OP_IMPL_XX(name, op) \
+  OSL_OP_IMPL_FF(name, op) \
+  OSL_OP_IMPL_DFDF(name, op) \
+  OSL_OP_IMPL_VV_(name, op) \
+  OSL_OP_IMPL_DVDV_(name, op)
+
+#define OSL_OP_IMPL_XXX(name, op) \
+  OSL_OP_IMPL_FFF(name, op) \
+  OSL_OP_IMPL_DFFDF(name, op) \
+  OSL_OP_IMPL_DFDFF(name, op) \
+  OSL_OP_IMPL_DFDFDF(name, op) \
+  OSL_OP_IMPL_VVV_(name, op) \
+  OSL_OP_IMPL_DVVDV_(name, op) \
+  OSL_OP_IMPL_DVDVV_(name, op) \
+  OSL_OP_IMPL_DVDVDV_(name, op)
+
+OSL_OP_IMPL_XX(osl_acos, acosf)
+OSL_OP_IMPL_XX(osl_asin, asinf)
+OSL_OP_IMPL_XX(osl_atan, atanf)
+OSL_OP_IMPL_XXX(osl_atan2, atan2f)
+OSL_OP_IMPL_XX(osl_cos, cosf)
+OSL_OP_IMPL_XX(osl_sin, sinf)
+OSL_OP_IMPL_XX(osl_tan, tanf)
+OSL_OP_IMPL_XX(osl_cosh, coshf)
+OSL_OP_IMPL_XX(osl_sinh, sinhf)
+OSL_OP_IMPL_XX(osl_tanh, tanhf)
+
+ccl_device_forceinline int safe_divide(int a, int b) /* Integer a / b, guarded against b == 0. */
+{
+  return (b != 0) ? a / b : 0; /* A zero divisor yields 0. */
+}
+ccl_device_forceinline int safe_modulo(int a, int b) /* Integer a % b, guarded against b == 0. */
+{
+  return (b != 0) ? a % b : 0; /* A zero divisor yields 0. */
+}
+
+OSL_OP_IMPL_III(osl_safe_div, safe_divide)
+OSL_OP_IMPL_FFF(osl_safe_div, safe_divide)
+OSL_OP_IMPL_III(osl_safe_mod, safe_modulo)
+
+ccl_device_extern void osl_sincos_fff(float a, ccl_private float *b, ccl_private float *c)
+{
+  sincos(a, b, c);
+}
+ccl_device_extern void osl_sincos_dfdff(ccl_private const float *a, /* Input, 3 lanes. */
+                                        ccl_private float *b,       /* Output, 3 lanes. */
+                                        ccl_private float *c)       /* Output, single float. */
+{
+  for (int i = 2; i >= 0; --i) /* Lane 0 goes last so `c` keeps the result for a[0]. */
+    sincos(a[i], b + i, c);
+}
+ccl_device_extern void osl_sincos_dffdf(ccl_private const float *a, /* Input, 3 lanes. */
+                                        ccl_private float *b,       /* Output, single float. */
+                                        ccl_private float *c)       /* Output, 3 lanes. */
+{
+  for (int i = 2; i >= 0; --i) /* Lane 0 goes last so `b` keeps the result for a[0]. */
+    sincos(a[i], b, c + i);
+}
+ccl_device_extern void osl_sincos_dfdfdf(ccl_private const float *a,
+                                         ccl_private float *b,
+                                         ccl_private float *c)
+{
+  for (int i = 0; i < 3; ++i)
+    sincos(a[i], b + i, c + i);
+}
+ccl_device_extern void osl_sincos_vvv(ccl_private const float3 *a,
+                                      ccl_private float3 *b,
+                                      ccl_private float3 *c)
+{
+  sincos(a->x, &b->x, &c->x);
+  sincos(a->y, &b->y, &c->y);
+  sincos(a->z, &b->z, &c->z);
+}
+ccl_device_extern void osl_sincos_dvdvv(ccl_private const float3 *a, /* Input, 3 lanes. */
+                                        ccl_private float3 *b,       /* Output, 3 lanes. */
+                                        ccl_private float3 *c)       /* Output, single vector. */
+{
+  for (int i = 2; i >= 0; --i) { /* Lane 0 goes last so `c` keeps the result for a[0]. */
+    sincos(a[i].x, &b[i].x, &c->x);
+    sincos(a[i].y, &b[i].y, &c->y);
+    sincos(a[i].z, &b[i].z, &c->z);
+  }
+}
+ccl_device_extern void osl_sincos_dvvdv(ccl_private const float3 *a, /* Input, 3 lanes. */
+                                        ccl_private float3 *b,       /* Output, single vector. */
+                                        ccl_private float3 *c)       /* Output, 3 lanes. */
+{
+  for (int i = 2; i >= 0; --i) { /* Lane 0 goes last so `b` keeps the result for a[0]. */
+    sincos(a[i].x, &b->x, &c[i].x);
+    sincos(a[i].y, &b->y, &c[i].y);
+    sincos(a[i].z, &b->z, &c[i].z);
+  }
+}
+ccl_device_extern void osl_sincos_dvdvdv(ccl_private const float3 *a,
+                                         ccl_private float3 *b,
+                                         ccl_private float3 *c)
+{
+  for (int i = 0; i < 3; ++i) {
+    sincos(a[i].x, &b[i].x, &c[i].x);
+    sincos(a[i].y, &b[i].y, &c[i].y);
+    sincos(a[i].z, &b[i].z, &c[i].z);
+  }
+}
+
+OSL_OP_IMPL_XX(osl_log, logf)
+OSL_OP_IMPL_XX(osl_log2, log2f)
+OSL_OP_IMPL_XX(osl_log10, log10f)
+OSL_OP_IMPL_XX(osl_exp, expf)
+OSL_OP_IMPL_XX(osl_exp2, exp2f)
+OSL_OP_IMPL_XX(osl_expm1, expm1f)
+OSL_OP_IMPL_XX(osl_erf, erff)
+OSL_OP_IMPL_XX(osl_erfc, erfcf)
+
+OSL_OP_IMPL_XXX(osl_pow, safe_powf)
+OSL_OP_IMPL_VVF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVVDF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVDVF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVDVDF_(osl_pow, safe_powf)
+
+OSL_OP_IMPL_XX(osl_sqrt, sqrtf)
+OSL_OP_IMPL_XX(osl_inversesqrt, 1.0f / sqrtf)
+OSL_OP_IMPL_XX(osl_cbrt, cbrtf)
+
+OSL_OP_IMPL_FF(osl_logb, logbf)
+OSL_OP_IMPL_VV_(osl_logb, logbf)
+
+OSL_OP_IMPL_FF(osl_floor, floorf)
+OSL_OP_IMPL_VV_(osl_floor, floorf)
+OSL_OP_IMPL_FF(osl_ceil, ceilf)
+OSL_OP_IMPL_VV_(osl_ceil, ceilf)
+OSL_OP_IMPL_FF(osl_round, roundf)
+OSL_OP_IMPL_VV_(osl_round, roundf)
+OSL_OP_IMPL_FF(osl_trunc, truncf)
+OSL_OP_IMPL_VV_(osl_trunc, truncf)
+
+ccl_device_forceinline float step_impl(float edge, float x) /* Step function of x at `edge`. */
+{
+  return x < edge ? 0.0f : 1.0f; /* 0 when x is below the edge, 1 otherwise. */
+}
+
+OSL_OP_IMPL_FF(osl_sign, compatible_signf)
+OSL_OP_IMPL_VV_(osl_sign, compatible_signf)
+OSL_OP_IMPL_FFF(osl_step, step_impl)
+OSL_OP_IMPL_VVV_(osl_step, step_impl)
+
+OSL_OP_IMPL_IF(osl_isnan, isnan)
+OSL_OP_IMPL_IF(osl_isinf, isinf)
+OSL_OP_IMPL_IF(osl_isfinite, isfinite)
+
+OSL_OP_IMPL_II(osl_abs, abs)
+OSL_OP_IMPL_XX(osl_abs, fabsf)
+OSL_OP_IMPL_II(osl_fabs, abs)
+OSL_OP_IMPL_XX(osl_fabs, fabsf)
+OSL_OP_IMPL_XXX(osl_fmod, safe_modulo)
+
+OSL_OP_IMPL_FFFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFDFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFDFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFDFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFDFDF(osl_smoothstep, smoothstep)
+
+OSL_OP_IMPL_FVV(osl_dot, dot)
+OSL_OP_IMPL_DFDVV(osl_dot, dot)
+OSL_OP_IMPL_DFVDV(osl_dot, dot)
+OSL_OP_IMPL_DFDVDV(osl_dot, dot)
+OSL_OP_IMPL_VVV(osl_cross, cross)
+OSL_OP_IMPL_DVDVV(osl_cross, cross)
+OSL_OP_IMPL_DVVDV(osl_cross, cross)
+OSL_OP_IMPL_DVDVDV(osl_cross, cross)
+OSL_OP_IMPL_FV(osl_length, len)
+OSL_OP_IMPL_DFDV(osl_length, len)
+OSL_OP_IMPL_FVV(osl_distance, distance)
+OSL_OP_IMPL_DFDVV(osl_distance, distance)
+OSL_OP_IMPL_DFVDV(osl_distance, distance)
+OSL_OP_IMPL_DFDVDV(osl_distance, distance)
+OSL_OP_IMPL_VV(osl_normalize, safe_normalize)
+OSL_OP_IMPL_DVDV(osl_normalize, safe_normalize)
+
+ccl_device_extern void osl_calculatenormal(ccl_private float3 *res, /* Receives the cross product. */
+                                           ccl_private ShaderGlobals *sg, /* Only flipHandedness is read. */
+                                           ccl_private const float3 *p) /* Only p[1] and p[2] are read. */
+{
+  if (sg->flipHandedness)
+    *res = cross(p[2], p[1]); /* Operands swapped when handedness is flipped. */
+  else
+    *res = cross(p[1], p[2]);
+}
+
+ccl_device_extern float osl_area(ccl_private const float3 *p) /* Only p[1] and p[2] are read. */
+{
+  return len(cross(p[2], p[1])); /* Length of the cross product of the two vectors. */
+}
+
+ccl_device_extern float osl_filterwidth_fdf(ccl_private const float *x) /* x[0] is not read. */
+{
+  return sqrtf(x[1] * x[1] + x[2] * x[2]); /* Euclidean norm of x[1] and x[2]. */
+}
+
+ccl_device_extern void osl_filterwidth_vdv(ccl_private float *res, ccl_private const float *x)
+{
+  for (int i = 0; i < 3; ++i) /* x: 3 lanes of 3 floats; component i of lane k is x[3 * k + i]. */
+    res[i] = sqrtf(x[3 + i] * x[3 + i] + x[6 + i] * x[6 + i]); /* Norm of lanes 1 and 2. */
+}
+
+ccl_device_extern bool osl_raytype_bit(ccl_private ShaderGlobals *sg, int bit) /* Mask test. */
+{
+  return (sg->raytype & bit) != 0; /* True when any bit of `bit` is set in sg->raytype. */
+}
diff --git a/intern/cycles/kernel/osl/services_optix.cu b/intern/cycles/kernel/osl/services_optix.cu
new file mode 100644
index 00000000000..2a43a89a956
--- /dev/null
+++ b/intern/cycles/kernel/osl/services_optix.cu
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+// clang-format off
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */
+
+#include "kernel/osl/services_gpu.h"
+// clang-format on
+
+extern "C" __device__ void __direct_callable__dummy_services()
+{
+}
diff --git a/intern/cycles/kernel/osl/types.h b/intern/cycles/kernel/osl/types.h
index 46e06114360..717306a3d07 100644
--- a/intern/cycles/kernel/osl/types.h
+++ b/intern/cycles/kernel/osl/types.h
@@ -5,9 +5,53 @@
 
 CCL_NAMESPACE_BEGIN
 
+struct DeviceString { /* String handle whose representation depends on the build target. */
+#if defined(__KERNEL_GPU__)
+  /* Strings are represented by their hashes in CUDA and OptiX. */
+  size_t str_;
+
+  ccl_device_inline_method uint64_t hash() const
+  {
+    return str_; /* The stored value already is the hash. */
+  }
+#elif defined(OPENIMAGEIO_USTRING_H)
+  ustring str_;
+
+  ccl_device_inline_method uint64_t hash() const
+  {
+    return str_.hash(); /* Delegates to the ustring hash. */
+  }
+#else
+  const char *str_; /* Plain pointer; this variant provides no hash(). */
+#endif
+
+  ccl_device_inline_method bool operator==(DeviceString b) const
+  {
+    return str_ == b.str_; /* Compares the stored representation (hash, ustring or pointer). */
+  }
+  ccl_device_inline_method bool operator!=(DeviceString b) const
+  {
+    return str_ != b.str_; /* Negation of the comparison above. */
+  }
+};
+
+ccl_device_inline DeviceString make_string(const char *str, size_t hash)
+{
+#if defined(__KERNEL_GPU__)
+  (void)str;
+  return {hash}; /* GPU build: only the hash is stored, `str` is ignored. */
+#elif defined(OPENIMAGEIO_USTRING_H)
+  (void)hash;
+  return {ustring(str)}; /* ustring build: constructed from `str`, `hash` is ignored. */
+#else
+  (void)hash;
+  return {str}; /* Fallback: the raw pointer is stored, `hash` is ignored. */
+#endif
+}
+
 /* Closure */
 
-enum ClosureTypeOSL {
+enum OSLClosureType {
   OSL_CLOSURE_MUL_ID = -1,
   OSL_CLOSURE_ADD_ID = -2,
 
@@ -17,4 +61,60 @@ enum ClosureTypeOSL {
 #include "closures_template.h"
 };
 
+struct OSLClosure {
+  OSLClosureType id;
+};
+
+struct ccl_align(8) OSLClosureMul : public OSLClosure
+{
+  packed_float3 weight;
+  ccl_private const OSLClosure *closure;
+};
+
+struct ccl_align(8) OSLClosureAdd : public OSLClosure
+{
+  ccl_private const OSLClosure *closureA;
+  ccl_private const OSLClosure *closureB;
+};
+
+struct ccl_align(8) OSLClosureComponent : public OSLClosure
+{
+  packed_float3 weight;
+};
+
+/* Globals */
+
+struct ShaderGlobals {
+  packed_float3 P, dPdx, dPdy;
+  packed_float3 dPdz;
+  packed_float3 I, dIdx, dIdy;
+  packed_float3 N;
+  packed_float3 Ng;
+  float u, dudx, dudy;
+  float v, dvdx, dvdy;
+  packed_float3 dPdu, dPdv;
+  float time;
+  float dtime;
+  packed_float3 dPdtime;
+  packed_float3 Ps, dPsdx, dPsdy;
+  ccl_private void *renderstate;
+  ccl_private void *tracedata;
+  ccl_private void *objdata;
+  void *context;
+  void *renderer;
+  ccl_private void *object2common;
+  ccl_private void *shader2common;
+  ccl_private OSLClosure *Ci;
+  float surfacearea;
+  int raytype;
+  int flipHandedness;
+  int backfacing;
+};
+
+struct OSLNoiseOptions {
+};
+
+struct OSLTextureOptions {
+};
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 24c5a6a4540..a6f8914a9b8 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -75,10 +75,14 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME__
 
 /* Device specific features */
-#ifndef __KERNEL_GPU__
-#  ifdef WITH_OSL
-#    define __OSL__
+#ifdef WITH_OSL
+#  define __OSL__
+#  ifdef __KERNEL_OPTIX__
+/* Kernels with OSL support are built separately in OptiX and don't need SVM. */
+#    undef __SVM__
 #  endif
+#endif
+#ifndef __KERNEL_GPU__
 #  ifdef WITH_PATH_GUIDING
 #    define __PATH_GUIDING__
 #  endif
@@ -917,9 +921,13 @@ typedef struct ccl_align(16) ShaderData
   float ray_dP;
 
 #ifdef __OSL__
+#  ifdef __KERNEL_GPU__
+  ccl_private uint8_t *osl_closure_pool;
+#  else
   const struct KernelGlobalsCPU *osl_globals;
   const struct IntegratorStateCPU *osl_path_state;
   const struct IntegratorShadowStateCPU *osl_shadow_path_state;
+#  endif
 #endif
 
   /* LCG state for closures that require additional random numbers. */
@@ -1529,6 +1537,9 @@ enum KernelFeatureFlag : uint32_t {
 
   /* Path guiding. */
   KERNEL_FEATURE_PATH_GUIDING = (1U << 26U),
+
+  /* OSL. */
+  KERNEL_FEATURE_OSL = (1U << 27U),
 };
 
 /* Shader node feature mask, to specialize shader evaluation for kernels. */
diff --git a/intern/cycles/scene/osl.cpp b/intern/cycles/scene/osl.cpp
index 93839facdbe..3ea406b6935 100644
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -38,16 +38,17 @@ OSL::TextureSystem *OSLShaderManager::ts_shared = NULL;
 int OSLShaderManager::ts_shared_users = 0;
 thread_mutex OSLShaderManager::ts_shared_mutex;
 
-OSL::ShadingSystem *OSLShaderManager::ss_shared = NULL;
-OSLRenderServices *OSLShaderManager::services_shared = NULL;
+OSL::ErrorHandler OSLShaderManager::errhandler;
+map<int, OSL::ShadingSystem *> OSLShaderManager::ss_shared;
 int OSLShaderManager::ss_shared_users = 0;
 thread_mutex OSLShaderManager::ss_shared_mutex;
 thread_mutex OSLShaderManager::ss_mutex;
+
 int OSLCompiler::texture_shared_unique_id = 0;
 
 /* Shader Manager */
 
-OSLShaderManager::OSLShaderManager()
+OSLShaderManager::OSLShaderManager(Device *device) : device_(device)
 {
   texture_system_init();
   shading_system_init();
@@ -107,11 +108,12 @@ void OSLShaderManager::device_update_specific(Device *device,
 
   device_free(device, dscene, scene);
 
-  /* set texture system */
-  scene->image_manager->set_osl_texture_system((void *)ts);
+  /* set texture system (only on CPU devices, since GPU devices cannot use OIIO) */
+  if (device->info.type == DEVICE_CPU) {
+    scene->image_manager->set_osl_texture_system((void *)ts_shared);
+  }
 
   /* create shaders */
-  OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
   Shader *background_shader = scene->background->get_shader(scene);
 
   foreach (Shader *shader, scene->shaders) {
@@ -125,22 +127,34 @@ void OSLShaderManager::device_update_specific(Device *device,
      * compile shaders alternating */
     thread_scoped_lock lock(ss_mutex);
 
-    OSLCompiler compiler(this, services, ss, scene);
-    compiler.background = (shader == background_shader);
-    compiler.compile(og, shader);
+    device->foreach_device(
+        [this, scene, shader, background = (shader == background_shader)](Device *sub_device) {
+          OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+          OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+          OSLCompiler compiler(this, ss, scene);
+          compiler.background = background;
+          compiler.compile(og, shader);
+        });
 
     if (shader->get_use_mis() && shader->has_surface_emission)
       scene->light_manager->tag_update(scene, LightManager::SHADER_COMPILED);
   }
 
   /* setup shader engine */
-  og->ss = ss;
-  og->ts = ts;
-  og->services = services;
-
   int background_id = scene->shader_manager->get_shader_id(background_shader);
-  og->background_state = og->surface_state[background_id & SHADER_MASK];
-  og->use = true;
+
+  device->foreach_device([background_id](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+    og->ss = ss;
+    og->ts = ts_shared;
+    og->services = static_cast<OSLRenderServices *>(ss->renderer());
+
+    og->background_state = og->surface_state[background_id & SHADER_MASK];
+    og->use = true;
+  });
 
   foreach (Shader *shader, scene->shaders)
     shader->clear_modified();
@@ -148,8 +162,12 @@ void OSLShaderManager::device_update_specific(Device *device,
   update_flags = UPDATE_NONE;
 
   /* add special builtin texture types */
-  services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
-  services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+  for (const auto &[device_type, ss] : ss_shared) {
+    OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
+
+    services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
+    services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+  }
 
   device_update_common(device, dscene, scene, progress);
 
@@ -166,26 +184,35 @@ void OSLShaderManager::device_update_specific(Device *device,
      * is being freed after the Session is freed.
      */
     thread_scoped_lock lock(ss_shared_mutex);
-    ss->optimize_all_groups();
+    for (const auto &[device_type, ss] : ss_shared) {
+      ss->optimize_all_groups();
+    }
+  }
+
+  /* load kernels */
+  if (!device->load_osl_kernels()) {
+    progress.set_error(device->error_message());
   }
 }
 
 void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
 {
-  OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
-
   device_free_common(device, dscene, scene);
 
   /* clear shader engine */
-  og->use = false;
-  og->ss = NULL;
-  og->ts = NULL;
-
-  og->surface_state.clear();
-  og->volume_state.clear();
-  og->displacement_state.clear();
-  og->bump_state.clear();
-  og->background_state.reset();
+  device->foreach_device([](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+
+    og->use = false;
+    og->ss = NULL;
+    og->ts = NULL;
+
+    og->surface_state.clear();
+    og->volume_state.clear();
+    og->displacement_state.clear();
+    og->bump_state.clear();
+    og->background_state.reset();
+  });
 }
 
 void OSLShaderManager::texture_system_init()
@@ -193,7 +220,7 @@ void OSLShaderManager::texture_system_init()
   /* create texture system, shared between different renders to reduce memory usage */
   thread_scoped_lock lock(ts_shared_mutex);
 
-  if (ts_shared_users == 0) {
+  if (ts_shared_users++ == 0) {
     ts_shared = TextureSystem::create(true);
 
     ts_shared->attribute("automip", 1);
@@ -203,24 +230,18 @@ void OSLShaderManager::texture_system_init()
     /* effectively unlimited for now, until we support proper mipmap lookups */
     ts_shared->attribute("max_memory_MB", 16384);
   }
-
-  ts = ts_shared;
-  ts_shared_users++;
 }
 
 void OSLShaderManager::texture_system_free()
 {
   /* shared texture system decrease users and destroy if no longer used */
   thread_scoped_lock lock(ts_shared_mutex);
-  ts_shared_users--;
 
-  if (ts_shared_users == 0) {
+  if (--ts_shared_users == 0) {
     ts_shared->invalidate_all(true);
     OSL::TextureSystem::destroy(ts_shared);
     ts_shared = NULL;
   }
-
-  ts = NULL;
 }
 
 void OSLShaderManager::shading_system_init()
@@ -228,101 +249,105 @@ void OSLShaderManager::shading_system_init()
   /* create shading system, shared between different renders to reduce memory usage */
   thread_scoped_lock lock(ss_shared_mutex);
 
-  if (ss_shared_users == 0) {
-    /* Must use aligned new due to concurrent hash map. */
-    services_shared = util_aligned_new<OSLRenderServices>(ts_shared);
+  device_->foreach_device([](Device *sub_device) {
+    const DeviceType device_type = sub_device->info.type;
 
-    string shader_path = path_get("shader");
+    if (ss_shared_users++ == 0 || ss_shared.find(device_type) == ss_shared.end()) {
+      /* Must use aligned new due to concurrent hash map. */
+      OSLRenderServices *services = util_aligned_new<OSLRenderServices>(ts_shared, device_type);
+
+      string shader_path = path_get("shader");
 #  ifdef _WIN32
-    /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
-     * operate with file paths with any character. This requires to use wide
-     * char functions, but OSL uses old fashioned ANSI functions which means:
-     *
-     * - We have to convert our paths to ANSI before passing to OSL
-     * - OSL can't be used when there's a multi-byte character in the path
-     *   to the shaders folder.
-     */
-    shader_path = string_to_ansi(shader_path);
+      /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
+       * operate with file paths with any character. This requires to use wide
+       * char functions, but OSL uses old fashioned ANSI functions which means:
+       *
+       * - We have to convert our paths to ANSI before passing to OSL
+       * - OSL can't be used when there's a multi-byte character in the path
+       *   to the shaders folder.
+       */
+      shader_path = string_to_ansi(shader_path);
 #  endif
 
-    ss_shared = new OSL::ShadingSystem(services_shared, ts_shared, &errhandler);
-    ss_shared->attribute("lockgeom", 1);
-    ss_shared->attribute("commonspace", "world");
-    ss_shared->attribute("searchpath:shader", shader_path);
-    ss_shared->attribute("greedyjit", 1);
-
-    VLOG_INFO << "Using shader search path: " << shader_path;
-
-    /* our own ray types */
-    static const char *raytypes[] = {
-        "camera",         /* PATH_RAY_CAMERA */
-        "reflection",     /* PATH_RAY_REFLECT */
-        "refraction",     /* PATH_RAY_TRANSMIT */
-        "diffuse",        /* PATH_RAY_DIFFUSE */
-        "glossy",         /* PATH_RAY_GLOSSY */
-        "singular",       /* PATH_RAY_SINGULAR */
-        "transparent",    /* PATH_RAY_TRANSPARENT */
-        "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
-
-        "shadow", /* PATH_RAY_SHADOW_OPAQUE */
-        "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
-
-        "__unused__", /* PATH_RAY_NODE_UNALIGNED */
-        "__unused__", /* PATH_RAY_MIS_SKIP */
-
-        "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
-
-        /* Remaining irrelevant bits up to 32. */
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-    };
-
-    const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
-    ss_shared->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
-
-    OSLRenderServices::register_closures(ss_shared);
-
-    loaded_shaders.clear();
-  }
+      OSL::ShadingSystem *ss = new OSL::ShadingSystem(services, ts_shared, &errhandler);
+      ss->attribute("lockgeom", 1);
+      ss->attribute("commonspace", "world");
+      ss->attribute("searchpath:shader", shader_path);
+      ss->attribute("greedyjit", 1);
+
+      VLOG_INFO << "Using shader search path: " << shader_path;
+
+      /* our own ray types */
+      static const char *raytypes[] = {
+          "camera",         /* PATH_RAY_CAMERA */
+          "reflection",     /* PATH_RAY_REFLECT */
+          "refraction",     /* PATH_RAY_TRANSMIT */
+          "diffuse",        /* PATH_RAY_DIFFUSE */
+          "glossy",         /* PATH_RAY_GLOSSY */
+          "singular",       /* PATH_RAY_SINGULAR */
+          "transparent",    /* PATH_RAY_TRANSPARENT */
+          "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
+
+          "shadow", /* PATH_RAY_SHADOW_OPAQUE */
+          "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
+
+          "__unused__", /* PATH_RAY_NODE_UNALIGNED */
+          "__unused__", /* PATH_RAY_MIS_SKIP */
+
+          "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
+
+          /* Remaining irrelevant bits up to 32. */
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+      };
+
+      const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
+      ss->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
+
+      OSLRenderServices::register_closures(ss);
+
+      ss_shared[device_type] = ss;
+    }
+  });
 
-  ss = ss_shared;
-  services = services_shared;
-  ss_shared_users++;
+  loaded_shaders.clear();
 }
 
 void OSLShaderManager::shading_system_free()
 {
   /* shared shading system decrease users and destroy if no longer used */
   thread_scoped_lock lock(ss_shared_mutex);
-  ss_shared_users--;
 
-  if (ss_shared_users == 0) {
-    delete ss_shared;
-    ss_shared = NULL;
+  device_->foreach_device([](Device * /*sub_device*/) {
+    if (--ss_shared_users == 0) {
+      for (const auto &[device_type, ss] : ss_shared) {
+        OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
 
-    util_aligned_delete(services_shared);
-    services_shared = NULL;
-  }
+        delete ss;
+
+        util_aligned_delete(services);
+      }
 
-  ss = NULL;
-  services = NULL;
+      ss_shared.clear();
+    }
+  });
 }
 
 bool OSLShaderManager::osl_compile(const string &inputfile, const string &outputfile)
@@ -447,7 +472,9 @@ const char *OSLShaderManager::shader_load_filepath(string filepath)
 
 const char *OSLShaderManager::shader_load_bytecode(const string &hash, const string &bytecode)
 {
-  ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+  for (const auto &[device_type, ss] : ss_shared) {
+    ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+  }
 
   OSLShaderInfo info;
 
@@ -599,11 +626,11 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
 
 /* Graph Compiler */
 
-OSLCompiler::OSLCompiler(OSLShaderManager *manager,
-                         OSLRenderServices *services,
-                         OSL::ShadingSystem *ss,
-                         Scene *scene)
-    : scene(scene), manager(manager), services(services), ss(ss)
+OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
+    : scene(scene),
+      manager(manager),
+      services(static_cast<OSLRenderServices *>(ss->renderer())),
+      ss(ss)
 {
   current_type = SHADER_TYPE_SURFACE;
   current_shader = NULL;
@@ -1105,7 +1132,12 @@ OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph
 {
   current_type = type;
 
-  OSL::ShaderGroupRef group = ss->ShaderGroupBegin(shader->name.c_str());
+  string name = shader->name.string();
+  /* Replace invalid characters. */
+  for (size_t i; (i = name.find_first_of(" .,:;+-*/#")) != string::npos;)
+    name.replace(i, 1, "_");
+
+  OSL::ShaderGroupRef group = ss->ShaderGroupBegin(name);
 
   ShaderNode *output = graph->output();
   ShaderNodeSet dependencies;
diff --git a/intern/cycles/scene/osl.h b/intern/cycles/scene/osl.h
index 76c6bd96ce1..c0e82a9dc8d 100644
--- a/intern/cycles/scene/osl.h
+++ b/intern/cycles/scene/osl.h
@@ -54,7 +54,7 @@ struct OSLShaderInfo {
 
 class OSLShaderManager : public ShaderManager {
  public:
-  OSLShaderManager();
+  OSLShaderManager(Device *device);
   ~OSLShaderManager();
 
   static void free_memory();
@@ -92,25 +92,22 @@ class OSLShaderManager : public ShaderManager {
                            const std::string &bytecode_hash = "",
                            const std::string &bytecode = "");
 
- protected:
+ private:
   void texture_system_init();
   void texture_system_free();
 
   void shading_system_init();
   void shading_system_free();
 
-  OSL::ShadingSystem *ss;
-  OSL::TextureSystem *ts;
-  OSLRenderServices *services;
-  OSL::ErrorHandler errhandler;
+  Device *device_;
   map<string, OSLShaderInfo> loaded_shaders;
 
   static OSL::TextureSystem *ts_shared;
   static thread_mutex ts_shared_mutex;
   static int ts_shared_users;
 
-  static OSL::ShadingSystem *ss_shared;
-  static OSLRenderServices *services_shared;
+  static OSL::ErrorHandler errhandler;
+  static map<int, OSL::ShadingSystem *> ss_shared;
   static thread_mutex ss_shared_mutex;
   static thread_mutex ss_mutex;
   static int ss_shared_users;
@@ -123,10 +120,7 @@ class OSLShaderManager : public ShaderManager {
 class OSLCompiler {
  public:
 #ifdef WITH_OSL
-  OSLCompiler(OSLShaderManager *manager,
-              OSLRenderServices *services,
-              OSL::ShadingSystem *shadingsys,
-              Scene *scene);
+  OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *shadingsys, Scene *scene);
 #endif
   void compile(OSLGlobals *og, Shader *shader);
 
diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp
index 3a05bede7a3..d5be86e1db9 100644
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -99,11 +99,8 @@ Scene::Scene(const SceneParams &params_, Device *device)
 {
   memset((void *)&dscene.data, 0, sizeof(dscene.data));
 
-  /* OSL only works on the CPU */
-  if (device->info.has_osl)
-    shader_manager = ShaderManager::create(params.shadingsystem);
-  else
-    shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM);
+  shader_manager = ShaderManager::create(
+      device->info.has_osl ? params.shadingsystem : SHADINGSYSTEM_SVM, device);
 
   light_manager = new LightManager();
   geometry_manager = new GeometryManager();
diff --git a/intern/cycles/scene/shader.cpp b/intern/cycles/scene/shader.cpp
index 56670c6e4e3..f176c19ec95 100644
--- a/intern/cycles/scene/shader.cpp
+++ b/intern/cycles/scene/shader.cpp
@@ -395,15 +395,16 @@ ShaderManager::~ShaderManager()
 {
 }
 
-ShaderManager *ShaderManager::create(int shadingsystem)
+ShaderManager *ShaderManager::create(int shadingsystem, Device *device)
 {
   ShaderManager *manager;
 
   (void)shadingsystem; /* Ignored when built without OSL. */
+  (void)device;
 
 #ifdef WITH_OSL
   if (shadingsystem == SHADINGSYSTEM_OSL) {
-    manager = new OSLShaderManager();
+    manager = new OSLShaderManager(device);
   }
   else
 #endif
@@ -722,6 +723,10 @@ uint ShaderManager::get_kernel_features(Scene *scene)
     }
   }
 
+  if (use_osl()) {
+    kernel_features |= KERNEL_FEATURE_OSL;
+  }
+
   return kernel_features;
 }
 
diff --git a/intern/cycles/scene/shader.h b/intern/cycles/scene/shader.h
index 2670776aca4..69b22d2ad19 100644
--- a/intern/cycles/scene/shader.h
+++ b/intern/cycles/scene/shader.h
@@ -170,7 +170,7 @@ class ShaderManager {
     UPDATE_NONE = 0u,
   };
 
-  static ShaderManager *create(int shadingsystem);
+  static ShaderManager *create(int shadingsystem, Device *device);
   virtual ~ShaderManager();
 
   virtual void reset(Scene *scene) = 0;
diff --git a/intern/cycles/scene/shader_nodes.h b/intern/cycles/scene/shader_nodes.h
index cc3a71a0697..a3a931bb0b3 100644
--- a/intern/cycles/scene/shader_nodes.h
+++ b/intern/cycles/scene/shader_nodes.h
@@ -1542,6 +1542,10 @@ class OSLNode final : public ShaderNode {
   {
     return true;
   }
+  virtual int get_feature()
+  {
+    return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_RAYTRACE;
+  }
 
   virtual bool equals(const ShaderNode & /*other*/)
   {
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 1969529eff0..d5be14c8eba 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -23,6 +23,7 @@
 /* Leave inlining decisions to compiler for these, the inline keyword here
  * is not about performance but including function definitions in headers. */
 #  define ccl_device static inline
+#  define ccl_device_extern extern "C"
 #  define ccl_device_noinline static inline
 #  define ccl_device_noinline_cpu ccl_device_noinline
 
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index d7f95b7f296..0c39901a63c 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -196,14 +196,7 @@ ccl_device_inline Transform make_transform_frame(float3 N)
   return make_transform(dx.x, dx.y, dx.z, 0.0f, dy.x, dy.y, dy.z, 0.0f, N.x, N.y, N.z, 0.0f);
 }
 
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline Transform transform_zero()
-{
-  Transform zero = {zero_float4(), zero_float4(), zero_float4()};
-  return zero;
-}
-
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline Transform operator*(const Transform a, const Transform b)
 {
   float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f);
@@ -218,6 +211,15 @@ ccl_device_inline Transform operator*(const Transform a, const Transform b)
 
   return t;
 }
+#endif /* !defined(__KERNEL_METAL__) */
+
+#ifndef __KERNEL_GPU__
+
+ccl_device_inline Transform transform_zero()
+{
+  Transform zero = {zero_float4(), zero_float4(), zero_float4()};
+  return zero;
+}
 
 ccl_device_inline void print_transform(const char *label, const Transform &t)
 {