Cycles: Add basic support for using OSL with OptiX

This patch generalizes the OSL support in Cycles to include GPU device types and adds an implementation for that in the OptiX device. There are some caveats still, including simplified texturing due to lack of OIIO on the GPU and a few missing OSL intrinsics. Note that this is incomplete and missing an update to the OSL library before being enabled! The implementation is already committed now to simplify further development. Maniphest Tasks: T101222 Differential Revision: https://developer.blender.org/D15902
author: Patrick Mours <pmours@nvidia.com> 2022-11-09 16:25:32 +0300
committer: Patrick Mours <pmours@nvidia.com> 2022-11-09 17:30:21 +0300
commit: e6b38deb9dbb58118f6ee644409ce52f06eac5e5 (patch)
tree: aa1c384db146094482f24c94f704742c6624db00
parent: efe073f57c34b438d21750795e97458a3d007be7 (diff)
43 files changed, 3478 insertions, 564 deletions
diff --git a/build_files/cmake/platform/platform_win32.cmake b/build_files/cmake/platform/platform_win32.cmake
index 7a2d3ad948a..47673794652 100644
--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -419,7 +419,7 @@ if(WITH_IMAGE_OPENEXR)
     warn_hardcoded_paths(OpenEXR)
     set(OPENEXR ${LIBDIR}/openexr)
     set(OPENEXR_INCLUDE_DIR ${OPENEXR}/include)
-    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR}/include/OpenEXR)
+    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR_INCLUDE_DIR}/OpenEXR)
     set(OPENEXR_LIBPATH ${OPENEXR}/lib)
     # Check if the 3.x library name exists
     # if not assume this is a 2.x library folder
@@ -568,7 +568,8 @@ if(WITH_OPENIMAGEIO)
   if(NOT OpenImageIO_FOUND)
     set(OPENIMAGEIO ${LIBDIR}/OpenImageIO)
     set(OPENIMAGEIO_LIBPATH ${OPENIMAGEIO}/lib)
-    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR})
     set(OIIO_OPTIMIZED optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO.lib optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util.lib)
     set(OIIO_DEBUG debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_d.lib debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util_d.lib)
     set(OPENIMAGEIO_LIBRARIES ${OIIO_OPTIMIZED} ${OIIO_DEBUG})
@@ -785,6 +786,14 @@ if(WITH_CYCLES AND WITH_CYCLES_OSL)
   endif()
   find_path(OSL_INCLUDE_DIR OSL/oslclosure.h PATHS ${CYCLES_OSL}/include)
   find_program(OSL_COMPILER NAMES oslc PATHS ${CYCLES_OSL}/bin)
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MAJOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+[0-9]+.*$")
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MINOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+[0-9]+.*$")
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MAJOR ${OSL_LIBRARY_VERSION_MAJOR})
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MINOR ${OSL_LIBRARY_VERSION_MINOR})
 endif()
 
 if(WITH_CYCLES AND WITH_CYCLES_EMBREE)
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 05f27bdbd4d..354c9c23a53 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -58,7 +58,7 @@ class CyclesRender(bpy.types.RenderEngine):
         if not self.session:
             if self.is_preview:
                 cscene = bpy.context.scene.cycles
-                use_osl = cscene.shading_system and cscene.device == 'CPU'
+                use_osl = cscene.shading_system
 
                 engine.create(self, data, preview_osl=use_osl)
             else:
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index e33891fa7a2..83dc6332f47 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -155,6 +155,10 @@ def with_osl():
     import _cycles
     return _cycles.with_osl
 
+def osl_version():
+    import _cycles
+    return _cycles.osl_version
+
 
 def with_path_guiding():
     import _cycles
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index f5cd88f6b6a..9d7c71417f2 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -290,7 +290,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
     )
     shading_system: BoolProperty(
         name="Open Shading Language",
-        description="Use Open Shading Language (CPU rendering only)",
+        description="Use Open Shading Language",
     )
 
     preview_pause: BoolProperty(
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 305accc8f1a..11fa2bc62fb 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -2305,7 +2305,7 @@ def draw_device(self, context):
         col.prop(cscene, "device")
 
         from . import engine
-        if engine.with_osl() and use_cpu(context):
+        if engine.with_osl() and (use_cpu(context) or (use_optix(context) and (engine.osl_version()[1] >= 13 or engine.osl_version()[0] > 1))):
             col.prop(cscene, "shading_system")
 
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 2e4d18241cf..06a2f5c7b01 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -160,6 +160,11 @@ class Device {
     return true;
   }
 
+  virtual bool load_osl_kernels()
+  {
+    return true;
+  }
+
   /* GPU device only functions.
    * These may not be used on CPU or multi-devices. */
 
diff --git a/intern/cycles/device/kernel.cpp b/intern/cycles/device/kernel.cpp
index 96a99cd62cd..27ca0d81817 100644
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -7,6 +7,30 @@
 
 CCL_NAMESPACE_BEGIN
 
+bool device_kernel_has_shading(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY);
+}
+
+bool device_kernel_has_intersection(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
+}
+
 const char *device_kernel_as_string(DeviceKernel kernel)
 {
   switch (kernel) {
diff --git a/intern/cycles/device/kernel.h b/intern/cycles/device/kernel.h
index 4ae461f1f67..b829a891260 100644
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -11,6 +11,9 @@
 
 CCL_NAMESPACE_BEGIN
 
+bool device_kernel_has_shading(DeviceKernel kernel);
+bool device_kernel_has_intersection(DeviceKernel kernel);
+
 const char *device_kernel_as_string(DeviceKernel kernel);
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
 
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
index 6904d2c2dc6..9605c6a7538 100644
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -138,6 +138,15 @@ class MultiDevice : public Device {
     return true;
   }
 
+  bool load_osl_kernels() override
+  {
+    foreach (SubDevice &sub, devices)
+      if (!sub.device->load_osl_kernels())
+        return false;
+
+    return true;
+  }
+
   void build_bvh(BVH *bvh, Progress &progress, bool refit) override
   {
     /* Try to build and share a single acceleration structure, if possible */
@@ -204,10 +213,12 @@ class MultiDevice : public Device {
 
   virtual void *get_cpu_osl_memory() override
   {
-    if (devices.size() > 1) {
+    /* Always return the OSL memory of the CPU device (this works since the constructor above
+     * guarantees that CPU devices are always added to the back). */
+    if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) {
       return NULL;
     }
-    return devices.front().device->get_cpu_osl_memory();
+    return devices.back().device->get_cpu_osl_memory();
   }
 
   bool is_resident(device_ptr key, Device *sub_device) override
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
index 68ca21374fd..58b72374a7d 100644
--- a/intern/cycles/device/optix/device.cpp
+++ b/intern/cycles/device/optix/device.cpp
@@ -9,6 +9,10 @@
 
 #include "util/log.h"
 
+#ifdef WITH_OSL
+#  include <OSL/oslversion.h>
+#endif
+
 #ifdef WITH_OPTIX
 #  include <optix_function_table_definition.h>
 #endif
@@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo
 
     info.type = DEVICE_OPTIX;
     info.id += "_OptiX";
+#  if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1)
+    info.has_osl = true;
+#  endif
     info.denoisers |= DENOISER_OPTIX;
 
     devices.push_back(info);
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index fabf4d7b69d..02f34bf3bd0 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -312,16 +312,34 @@ OptiXDevice::~OptiXDevice()
   if (optix_module != NULL) {
     optixModuleDestroy(optix_module);
   }
-  for (unsigned int i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     if (builtin_modules[i] != NULL) {
       optixModuleDestroy(builtin_modules[i]);
     }
   }
-  for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+  for (int i = 0; i < NUM_PIPELINES; ++i) {
     if (pipelines[i] != NULL) {
       optixPipelineDestroy(pipelines[i]);
     }
   }
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    if (groups[i] != NULL) {
+      optixProgramGroupDestroy(groups[i]);
+    }
+  }
+
+#  ifdef WITH_OSL
+  for (const OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+    }
+  }
+  for (const OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+    }
+  }
+#  endif
 
   /* Make sure denoiser is destroyed before device context! */
   if (denoiser_.optix_denoiser != nullptr) {
@@ -381,6 +399,12 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     return false;
   }
 
+#  ifdef WITH_OSL
+  const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL);
+#  else
+  const bool use_osl = false;
+#  endif
+
   /* Skip creating OptiX module if only doing denoising. */
   const bool need_optix_kernels = (kernel_features &
                                    (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING));
@@ -388,12 +412,13 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Detect existence of OptiX kernel and SDK here early. So we can error out
    * before compiling the CUDA kernels, to avoid failing right after when
    * compiling the OptiX kernel. */
+  string suffix = use_osl ? "_osl" :
+                  (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
+                            "_shader_raytrace" :
+                            "";
   string ptx_filename;
   if (need_optix_kernels) {
-    ptx_filename = path_get(
-        (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
-            "lib/kernel_optix_shader_raytrace.ptx" :
-            "lib/kernel_optix.ptx");
+    ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx");
     if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
       std::string optix_include_dir = get_optix_include_dir();
       if (optix_include_dir.empty()) {
@@ -429,18 +454,41 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     optixModuleDestroy(optix_module);
     optix_module = NULL;
   }
-  for (unsigned int i = 0; i < 2; ++i) {
+  for (int i = 0; i < 2; ++i) {
     if (builtin_modules[i] != NULL) {
       optixModuleDestroy(builtin_modules[i]);
       builtin_modules[i] = NULL;
     }
   }
-  for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+  for (int i = 0; i < NUM_PIPELINES; ++i) {
     if (pipelines[i] != NULL) {
       optixPipelineDestroy(pipelines[i]);
       pipelines[i] = NULL;
     }
   }
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    if (groups[i] != NULL) {
+      optixProgramGroupDestroy(groups[i]);
+      groups[i] = NULL;
+    }
+  }
+
+#  ifdef WITH_OSL
+  /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
+  for (const OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+    }
+  }
+  osl_modules.clear();
+
+  for (const OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+    }
+  }
+  osl_groups.clear();
+#  endif
 
   OptixModuleCompileOptions module_options = {};
   module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
@@ -461,7 +509,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   module_options.numPayloadTypes = 0;
 #  endif
 
-  OptixPipelineCompileOptions pipeline_options = {};
   /* Default to no motion blur and two-level graph, since it is the fastest option. */
   pipeline_options.usesMotionBlur = false;
   pipeline_options.traversableGraphFlags =
@@ -490,9 +537,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
    * This is necessary since objects may be reported to have motion if the Vector pass is
    * active, but may still need to be rendered without motion blur if that isn't active as well. */
-  motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
-
-  if (motion_blur) {
+  if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
     pipeline_options.usesMotionBlur = true;
     /* Motion blur can insert motion transforms into the traversal graph.
      * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
@@ -503,13 +548,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     string ptx_data;
     if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
       string cflags = compile_kernel_get_common_cflags(kernel_features);
-      ptx_filename = compile_kernel(
-          cflags,
-          (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
-              "kernel_shader_raytrace" :
-              "kernel",
-          "optix",
-          true);
+      ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
     }
     if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
       set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
@@ -551,7 +590,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   }
 
   /* Create program groups. */
-  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
   group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
@@ -609,7 +647,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
       group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
 
-      if (motion_blur) {
+      if (pipeline_options.usesMotionBlur) {
         builtin_options.usesMotionBlur = true;
 
         optix_assert(optixBuiltinISModuleGet(
@@ -630,7 +668,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     }
   }
 
-  /* Pointclouds */
   if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
     group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
     group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
@@ -642,8 +679,8 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
   }
 
+  /* Add hit group for local intersections. */
   if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
-    /* Add hit group for local intersections. */
     group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
     group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
     group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
@@ -655,16 +692,19 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
     group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
     group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
         "__raygen__kernel_optix_integrator_shade_surface_raytrace";
-    group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
-    group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
-    group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
-    group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
-    group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
-        "__direct_callable__svm_node_bevel";
+
+    /* Kernels with OSL support are built without SVM, so can skip those direct callables there. */
+    if (!use_osl) {
+      group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+      group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+      group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+      group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+      group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+          "__direct_callable__svm_node_bevel";
+    }
   }
 
-  /* MNEE. */
   if (kernel_features & KERNEL_FEATURE_MNEE) {
     group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
     group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
@@ -672,6 +712,42 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
         "__raygen__kernel_optix_integrator_shade_surface_mnee";
   }
 
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  if (use_osl) {
+    group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_background";
+    group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_light";
+    group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_surface";
+    group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_volume";
+    group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
+    group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
+        "__raygen__kernel_optix_integrator_shade_shadow";
+    group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_displace";
+    group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_background";
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
+    group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
+        "__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
+  }
+
   optix_assert(optixProgramGroupCreate(
       context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
 
@@ -680,7 +756,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
   /* Set up SBT, which in this case is used only to select between different programs. */
   sbt_data.alloc(NUM_PROGRAM_GROUPS);
   memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
-  for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
     optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
     optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
   }
@@ -704,25 +780,26 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
 
   OptixPipelineLinkOptions link_options = {};
   link_options.maxTraceDepth = 1;
+  link_options.debugLevel = module_options.debugLevel;
 
-  if (DebugFlags().optix.use_debug) {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
-  }
-  else {
-    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
-  }
-
-  if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
-    /* Create shader raytracing pipeline. */
+  if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE) && !use_osl) {
+    /* Create shader raytracing and MNEE pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+    if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+      pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+      pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+    }
+    if (kernel_features & KERNEL_FEATURE_MNEE) {
+      pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    }
     pipeline_groups.push_back(groups[PG_MISS]);
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
     pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
@@ -730,8 +807,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
       pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
     }
-    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
-    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
 
     optix_assert(optixPipelineCreate(context,
                                      &pipeline_options,
@@ -740,30 +815,33 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_SHADE_RAYTRACE]));
+                                     &pipelines[PIP_SHADE]));
 
     /* Combine ray generation and trace continuation stack size. */
-    const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
+                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
                              link_options.maxTraceDepth * trace_css;
     const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
                                       stack_size[PG_CALL_SVM_BEVEL].dssDC);
 
     /* Set stack size depending on pipeline options. */
     optix_assert(optixPipelineSetStackSize(
-        pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  if (kernel_features & KERNEL_FEATURE_MNEE) {
-    /* Create MNEE pipeline. */
+  { /* Create intersection-only pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
     pipeline_groups.push_back(groups[PG_MISS]);
     pipeline_groups.push_back(groups[PG_HITD]);
     pipeline_groups.push_back(groups[PG_HITS]);
     pipeline_groups.push_back(groups[PG_HITL]);
     pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       pipeline_groups.push_back(groups[PG_HITD_MOTION]);
       pipeline_groups.push_back(groups[PG_HITS_MOTION]);
     }
@@ -771,8 +849,6 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
       pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
       pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
     }
-    pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
-    pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
 
     optix_assert(optixPipelineCreate(context,
                                      &pipeline_options,
@@ -781,37 +857,234 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_SHADE_MNEE]));
+                                     &pipelines[PIP_INTERSECT]));
 
-    /* Combine ray generation and trace continuation stack size. */
-    const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG +
-                             link_options.maxTraceDepth * trace_css;
-    const unsigned int dss = 0;
+    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+    const unsigned int css =
+        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+        link_options.maxTraceDepth * trace_css;
 
-    /* Set stack size depending on pipeline options. */
-    optix_assert(
-        optixPipelineSetStackSize(pipelines[PIP_SHADE_MNEE], 0, dss, css, motion_blur ? 3 : 2));
+    optix_assert(optixPipelineSetStackSize(
+        pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  { /* Create intersection-only pipeline. */
+  return !have_error();
+}
+
+bool OptiXDevice::load_osl_kernels()
+{
+#  ifdef WITH_OSL
+  if (have_error()) {
+    return false;
+  }
+
+  struct OSLKernel {
+    string ptx;
+    string init_entry;
+    string exec_entry;
+  };
+
+  /* This has to be in the same order as the ShaderType enum, so that the index calculation in
+   * osl_eval_nodes checks out */
+  vector<OSLKernel> osl_kernels;
+
+  for (ShaderType type = SHADER_TYPE_SURFACE; type <= SHADER_TYPE_BUMP;
+       type = static_cast<ShaderType>(type + 1)) {
+    const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ?
+                                                     osl_globals.surface_state :
+                                                 type == SHADER_TYPE_VOLUME ?
+                                                     osl_globals.volume_state :
+                                                 type == SHADER_TYPE_DISPLACEMENT ?
+                                                     osl_globals.displacement_state :
+                                                     osl_globals.bump_state);
+    for (const OSL::ShaderGroupRef &group : groups) {
+      if (group) {
+        string osl_ptx, init_name, entry_name;
+        osl_globals.ss->getattribute(group.get(), "group_init_name", init_name);
+        osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name);
+        osl_globals.ss->getattribute(
+            group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
+
+        int groupdata_size = 0;
+        osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
+        if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
+          set_error(
+              string_printf("Requested OSL group data size (%d) is greater than the maximum "
+                            "supported with OptiX (2048)",
+                            groupdata_size));
+          return false;
+        }
+
+        osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)});
+      }
+      else {
+        /* Add empty entry for non-existent shader groups, so that the index stays stable. */
+        osl_kernels.emplace_back();
+      }
+    }
+  }
+
+  const CUDAContextScope scope(this);
+
+  if (pipelines[PIP_SHADE]) {
+    optixPipelineDestroy(pipelines[PIP_SHADE]);
+  }
+
+  for (OptixModule &module : osl_modules) {
+    if (module != NULL) {
+      optixModuleDestroy(module);
+      module = NULL;
+    }
+  }
+  for (OptixProgramGroup &group : osl_groups) {
+    if (group != NULL) {
+      optixProgramGroupDestroy(group);
+      group = NULL;
+    }
+  }
+
+  OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+  OptixModuleCompileOptions module_options = {};
+  module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+  module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+  osl_groups.resize(osl_kernels.size() * 2 + 1);
+  osl_modules.resize(osl_kernels.size() + 1);
+
+  { /* Load and compile PTX module with OSL services. */
+    string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx");
+    if (!path_read_text(ptx_filename, ptx_data)) {
+      set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
+                              ptx_filename.c_str()));
+      return false;
+    }
+
+    const OptixResult result = optixModuleCreateFromPTX(context,
+                                                        &module_options,
+                                                        &pipeline_options,
+                                                        ptx_data.data(),
+                                                        ptx_data.size(),
+                                                        nullptr,
+                                                        0,
+                                                        &osl_modules.back());
+    if (result != OPTIX_SUCCESS) {
+      set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
+                              ptx_filename.c_str(),
+                              optixGetErrorName(result)));
+      return false;
+    }
+
+    OptixProgramGroupDesc group_desc = {};
+    group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
+    group_desc.callables.moduleDC = osl_modules.back();
+
+    optix_assert(optixProgramGroupCreate(
+        context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back()));
+  }
+
+  TaskPool pool;
+  vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
+
+  for (size_t i = 0; i < osl_kernels.size(); ++i) {
+    if (osl_kernels[i].ptx.empty()) {
+      continue;
+    }
+
+#    if OPTIX_ABI_VERSION >= 55
+    OptixTask task = nullptr;
+    results[i] = optixModuleCreateFromPTXWithTasks(context,
+                                                   &module_options,
+                                                   &pipeline_options,
+                                                   osl_kernels[i].ptx.data(),
+                                                   osl_kernels[i].ptx.size(),
+                                                   nullptr,
+                                                   nullptr,
+                                                   &osl_modules[i],
+                                                   &task);
+    if (results[i] == OPTIX_SUCCESS) {
+      execute_optix_task(pool, task, results[i]);
+    }
+#    else
+    pool.push([this, &results, i, &module_options, &osl_kernels]() {
+      results[i] = optixModuleCreateFromPTX(context,
+                                            &module_options,
+                                            &pipeline_options,
+                                            osl_kernels[i].ptx.data(),
+                                            osl_kernels[i].ptx.size(),
+                                            nullptr,
+                                            0,
+                                            &osl_modules[i]);
+    });
+#    endif
+  }
+
+  pool.wait_work();
+
+  for (size_t i = 0; i < osl_kernels.size(); ++i) {
+    if (osl_kernels[i].ptx.empty()) {
+      continue;
+    }
+
+    if (results[i] != OPTIX_SUCCESS) {
+      set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
+                              osl_kernels[i].init_entry.c_str(),
+                              optixGetErrorName(results[i])));
+      return false;
+    }
+
+    OptixProgramGroupDesc group_descs[2] = {};
+    group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str();
+    group_descs[0].callables.moduleDC = osl_modules[i];
+    group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+    group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str();
+    group_descs[1].callables.moduleDC = osl_modules[i];
+
+    optix_assert(optixProgramGroupCreate(
+        context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
+  }
+
+  vector<OptixStackSizes> osl_stack_size(osl_groups.size());
+
+  /* Update SBT with new entries. */
+  sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
+  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+    optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+  }
+  for (size_t i = 0; i < osl_groups.size(); ++i) {
+    if (osl_groups[i] != NULL) {
+      optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
+      optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i]));
+    }
+  }
+  sbt_data.copy_to_device(); /* Upload updated SBT to device. */
+
+  OptixPipelineLinkOptions link_options = {};
+  link_options.maxTraceDepth = 0;
+  link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
+
+  {
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
-    pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
-    pipeline_groups.push_back(groups[PG_MISS]);
-    pipeline_groups.push_back(groups[PG_HITD]);
-    pipeline_groups.push_back(groups[PG_HITS]);
-    pipeline_groups.push_back(groups[PG_HITL]);
-    pipeline_groups.push_back(groups[PG_HITV]);
-    if (motion_blur) {
-      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
-      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
-    }
-    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
-      pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
-      pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
+    pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
+    pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
+
+    for (const OptixProgramGroup &group : osl_groups) {
+      if (group != NULL) {
+        pipeline_groups.push_back(group);
+      }
     }
 
     optix_assert(optixPipelineCreate(context,
@@ -821,26 +1094,30 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
                                      pipeline_groups.size(),
                                      nullptr,
                                      0,
-                                     &pipelines[PIP_INTERSECT]));
+                                     &pipelines[PIP_SHADE]));
 
-    /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
-    const unsigned int css =
-        std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
-                 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
-                          std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
-                                   stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
-        link_options.maxTraceDepth * trace_css;
+    unsigned int dss = 0;
+    for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
+      dss = std::max(dss, osl_stack_size[i].dssDC);
+    }
 
-    optix_assert(
-        optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+    optix_assert(optixPipelineSetStackSize(
+        pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
   }
 
-  /* Clean up program group objects. */
-  for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
-    optixProgramGroupDestroy(groups[i]);
-  }
+  return !have_error();
+#  else
+  return false;
+#  endif
+}
 
-  return true;
+void *OptiXDevice::get_cpu_osl_memory()
+{
+#  ifdef WITH_OSL
+  return &osl_globals;
+#  else
+  return NULL;
+#  endif
 }
 
 /* --------------------------------------------------------------------
@@ -1567,7 +1844,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+      if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
         num_motion_steps = hair->get_motion_steps();
       }
 
@@ -1721,7 +1998,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+      if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
         num_motion_steps = mesh->get_motion_steps();
       }
 
@@ -1788,7 +2065,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       size_t num_motion_steps = 1;
       Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
-      if (motion_blur && pointcloud->get_use_motion_blur() && motion_points) {
+      if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
         num_motion_steps = pointcloud->get_motion_steps();
       }
 
@@ -1885,7 +2162,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
     /* Calculate total motion transform size and allocate memory for them. */
     size_t motion_transform_offset = 0;
-    if (motion_blur) {
+    if (pipeline_options.usesMotionBlur) {
       size_t total_motion_transform_size = 0;
       for (Object *const ob : bvh->objects) {
         if (ob->is_traceable() && ob->use_motion()) {
@@ -1936,7 +2213,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
 
       if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
           static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
-        if (motion_blur && ob->get_geometry()->has_motion_blur()) {
+        if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
           /* Select between motion blur and non-motion blur built-in intersection module. */
           instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
         }
@@ -1964,7 +2241,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
       }
 
       /* Insert motion traversable if object has motion. */
-      if (motion_blur && ob->use_motion()) {
+      if (pipeline_options.usesMotionBlur && ob->use_motion()) {
         size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
         size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
                                        motion_keys * sizeof(OptixSRTData);
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index 76c8af9bc3f..ad0e7b93454 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -9,6 +9,7 @@
 #  include "device/cuda/device_impl.h"
 #  include "device/optix/queue.h"
 #  include "device/optix/util.h"
+#  include "kernel/osl/globals.h"
 #  include "kernel/types.h"
 #  include "util/unique_ptr.h"
 
@@ -23,8 +24,16 @@ enum {
   PG_RGEN_INTERSECT_SHADOW,
   PG_RGEN_INTERSECT_SUBSURFACE,
   PG_RGEN_INTERSECT_VOLUME_STACK,
+  PG_RGEN_SHADE_BACKGROUND,
+  PG_RGEN_SHADE_LIGHT,
+  PG_RGEN_SHADE_SURFACE,
   PG_RGEN_SHADE_SURFACE_RAYTRACE,
   PG_RGEN_SHADE_SURFACE_MNEE,
+  PG_RGEN_SHADE_VOLUME,
+  PG_RGEN_SHADE_SHADOW,
+  PG_RGEN_EVAL_DISPLACE,
+  PG_RGEN_EVAL_BACKGROUND,
+  PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY,
   PG_MISS,
   PG_HITD, /* Default hit group. */
   PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@@ -40,14 +49,14 @@ enum {
 };
 
 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
-static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
 static const int NUM_HIT_PROGRAM_GROUPS = 8;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;
 
 /* List of OptiX pipelines. */
-enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES };
 
 /* A single shader binding table entry. */
 struct SbtRecord {
@@ -61,12 +70,20 @@ class OptiXDevice : public CUDADevice {
   OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
   OptixModule builtin_modules[2] = {};
   OptixPipeline pipelines[NUM_PIPELINES] = {};
+  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+  OptixPipelineCompileOptions pipeline_options = {};
 
-  bool motion_blur = false;
   device_vector<SbtRecord> sbt_data;
   device_only_memory<KernelParamsOptiX> launch_params;
-  OptixTraversableHandle tlas_handle = 0;
 
+#  ifdef WITH_OSL
+  OSLGlobals osl_globals;
+  vector<OptixModule> osl_modules;
+  vector<OptixProgramGroup> osl_groups;
+#  endif
+
+ private:
+  OptixTraversableHandle tlas_handle = 0;
   vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
   thread_mutex delayed_free_bvh_mutex;
 
@@ -100,13 +117,14 @@ class OptiXDevice : public CUDADevice {
   OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
   ~OptiXDevice();
 
- private:
   BVHLayoutMask get_bvh_layout_mask() const override;
 
   string compile_kernel_get_common_cflags(const uint kernel_features);
 
   bool load_kernels(const uint kernel_features) override;
 
+  bool load_osl_kernels() override;
+
   bool build_optix_bvh(BVHOptiX *bvh,
                        OptixBuildOperation operation,
                        const OptixBuildInput &build_input,
@@ -123,6 +141,8 @@ class OptiXDevice : public CUDADevice {
 
   virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
 
+  void *get_cpu_osl_memory() override;
+
   /* --------------------------------------------------------------------
    * Denoising.
    */
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 3bc547ed11d..1bfd154d449 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution()
   CUDADeviceQueue::init_execution();
 }
 
-static bool is_optix_specific_kernel(DeviceKernel kernel)
+static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl)
 {
-  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+#  ifdef WITH_OSL
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  if (use_osl && device_kernel_has_shading(kernel)) {
+    return true;
+  }
+#  else
+  (void)use_osl;
+#  endif
+
+  return device_kernel_has_intersection(kernel);
 }
 
 bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                                const int work_size,
                                DeviceKernelArguments const &args)
 {
-  if (!is_optix_specific_kernel(kernel)) {
+  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+#  ifdef WITH_OSL
+  const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use;
+#  else
+  const bool use_osl = false;
+#  endif
+
+  if (!is_optix_specific_kernel(kernel, use_osl)) {
     return CUDADeviceQueue::enqueue(kernel, work_size, args);
   }
 
@@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
 
   const CUDAContextScope scope(cuda_device_);
 
-  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
-
   const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
   const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
 
@@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                         sizeof(device_ptr),
                         cuda_stream_));
 
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) {
     cuda_device_assert(
         cuda_device_,
         cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                           sizeof(device_ptr),
                           cuda_stream_));
   }
+  if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) {
+    cuda_device_assert(cuda_device_,
+                       cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset),
+                                         args.values[2],  // &d_offset
+                                         sizeof(int32_t),
+                                         cuda_stream_));
+  }
 
   cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
 
@@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
   OptixShaderBindingTable sbt_params = {};
 
   switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord);
+      break;
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
-      pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
       break;
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
-      pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
       break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord);
+      break;
+
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
       pipeline = optix_device->pipelines[PIP_INTERSECT];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
       break;
 
+    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr +
+                                PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord);
+      break;
+
     default:
       LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
                  << " is attempted to be enqueued.";
@@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
 
   sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
   sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
-  sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+  sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
   sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
   sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
   sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
@@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
   sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
   sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
 
+#  ifdef WITH_OSL
+  if (use_osl) {
+    sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size());
+  }
+#  endif
+
   /* Launch the ray generation program. */
   optix_device_assert(optix_device,
                       optixLaunch(pipeline,
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 3fbb346e94f..99f9e536977 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -37,6 +37,14 @@ set(SRC_KERNEL_DEVICE_OPTIX
   device/optix/kernel_shader_raytrace.cu
 )
 
+if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+  set(SRC_KERNEL_DEVICE_OPTIX
+    ${SRC_KERNEL_DEVICE_OPTIX}
+    osl/services_optix.cu
+    device/optix/kernel_osl.cu
+  )
+endif()
+
 set(SRC_KERNEL_DEVICE_ONEAPI
   device/oneapi/kernel.cpp
 )
@@ -181,6 +189,16 @@ set(SRC_KERNEL_SVM_HEADERS
   svm/vertex_color.h
 )
 
+if(WITH_CYCLES_OSL)
+  set(SRC_KERNEL_OSL_HEADERS
+    osl/osl.h
+    osl/closures_setup.h
+    osl/closures_template.h
+    osl/services_gpu.h
+    osl/types.h
+  )
+endif()
+
 set(SRC_KERNEL_GEOM_HEADERS
   geom/geom.h
   geom/attribute.h
@@ -306,6 +324,7 @@ set(SRC_KERNEL_HEADERS
   ${SRC_KERNEL_GEOM_HEADERS}
   ${SRC_KERNEL_INTEGRATOR_HEADERS}
   ${SRC_KERNEL_LIGHT_HEADERS}
+  ${SRC_KERNEL_OSL_HEADERS}
   ${SRC_KERNEL_SAMPLE_HEADERS}
   ${SRC_KERNEL_SVM_HEADERS}
   ${SRC_KERNEL_TYPES_HEADERS}
@@ -708,6 +727,16 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
     kernel_optix_shader_raytrace
     "device/optix/kernel_shader_raytrace.cu"
     "--keep-device-functions")
+  if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+    CYCLES_OPTIX_KERNEL_ADD(
+      kernel_optix_osl
+      "device/optix/kernel_osl.cu"
+      "--relocatable-device-code=true")
+    CYCLES_OPTIX_KERNEL_ADD(
+      kernel_optix_osl_services
+      "osl/services_optix.cu"
+      "--relocatable-device-code=true")
+  endif()
 
   add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
   cycles_set_solution_folder(cycles_kernel_optix)
@@ -995,6 +1024,7 @@ source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
 source_group("kernel" FILES ${SRC_KERNEL_TYPES_HEADERS})
 source_group("light" FILES ${SRC_KERNEL_LIGHT_HEADERS})
+source_group("osl" FILES ${SRC_KERNEL_OSL_HEADERS})
 source_group("sample" FILES ${SRC_KERNEL_SAMPLE_HEADERS})
 source_group("svm" FILES ${SRC_KERNEL_SVM_HEADERS})
 source_group("util" FILES ${SRC_KERNEL_UTIL_HEADERS})
@@ -1031,6 +1061,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLE
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_LIGHT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/light)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_OSL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/osl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SAMPLE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/sample)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_TYPES_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 71af68aa80e..2f5c5d7bd0c 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -297,8 +297,10 @@ ccl_device_inline void bsdf_roughness_eta(const KernelGlobals kg,
                                           ccl_private float2 *roughness,
                                           ccl_private float *eta)
 {
+#ifdef __SVM__
   bool refractive = false;
   float alpha = 1.0f;
+#endif
   switch (sc->type) {
     case CLOSURE_BSDF_DIFFUSE_ID:
       *roughness = one_float2();
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 51e1381d552..3a950779c11 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -30,6 +30,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */
 
 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #if __CUDA_ARCH__ < 500
 #  define ccl_device_inline __device__ __forceinline__
 #  define ccl_device_forceinline __device__ __forceinline__
@@ -109,14 +110,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
 
 typedef unsigned short half;
 
-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
   half val;
   asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
   return val;
 }
 
-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
   float val;
   asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 648988c31b6..8755395c82c 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -28,6 +28,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */
 
 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline __device__ __inline__
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index f689e93e5a2..2dd6cc98b59 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -38,6 +38,7 @@ using namespace metal::raytracing;
 #  define ccl_device_noinline ccl_device __attribute__((noinline))
 #endif
 
+#define ccl_device_extern extern "C"
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index dfaec65130c..b83512180d7 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -28,6 +28,7 @@
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device
+#define ccl_device_extern extern "C"
 #define ccl_global
 #define ccl_always_inline __attribute__((always_inline))
 #define ccl_device_inline inline
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index 1a11a533b7e..e13101f57b8 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -33,14 +33,16 @@ typedef unsigned long long uint64_t;
 #endif
 
 #define ccl_device \
-  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+  static __device__ \
+      __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
-#define ccl_device_inline_method ccl_device
-#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_inline_method __device__ __forceinline__
+#define ccl_device_noinline static __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
-#define ccl_inline_constant __constant__
+#define ccl_inline_constant static __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -57,23 +59,6 @@ typedef unsigned long long uint64_t;
 
 #define kernel_assert(cond)
 
-/* GPU thread, block, grid size and index */
-
-#define ccl_gpu_thread_idx_x (threadIdx.x)
-#define ccl_gpu_block_dim_x (blockDim.x)
-#define ccl_gpu_block_idx_x (blockIdx.x)
-#define ccl_gpu_grid_dim_x (gridDim.x)
-#define ccl_gpu_warp_size (warpSize)
-#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
-#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-
-/* GPU warp synchronization. */
-
-#define ccl_gpu_syncthreads() __syncthreads()
-#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
-
 /* GPU texture objects */
 
 typedef unsigned long long CUtexObject;
@@ -101,14 +86,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D
 
 typedef unsigned short half;
 
-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
   half val;
   asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
   return val;
 }
 
-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
   float val;
   asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
index 7af2e421378..126df74bc8c 100644
--- a/intern/cycles/kernel/device/optix/globals.h
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -25,6 +25,7 @@ struct KernelParamsOptiX {
   /* Kernel arguments */
   const int *path_index_array;
   float *render_buffer;
+  int offset;
 
   /* Global scene data and textures */
   KernelData data;
@@ -36,7 +37,11 @@ struct KernelParamsOptiX {
 };
 
 #ifdef __NVCC__
-extern "C" static __constant__ KernelParamsOptiX kernel_params;
+extern "C"
+#  ifndef __CUDACC_RDC__
+    static
+#  endif
+    __constant__ KernelParamsOptiX kernel_params;
 #endif
 
 /* Abstraction macros */
diff --git a/intern/cycles/kernel/device/optix/kernel_osl.cu b/intern/cycles/kernel/device/optix/kernel_osl.cu
new file mode 100644
index 00000000000..0f3f477935b
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_osl.cu
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+/* Copy of the regular OptiX kernels with additional OSL support. */
+
+#include "kernel/device/optix/kernel_shader_raytrace.cu"
+
+#include "kernel/bake/bake.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_volume.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_background()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_background(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_light()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_light(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_surface(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_volume()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_volume(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_shadow()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_shadow(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_displace()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_displace_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_background()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_background_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_curve_shadow_transparency()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_curve_shadow_transparency_evaluate(nullptr, input, output, global_index);
+}
diff --git a/intern/cycles/kernel/integrator/displacement_shader.h b/intern/cycles/kernel/integrator/displacement_shader.h
index 839dfe244ac..a6e9d674396 100644
--- a/intern/cycles/kernel/integrator/displacement_shader.h
+++ b/intern/cycles/kernel/integrator/displacement_shader.h
@@ -24,8 +24,8 @@ ccl_device void displacement_shader_eval(KernelGlobals kg,
 
   /* this will modify sd->P */
 #ifdef __OSL__
-  if (kg->osl) {
-    OSLShader::eval_displacement(kg, state, sd);
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(kg, state, sd, 0);
   }
   else
 #endif
diff --git a/intern/cycles/kernel/integrator/surface_shader.h b/intern/cycles/kernel/integrator/surface_shader.h
index 6c0097b11bd..5e47a34f77e 100644
--- a/intern/cycles/kernel/integrator/surface_shader.h
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -827,13 +827,8 @@ ccl_device void surface_shader_eval(KernelGlobals kg,
   sd->num_closure_left = max_closures;
 
 #ifdef __OSL__
-  if (kg->osl) {
-    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
-      OSLShader::eval_background(kg, state, sd, path_flag);
-    }
-    else {
-      OSLShader::eval_surface(kg, state, sd, path_flag);
-    }
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_SURFACE>(kg, state, sd, path_flag);
   }
   else
 #endif
diff --git a/intern/cycles/kernel/integrator/volume_shader.h b/intern/cycles/kernel/integrator/volume_shader.h
index 0ff968723a1..f9050647c6d 100644
--- a/intern/cycles/kernel/integrator/volume_shader.h
+++ b/intern/cycles/kernel/integrator/volume_shader.h
@@ -493,8 +493,8 @@ ccl_device_inline void volume_shader_eval(KernelGlobals kg,
 
     /* evaluate shader */
 #  ifdef __OSL__
-    if (kg->osl) {
-      OSLShader::eval_volume(kg, state, sd, path_flag);
+    if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+      osl_eval_nodes<SHADER_TYPE_VOLUME>(kg, state, sd, path_flag);
     }
     else
 #  endif
diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp
index d56e0551a91..6800c765345 100644
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -25,13 +25,18 @@
 
 #include "kernel/osl/osl.h"
 
-#include "kernel/osl/closures_setup.h"
-
 #define TO_VEC3(v) OSL::Vec3(v.x, v.y, v.z)
 #define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])
 
 CCL_NAMESPACE_BEGIN
 
+static_assert(sizeof(OSLClosure) == sizeof(OSL::ClosureColor) &&
+              sizeof(OSLClosureAdd) == sizeof(OSL::ClosureAdd) &&
+              sizeof(OSLClosureMul) == sizeof(OSL::ClosureMul) &&
+              sizeof(OSLClosureComponent) == sizeof(OSL::ClosureComponent));
+static_assert(sizeof(ShaderGlobals) == sizeof(OSL::ShaderGlobals) &&
+              offsetof(ShaderGlobals, Ci) == offsetof(OSL::ShaderGlobals, Ci));
+
 /* Registration */
 
 #define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
@@ -60,53 +65,18 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
 #include "closures_template.h"
 }
 
-/* Globals */
+/* Surface & Background */
 
-static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
-                                        ShaderData *sd,
-                                        const void *state,
-                                        uint32_t path_flag,
-                                        OSLThreadData *tdata)
+template<>
+void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
+                                         const void *state,
+                                         ShaderData *sd,
+                                         uint32_t path_flag)
 {
-  OSL::ShaderGlobals *globals = &tdata->globals;
-
-  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
-  const differential3 dI = differential_from_compact(sd->I, sd->dI);
-
-  /* copy from shader data to shader globals */
-  globals->P = TO_VEC3(sd->P);
-  globals->dPdx = TO_VEC3(dP.dx);
-  globals->dPdy = TO_VEC3(dP.dy);
-  globals->I = TO_VEC3(sd->I);
-  globals->dIdx = TO_VEC3(dI.dx);
-  globals->dIdy = TO_VEC3(dI.dy);
-  globals->N = TO_VEC3(sd->N);
-  globals->Ng = TO_VEC3(sd->Ng);
-  globals->u = sd->u;
-  globals->dudx = sd->du.dx;
-  globals->dudy = sd->du.dy;
-  globals->v = sd->v;
-  globals->dvdx = sd->dv.dx;
-  globals->dvdy = sd->dv.dy;
-  globals->dPdu = TO_VEC3(sd->dPdu);
-  globals->dPdv = TO_VEC3(sd->dPdv);
-  globals->surfacearea = 1.0f;
-  globals->time = sd->time;
-
-  /* booleans */
-  globals->raytype = path_flag;
-  globals->flipHandedness = 0;
-  globals->backfacing = (sd->flag & SD_BACKFACING);
-
-  /* shader data to be used in services callbacks */
-  globals->renderstate = sd;
-
-  /* hacky, we leave it to services to fetch actual object matrix */
-  globals->shader2common = sd;
-  globals->object2common = sd;
-
-  /* must be set to NULL before execute */
-  globals->Ci = NULL;
+  /* setup shader globals from shader data */
+  OSLThreadData *tdata = kg->osl_tdata;
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
 
   /* clear trace data */
   tdata->tracedata.init = false;
@@ -121,53 +91,6 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
     sd->osl_path_state = (const IntegratorStateCPU *)state;
     sd->osl_shadow_path_state = nullptr;
   }
-}
-
-static void flatten_closure_tree(const KernelGlobalsCPU *kg,
-                                 ShaderData *sd,
-                                 uint32_t path_flag,
-                                 const OSL::ClosureColor *closure,
-                                 float3 weight = make_float3(1.0f, 1.0f, 1.0f))
-{
-  /* OSL gives us a closure tree, we flatten it into arrays per
-   * closure type, for evaluation, sampling, etc later on. */
-
-  switch (closure->id) {
-    case OSL::ClosureColor::MUL: {
-      OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-      flatten_closure_tree(kg, sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
-      break;
-    }
-    case OSL::ClosureColor::ADD: {
-      OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-      flatten_closure_tree(kg, sd, path_flag, add->closureA, weight);
-      flatten_closure_tree(kg, sd, path_flag, add->closureB, weight);
-      break;
-    }
-#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
-  case OSL_CLOSURE_##Upper##_ID: { \
-    const OSL::ClosureComponent *comp = reinterpret_cast<const OSL::ClosureComponent *>(closure); \
-    weight *= TO_FLOAT3(comp->w); \
-    osl_closure_##lower##_setup( \
-        kg, sd, path_flag, weight, reinterpret_cast<const Upper##Closure *>(comp + 1)); \
-    break; \
-  }
-#include "closures_template.h"
-    default:
-      break;
-  }
-}
-
-/* Surface */
-
-void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
-                             const void *state,
-                             ShaderData *sd,
-                             uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
 
   /* execute shader for this point */
   OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -175,101 +98,99 @@ void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
   OSL::ShadingContext *octx = tdata->context;
   int shader = sd->shader & SHADER_MASK;
 
-  /* automatic bump shader */
-  if (kg->osl->bump_state[shader]) {
-    /* save state */
-    const float3 P = sd->P;
-    const float dP = sd->dP;
-    const OSL::Vec3 dPdx = globals->dPdx;
-    const OSL::Vec3 dPdy = globals->dPdy;
-
-    /* set state as if undisplaced */
-    if (sd->flag & SD_HAS_DISPLACEMENT) {
-      float data[9];
-      bool found = kg->osl->services->get_attribute(sd,
-                                                    true,
-                                                    OSLRenderServices::u_empty,
-                                                    TypeDesc::TypeVector,
-                                                    OSLRenderServices::u_geom_undisplaced,
-                                                    data);
-      (void)found;
-      assert(found);
-
-      differential3 tmp_dP;
-      memcpy(&sd->P, data, sizeof(float) * 3);
-      memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
-      memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
-
-      object_position_transform(kg, sd, &sd->P);
-      object_dir_transform(kg, sd, &tmp_dP.dx);
-      object_dir_transform(kg, sd, &tmp_dP.dy);
-
-      sd->dP = differential_make_compact(tmp_dP);
-
-      globals->P = TO_VEC3(sd->P);
-      globals->dPdx = TO_VEC3(tmp_dP.dx);
-      globals->dPdy = TO_VEC3(tmp_dP.dy);
+  if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
+    /* background */
+    if (kg->osl->background_state) {
+      ss->execute(octx, *(kg->osl->background_state), *globals);
     }
-
-    /* execute bump shader */
-    ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
-
-    /* reset state */
-    sd->P = P;
-    sd->dP = dP;
-
-    globals->P = TO_VEC3(P);
-    globals->dPdx = TO_VEC3(dPdx);
-    globals->dPdy = TO_VEC3(dPdy);
   }
+  else {
+    /* automatic bump shader */
+    if (kg->osl->bump_state[shader]) {
+      /* save state */
+      const float3 P = sd->P;
+      const float dP = sd->dP;
+      const OSL::Vec3 dPdx = globals->dPdx;
+      const OSL::Vec3 dPdy = globals->dPdy;
+
+      /* set state as if undisplaced */
+      if (sd->flag & SD_HAS_DISPLACEMENT) {
+        float data[9];
+        bool found = kg->osl->services->get_attribute(sd,
+                                                      true,
+                                                      OSLRenderServices::u_empty,
+                                                      TypeDesc::TypeVector,
+                                                      OSLRenderServices::u_geom_undisplaced,
+                                                      data);
+        (void)found;
+        assert(found);
+
+        differential3 tmp_dP;
+        memcpy(&sd->P, data, sizeof(float) * 3);
+        memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
+        memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
+
+        object_position_transform(kg, sd, &sd->P);
+        object_dir_transform(kg, sd, &tmp_dP.dx);
+        object_dir_transform(kg, sd, &tmp_dP.dy);
+
+        sd->dP = differential_make_compact(tmp_dP);
+
+        globals->P = TO_VEC3(sd->P);
+        globals->dPdx = TO_VEC3(tmp_dP.dx);
+        globals->dPdy = TO_VEC3(tmp_dP.dy);
+      }
+
+      /* execute bump shader */
+      ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
+
+      /* reset state */
+      sd->P = P;
+      sd->dP = dP;
+
+      globals->P = TO_VEC3(P);
+      globals->dPdx = TO_VEC3(dPdx);
+      globals->dPdy = TO_VEC3(dPdy);
+    }
 
-  /* surface shader */
-  if (kg->osl->surface_state[shader]) {
-    ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+    /* surface shader */
+    if (kg->osl->surface_state[shader]) {
+      ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+    }
   }
 
   /* flatten closure tree */
   if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+    flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
   }
 }
 
-/* Background */
+/* Volume */
 
-void OSLShader::eval_background(const KernelGlobalsCPU *kg,
-                                const void *state,
-                                ShaderData *sd,
-                                uint32_t path_flag)
+template<>
+void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
+                                        const void *state,
+                                        ShaderData *sd,
+                                        uint32_t path_flag)
 {
   /* setup shader globals from shader data */
   OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
 
-  /* execute shader for this point */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
+  /* clear trace data */
+  tdata->tracedata.init = false;
 
-  if (kg->osl->background_state) {
-    ss->execute(octx, *(kg->osl->background_state), *globals);
+  /* Used by render-services. */
+  sd->osl_globals = kg;
+  if (path_flag & PATH_RAY_SHADOW) {
+    sd->osl_path_state = nullptr;
+    sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
   }
-
-  /* return background color immediately */
-  if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+  else {
+    sd->osl_path_state = (const IntegratorStateCPU *)state;
+    sd->osl_shadow_path_state = nullptr;
   }
-}
-
-/* Volume */
-
-void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
-                            const void *state,
-                            ShaderData *sd,
-                            uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
 
   /* execute shader */
   OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -283,17 +204,30 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
 
   /* flatten closure tree */
   if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+    flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
   }
 }
 
 /* Displacement */
 
-void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
+template<>
+void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
+                                              const void *state,
+                                              ShaderData *sd,
+                                              uint32_t path_flag)
 {
   /* setup shader globals from shader data */
   OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+
+  /* clear trace data */
+  tdata->tracedata.init = false;
+
+  /* Used by render-services. */
+  sd->osl_globals = kg;
+  sd->osl_path_state = (const IntegratorStateCPU *)state;
+  sd->osl_shadow_path_state = nullptr;
 
   /* execute shader */
   OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
diff --git a/intern/cycles/kernel/osl/closures_setup.h b/intern/cycles/kernel/osl/closures_setup.h
index 96c551b9951..ceaf56ccba6 100644
--- a/intern/cycles/kernel/osl/closures_setup.h
+++ b/intern/cycles/kernel/osl/closures_setup.h
@@ -40,12 +40,7 @@ CCL_NAMESPACE_BEGIN
     const char *label;
 #define OSL_CLOSURE_STRUCT_END(Upper, lower) \
   } \
-  ; \
-  ccl_device void osl_closure_##lower##_setup(KernelGlobals kg, \
-                                              ccl_private ShaderData *sd, \
-                                              uint32_t path_flag, \
-                                              float3 weight, \
-                                              ccl_private Upper##Closure *closure);
+  ;
 #define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) type name;
 #define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) type name[size];
 
@@ -210,11 +205,9 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
   bsdf->ior = closure->ior;
   bsdf->T = closure->T;
 
-  static OSL::ustring u_ggx("ggx");
-  static OSL::ustring u_default("default");
-
   /* GGX */
-  if (closure->distribution == u_ggx || closure->distribution == u_default) {
+  if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
+      closure->distribution == make_string("default", 4430693559278735917ull)) {
     if (!closure->refract) {
       if (closure->alpha_x == closure->alpha_y) {
         /* Isotropic */
@@ -1000,18 +993,14 @@ ccl_device void osl_closure_bssrdf_setup(KernelGlobals kg,
                                          float3 weight,
                                          ccl_private const BSSRDFClosure *closure)
 {
-  static ustring u_burley("burley");
-  static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
-  static ustring u_random_walk("random_walk");
-
   ClosureType type;
-  if (closure->method == u_burley) {
+  if (closure->method == make_string("burley", 186330084368958868ull)) {
     type = CLOSURE_BSSRDF_BURLEY_ID;
   }
-  else if (closure->method == u_random_walk_fixed_radius) {
+  else if (closure->method == make_string("random_walk_fixed_radius", 5695810351010063150ull)) {
     type = CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID;
   }
-  else if (closure->method == u_random_walk) {
+  else if (closure->method == make_string("random_walk", 11360609267673527222ull)) {
     type = CLOSURE_BSSRDF_RANDOM_WALK_ID;
   }
   else {
diff --git a/intern/cycles/kernel/osl/closures_template.h b/intern/cycles/kernel/osl/closures_template.h
index c808b275966..b9e9b52dcf8 100644
--- a/intern/cycles/kernel/osl/closures_template.h
+++ b/intern/cycles/kernel/osl/closures_template.h
@@ -40,7 +40,7 @@ OSL_CLOSURE_STRUCT_BEGIN(Transparent, transparent)
 OSL_CLOSURE_STRUCT_END(Transparent, transparent)
 
 OSL_CLOSURE_STRUCT_BEGIN(Microfacet, microfacet)
-  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, ustring, distribution, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, DeviceString, distribution, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, N, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, T, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_x, NULL)
@@ -210,7 +210,7 @@ OSL_CLOSURE_STRUCT_BEGIN(PhongRamp, phong_ramp)
 OSL_CLOSURE_STRUCT_END(PhongRamp, phong_ramp)
 
 OSL_CLOSURE_STRUCT_BEGIN(BSSRDF, bssrdf)
-  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, ustring, method, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, DeviceString, method, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, N, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, radius, NULL)
   OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, albedo, NULL)
diff --git a/intern/cycles/kernel/osl/osl.h b/intern/cycles/kernel/osl/osl.h
index bef23f3eea1..cc5c81ad027 100644
--- a/intern/cycles/kernel/osl/osl.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -1,38 +1,171 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */
 
 #pragma once
 
 /* OSL Shader Engine
  *
- * Holds all variables to execute and use OSL shaders from the kernel. These
- * are initialized externally by OSLShaderManager before rendering starts.
- *
- * Before/after a thread starts rendering, thread_init/thread_free must be
- * called, which will store any per thread OSL state in thread local storage.
- * This means no thread state must be passed along in the kernel itself.
+ * Holds all variables to execute and use OSL shaders from the kernel.
  */
 
 #include "kernel/osl/types.h"
 
+#include "kernel/osl/closures_setup.h"
+
 CCL_NAMESPACE_BEGIN
 
-class OSLShader {
- public:
-  /* eval */
-  static void eval_surface(const KernelGlobalsCPU *kg,
-                           const void *state,
-                           ShaderData *sd,
-                           uint32_t path_flag);
-  static void eval_background(const KernelGlobalsCPU *kg,
-                              const void *state,
-                              ShaderData *sd,
-                              uint32_t path_flag);
-  static void eval_volume(const KernelGlobalsCPU *kg,
-                          const void *state,
-                          ShaderData *sd,
-                          uint32_t path_flag);
-  static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
-};
+ccl_device_inline void shaderdata_to_shaderglobals(KernelGlobals kg,
+                                                   ccl_private ShaderData *sd,
+                                                   uint32_t path_flag,
+                                                   ccl_private ShaderGlobals *globals)
+{
+  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+  const differential3 dI = differential_from_compact(sd->I, sd->dI);
+
+  /* copy from shader data to shader globals */
+  globals->P = sd->P;
+  globals->dPdx = dP.dx;
+  globals->dPdy = dP.dy;
+  globals->I = sd->I;
+  globals->dIdx = dI.dx;
+  globals->dIdy = dI.dy;
+  globals->N = sd->N;
+  globals->Ng = sd->Ng;
+  globals->u = sd->u;
+  globals->dudx = sd->du.dx;
+  globals->dudy = sd->du.dy;
+  globals->v = sd->v;
+  globals->dvdx = sd->dv.dx;
+  globals->dvdy = sd->dv.dy;
+  globals->dPdu = sd->dPdu;
+  globals->dPdv = sd->dPdv;
+  globals->time = sd->time;
+  globals->dtime = 1.0f;
+  globals->surfacearea = 1.0f;
+  globals->raytype = path_flag;
+  globals->flipHandedness = 0;
+  globals->backfacing = (sd->flag & SD_BACKFACING);
+
+  /* shader data to be used in services callbacks */
+  globals->renderstate = sd;
+
+  /* hacky, we leave it to services to fetch actual object matrix */
+  globals->shader2common = sd;
+  globals->object2common = sd;
+
+  /* must be set to NULL before execute */
+  globals->Ci = nullptr;
+}
+
+ccl_device void flatten_closure_tree(KernelGlobals kg,
+                                     ccl_private ShaderData *sd,
+                                     uint32_t path_flag,
+                                     ccl_private const OSLClosure *closure)
+{
+  int stack_size = 0;
+  float3 weight = one_float3();
+  float3 weight_stack[16];
+  ccl_private const OSLClosure *closure_stack[16];
+
+  while (closure) {
+    switch (closure->id) {
+      case OSL_CLOSURE_MUL_ID: {
+        ccl_private const OSLClosureMul *mul = static_cast<ccl_private const OSLClosureMul *>(
+            closure);
+        weight *= mul->weight;
+        closure = mul->closure;
+        continue;
+      }
+      case OSL_CLOSURE_ADD_ID: {
+        if (stack_size >= 16) {
+          kernel_assert(!"Exhausted OSL closure stack");
+          break;
+        }
+        ccl_private const OSLClosureAdd *add = static_cast<ccl_private const OSLClosureAdd *>(
+            closure);
+        closure = add->closureA;
+        weight_stack[stack_size] = weight;
+        closure_stack[stack_size++] = add->closureB;
+        continue;
+      }
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+  case OSL_CLOSURE_##Upper##_ID: { \
+    ccl_private const OSLClosureComponent *comp = \
+        static_cast<ccl_private const OSLClosureComponent *>(closure); \
+    osl_closure_##lower##_setup(kg, \
+                                sd, \
+                                path_flag, \
+                                weight * comp->weight, \
+                                reinterpret_cast<ccl_private const Upper##Closure *>(comp + 1)); \
+    break; \
+  }
+#include "closures_template.h"
+      default:
+        break;
+    }
+
+    if (stack_size > 0) {
+      weight = weight_stack[--stack_size];
+      closure = closure_stack[stack_size];
+    }
+    else {
+      closure = nullptr;
+    }
+  }
+}
+
+#ifndef __KERNEL_GPU__
+
+template<ShaderType type>
+void osl_eval_nodes(const KernelGlobalsCPU *kg,
+                    const void *state,
+                    ShaderData *sd,
+                    uint32_t path_flag);
+
+#else
+
+template<ShaderType type, typename ConstIntegratorGenericState>
+ccl_device_inline void osl_eval_nodes(KernelGlobals kg,
+                                      ConstIntegratorGenericState state,
+                                      ccl_private ShaderData *sd,
+                                      uint32_t path_flag)
+{
+  ShaderGlobals globals;
+  shaderdata_to_shaderglobals(kg, sd, path_flag, &globals);
+
+  const int shader = sd->shader & SHADER_MASK;
+
+#  ifdef __KERNEL_OPTIX__
+  uint8_t group_data[2048];
+  uint8_t closure_pool[1024];
+  sd->osl_closure_pool = closure_pool;
+
+  unsigned int optix_dc_index = 2 /* NUM_CALLABLE_PROGRAM_GROUPS */ +
+                                (shader + type * kernel_data.max_shaders) * 2;
+  optixDirectCall<void>(optix_dc_index + 0,
+                        /* shaderglobals_ptr = */ &globals,
+                        /* groupdata_ptr = */ (void *)group_data,
+                        /* userdata_base_ptr = */ (void *)nullptr,
+                        /* output_base_ptr = */ (void *)nullptr,
+                        /* shadeindex = */ 0);
+  optixDirectCall<void>(optix_dc_index + 1,
+                        /* shaderglobals_ptr = */ &globals,
+                        /* groupdata_ptr = */ (void *)group_data,
+                        /* userdata_base_ptr = */ (void *)nullptr,
+                        /* output_base_ptr = */ (void *)nullptr,
+                        /* shadeindex = */ 0);
+#  endif
+
+  if (globals.Ci) {
+    flatten_closure_tree(kg, sd, path_flag, globals.Ci);
+  }
+}
+
+#endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index b744422ee78..454b75ea4d9 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -119,8 +119,8 @@ ustring OSLRenderServices::u_u("u");
 ustring OSLRenderServices::u_v("v");
 ustring OSLRenderServices::u_empty;
 
-OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system)
-    : OSL::RendererServices(texture_system)
+OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system, int device_type)
+    : OSL::RendererServices(texture_system), device_type_(device_type)
 {
 }
 
@@ -131,6 +131,17 @@ OSLRenderServices::~OSLRenderServices()
   }
 }
 
+int OSLRenderServices::supports(string_view feature) const
+{
+#ifdef WITH_OPTIX
+  if (feature == "OptiX") {
+    return device_type_ == DEVICE_OPTIX;
+  }
+#endif
+
+  return false;
+}
+
 bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
                                    OSL::Matrix44 &result,
                                    OSL::TransformationPtr xform,
@@ -1139,29 +1150,39 @@ TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring file
 {
   OSLTextureHandleMap::iterator it = textures.find(filename);
 
-  /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
-  if (it != textures.end()) {
-    if (it->second->type != OSLTextureHandle::OIIO) {
-      return (TextureSystem::TextureHandle *)it->second.get();
+  if (device_type_ == DEVICE_CPU) {
+    /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
+    if (it != textures.end()) {
+      if (it->second->type != OSLTextureHandle::OIIO) {
+        return (TextureSystem::TextureHandle *)it->second.get();
+      }
     }
-  }
 
-  /* Get handle from OpenImageIO. */
-  OSL::TextureSystem *ts = m_texturesys;
-  TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
-  if (handle == NULL) {
-    return NULL;
-  }
+    /* Get handle from OpenImageIO. */
+    OSL::TextureSystem *ts = m_texturesys;
+    TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
+    if (handle == NULL) {
+      return NULL;
+    }
+
+    /* Insert new OSLTextureHandle if needed. */
+    if (it == textures.end()) {
+      textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
+      it = textures.find(filename);
+    }
 
-  /* Insert new OSLTextureHandle if needed. */
-  if (it == textures.end()) {
-    textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
-    it = textures.find(filename);
+    /* Assign OIIO texture handle and return. */
+    it->second->oiio_handle = handle;
+    return (TextureSystem::TextureHandle *)it->second.get();
   }
+  else {
+    if (it != textures.end() && it->second->type == OSLTextureHandle::SVM && it->second->svm_slots[0].w == -1) {
+        return reinterpret_cast<TextureSystem::TextureHandle *>(
+            static_cast<uintptr_t>(it->second->svm_slots[0].y + 1));
+    }
 
-  /* Assign OIIO texture handle and return. */
-  it->second->oiio_handle = handle;
-  return (TextureSystem::TextureHandle *)it->second.get();
+    return NULL;
+  }
 }
 
 bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h
index 334b6682e34..9d875ae8e94 100644
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -22,11 +22,8 @@ class PtexCache;
 
 CCL_NAMESPACE_BEGIN
 
-class Object;
 class Scene;
-class Shader;
 struct ShaderData;
-struct float3;
 struct KernelGlobalsCPU;
 
 /* OSL Texture Handle
@@ -73,11 +70,13 @@ typedef OIIO::unordered_map_concurrent<ustring, OSLTextureHandleRef, ustringHash
 
 class OSLRenderServices : public OSL::RendererServices {
  public:
-  OSLRenderServices(OSL::TextureSystem *texture_system);
+  OSLRenderServices(OSL::TextureSystem *texture_system, int device_type);
   ~OSLRenderServices();
 
   static void register_closures(OSL::ShadingSystem *ss);
 
+  int supports(string_view feature) const override;
+
   bool get_matrix(OSL::ShaderGlobals *sg,
                   OSL::Matrix44 &result,
                   OSL::TransformationPtr xform,
@@ -324,6 +323,9 @@ class OSLRenderServices : public OSL::RendererServices {
    * and is required because texture handles are cached as part of the shared
    * shading system. */
   OSLTextureHandleMap textures;
+
+ private:
+  int device_type_;
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/services_gpu.h b/intern/cycles/kernel/osl/services_gpu.h
new file mode 100644
index 00000000000..e6e19b8c484
--- /dev/null
+++ b/intern/cycles/kernel/osl/services_gpu.h
@@ -0,0 +1,2149 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */
+
+#include "kernel/tables.h"
+#include "kernel/util/differential.h"
+
+#include "kernel/osl/osl.h"
+
+namespace DeviceStrings {
+
+/* "" */
+ccl_device_constant DeviceString _emptystring_ = {0ull};
+/* "NDC" */
+ccl_device_constant DeviceString u_ndc = {5148305047403260775ull};
+/* "screen" */
+ccl_device_constant DeviceString u_screen = {14159088609039777114ull};
+/* "camera" */
+ccl_device_constant DeviceString u_camera = {2159505832145726196ull};
+/* "raster" */
+ccl_device_constant DeviceString u_raster = {7759263238610201778ull};
+/* "world" */
+ccl_device_constant DeviceString u_world = {16436542438370751598ull};
+/* "common" */
+ccl_device_constant DeviceString u_common = {14645198576927606093ull};
+/* "hsv" */
+ccl_device_constant DeviceString u_hsv = {2177035556331879497ull};
+/* "hsl" */
+ccl_device_constant DeviceString u_hsl = {7749766809258288148ull};
+/* "XYZ" */
+ccl_device_constant DeviceString u_xyz = {4957977063494975483ull};
+/* "xyY" */
+ccl_device_constant DeviceString u_xyy = {5138822319725660255ull};
+/* "sRGB" */
+ccl_device_constant DeviceString u_srgb = {15368599878474175032ull};
+/* "object:location" */
+ccl_device_constant DeviceString u_object_location = {7846190347358762897ull};
+/* "object:color" */
+ccl_device_constant DeviceString u_object_color = {12695623857059169556ull};
+/* "object:alpha" */
+ccl_device_constant DeviceString u_object_alpha = {11165053919428293151ull};
+/* "object:index" */
+ccl_device_constant DeviceString u_object_index = {6588325838217472556ull};
+/* "geom:dupli_generated" */
+ccl_device_constant DeviceString u_geom_dupli_generated = {6715607178003388908ull};
+/* "geom:dupli_uv" */
+ccl_device_constant DeviceString u_geom_dupli_uv = {1294253317490155849ull};
+/* "material:index" */
+ccl_device_constant DeviceString u_material_index = {741770758159634623ull};
+/* "object:random" */
+ccl_device_constant DeviceString u_object_random = {15789063994977955884ull};
+/* "particle:index" */
+ccl_device_constant DeviceString u_particle_index = {9489711748229903784ull};
+/* "particle:random" */
+ccl_device_constant DeviceString u_particle_random = {17993722202766855761ull};
+/* "particle:age" */
+ccl_device_constant DeviceString u_particle_age = {7380730644710951109ull};
+/* "particle:lifetime" */
+ccl_device_constant DeviceString u_particle_lifetime = {16576828923156200061ull};
+/* "particle:location" */
+ccl_device_constant DeviceString u_particle_location = {10309536211423573010ull};
+/* "particle:rotation" */
+ccl_device_constant DeviceString u_particle_rotation = {17858543768041168459ull};
+/* "particle:size" */
+ccl_device_constant DeviceString u_particle_size = {16461524249715420389ull};
+/* "particle:velocity" */
+ccl_device_constant DeviceString u_particle_velocity = {13199101248768308863ull};
+/* "particle:angular_velocity" */
+ccl_device_constant DeviceString u_particle_angular_velocity = {16327930120486517910ull};
+/* "geom:numpolyvertices" */
+ccl_device_constant DeviceString u_geom_numpolyvertices = {382043551489988826ull};
+/* "geom:trianglevertices" */
+ccl_device_constant DeviceString u_geom_trianglevertices = {17839267571524187074ull};
+/* "geom:polyvertices" */
+ccl_device_constant DeviceString u_geom_polyvertices = {1345577201967881769ull};
+/* "geom:name" */
+ccl_device_constant DeviceString u_geom_name = {13606338128269760050ull};
+/* "geom:undisplaced" */
+ccl_device_constant DeviceString u_geom_undisplaced = {12431586303019276305ull};
+/* "geom:is_smooth" */
+ccl_device_constant DeviceString u_is_smooth = {857544214094480123ull};
+/* "geom:is_curve" */
+ccl_device_constant DeviceString u_is_curve = {129742495633653138ull};
+/* "geom:curve_thickness" */
+ccl_device_constant DeviceString u_curve_thickness = {10605802038397633852ull};
+/* "geom:curve_length" */
+ccl_device_constant DeviceString u_curve_length = {11423459517663715453ull};
+/* "geom:curve_tangent_normal" */
+ccl_device_constant DeviceString u_curve_tangent_normal = {12301397394034985633ull};
+/* "geom:curve_random" */
+ccl_device_constant DeviceString u_curve_random = {15293085049960492358ull};
+/* "geom:is_point" */
+ccl_device_constant DeviceString u_is_point = {2511357849436175953ull};
+/* "geom:point_radius" */
+ccl_device_constant DeviceString u_point_radius = {9956381140398668479ull};
+/* "geom:point_position" */
+ccl_device_constant DeviceString u_point_position = {15684484280742966916ull};
+/* "geom:point_random" */
+ccl_device_constant DeviceString u_point_random = {5632627207092325544ull};
+/* "geom:normal_map_normal" */
+ccl_device_constant DeviceString u_normal_map_normal = {10718948685686827073};
+/* "path:ray_length" */
+ccl_device_constant DeviceString u_path_ray_length = {16391985802412544524ull};
+/* "path:ray_depth" */
+ccl_device_constant DeviceString u_path_ray_depth = {16643933224879500399ull};
+/* "path:diffuse_depth" */
+ccl_device_constant DeviceString u_path_diffuse_depth = {13191651286699118408ull};
+/* "path:glossy_depth" */
+ccl_device_constant DeviceString u_path_glossy_depth = {15717768399057252940ull};
+/* "path:transparent_depth" */
+ccl_device_constant DeviceString u_path_transparent_depth = {7821650266475578543ull};
+/* "path:transmission_depth" */
+ccl_device_constant DeviceString u_path_transmission_depth = {15113408892323917624ull};
+
+}  // namespace DeviceStrings
+
+/* Closure */
+
+ccl_device_extern ccl_private OSLClosure *osl_mul_closure_color(ccl_private ShaderGlobals *sg,
+                                                                ccl_private OSLClosure *a,
+                                                                ccl_private const float3 *weight)
+{
+  if (*weight == zero_float3() || !a) {
+    return nullptr;
+  }
+  else if (*weight == one_float3()) {
+    return a;
+  }
+
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+  /* Align pointer to closure struct requirement */
+  closure_pool = reinterpret_cast<uint8_t *>(
+      (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureMul) - 1) &
+      (-alignof(OSLClosureMul)));
+  sd->osl_closure_pool = closure_pool + sizeof(OSLClosureMul);
+
+  ccl_private OSLClosureMul *const closure = reinterpret_cast<ccl_private OSLClosureMul *>(
+      closure_pool);
+  closure->id = OSL_CLOSURE_MUL_ID;
+  closure->weight = *weight;
+  closure->closure = a;
+
+  return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_mul_closure_float(ccl_private ShaderGlobals *sg,
+                                                                ccl_private OSLClosure *a,
+                                                                float weight)
+{
+  if (weight == 0.0f || !a) {
+    return nullptr;
+  }
+  else if (weight == 1.0f) {
+    return a;
+  }
+
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  uint8_t *closure_pool = sd->osl_closure_pool;
+  /* Align pointer to closure struct requirement */
+  closure_pool = reinterpret_cast<uint8_t *>(
+      (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureMul) - 1) &
+      (-alignof(OSLClosureMul)));
+  sd->osl_closure_pool = closure_pool + sizeof(OSLClosureMul);
+
+  ccl_private OSLClosureMul *const closure = reinterpret_cast<ccl_private OSLClosureMul *>(
+      closure_pool);
+  closure->id = OSL_CLOSURE_MUL_ID;
+  closure->weight = make_float3(weight, weight, weight);
+  closure->closure = a;
+
+  return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_add_closure_closure(ccl_private ShaderGlobals *sg,
+                                                                  ccl_private OSLClosure *a,
+                                                                  ccl_private OSLClosure *b)
+{
+  if (!a) {
+    return b;
+  }
+  if (!b) {
+    return a;
+  }
+
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+  /* Align pointer to closure struct requirement */
+  closure_pool = reinterpret_cast<uint8_t *>(
+      (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureAdd) - 1) &
+      (-alignof(OSLClosureAdd)));
+  sd->osl_closure_pool = closure_pool + sizeof(OSLClosureAdd);
+
+  ccl_private OSLClosureAdd *const closure = reinterpret_cast<ccl_private OSLClosureAdd *>(
+      closure_pool);
+  closure->id = OSL_CLOSURE_ADD_ID;
+  closure->closureA = a;
+  closure->closureB = b;
+
+  return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_allocate_closure_component(
+    ccl_private ShaderGlobals *sg, int id, int size)
+{
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+  /* Align pointer to closure struct requirement */
+  closure_pool = reinterpret_cast<uint8_t *>(
+      (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureComponent) - 1) &
+      (-alignof(OSLClosureComponent)));
+  sd->osl_closure_pool = closure_pool + sizeof(OSLClosureComponent) + size;
+
+  ccl_private OSLClosureComponent *const closure =
+      reinterpret_cast<ccl_private OSLClosureComponent *>(closure_pool);
+  closure->id = static_cast<OSLClosureType>(id);
+  closure->weight = one_float3();
+
+  return closure;
+}
+
+ccl_device_extern ccl_private OSLClosure *osl_allocate_weighted_closure_component(
+    ccl_private ShaderGlobals *sg, int id, int size, ccl_private const float3 *weight)
+{
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+
+  ccl_private uint8_t *closure_pool = sd->osl_closure_pool;
+  /* Align pointer to closure struct requirement */
+  closure_pool = reinterpret_cast<uint8_t *>(
+      (reinterpret_cast<size_t>(closure_pool) + alignof(OSLClosureComponent) - 1) &
+      (-alignof(OSLClosureComponent)));
+  sd->osl_closure_pool = closure_pool + sizeof(OSLClosureComponent) + size;
+
+  ccl_private OSLClosureComponent *const closure =
+      reinterpret_cast<ccl_private OSLClosureComponent *>(closure_pool);
+  closure->id = static_cast<OSLClosureType>(id);
+  closure->weight = *weight;
+
+  return closure;
+}
+
+/* Utilities */
+
+#include "kernel/svm/math_util.h"
+#include "kernel/util/color.h"
+
+ccl_device_extern void osl_error(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+ccl_device_extern void osl_printf(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+ccl_device_extern void osl_warning(ccl_private ShaderGlobals *sg, const char *format, void *args)
+{
+}
+
+ccl_device_extern uint osl_range_check(int indexvalue,
+                                       int length,
+                                       DeviceString symname,
+                                       ccl_private ShaderGlobals *sg,
+                                       DeviceString sourcefile,
+                                       int sourceline,
+                                       DeviceString groupname,
+                                       int layer,
+                                       DeviceString layername,
+                                       DeviceString shadername)
+{
+  const int result = indexvalue < 0 ? 0 : indexvalue >= length ? length - 1 : indexvalue;
+#if 0
+  if (result != indexvalue) {
+    printf("Index [%d] out of range\n", indexvalue);
+  }
+#endif
+  return result;
+}
+
+ccl_device_extern uint osl_range_check_err(int indexvalue,
+                                           int length,
+                                           DeviceString symname,
+                                           ccl_private ShaderGlobals *sg,
+                                           DeviceString sourcefile,
+                                           int sourceline,
+                                           DeviceString groupname,
+                                           int layer,
+                                           DeviceString layername,
+                                           DeviceString shadername)
+{
+  return osl_range_check(indexvalue,
+                         length,
+                         symname,
+                         sg,
+                         sourcefile,
+                         sourceline,
+                         groupname,
+                         layer,
+                         layername,
+                         shadername);
+}
+
+/* Color Utilities */
+
+ccl_device_extern void osl_blackbody_vf(ccl_private ShaderGlobals *sg,
+                                        ccl_private float3 *result,
+                                        float temperature)
+{
+  float3 color_rgb = rec709_to_rgb(nullptr, svm_math_blackbody_color_rec709(temperature));
+  color_rgb = max(color_rgb, zero_float3());
+  *result = color_rgb;
+}
+
+#if 0
+ccl_device_extern void osl_wavelength_color_vf(ccl_private ShaderGlobals *sg,
+                                                   ccl_private float3 *result,
+                                                   float wavelength)
+{
+}
+#endif
+
+ccl_device_extern void osl_luminance_fv(ccl_private ShaderGlobals *sg,
+                                        ccl_private float *result,
+                                        ccl_private float3 *color)
+{
+  *result = linear_rgb_to_gray(nullptr, *color);
+}
+
+ccl_device_extern void osl_luminance_dfdv(ccl_private ShaderGlobals *sg,
+                                          ccl_private float *result,
+                                          ccl_private float3 *color)
+{
+  for (int i = 0; i < 3; ++i) {
+    osl_luminance_fv(sg, result + i, color + i);
+  }
+}
+
+ccl_device_extern void osl_prepend_color_from(ccl_private ShaderGlobals *sg,
+                                              ccl_private float3 *res,
+                                              DeviceString from)
+{
+  if (from == DeviceStrings::u_hsv) {
+    *res = hsv_to_rgb(*res);
+  }
+  else if (from == DeviceStrings::u_hsl) {
+    *res = hsl_to_rgb(*res);
+  }
+  else if (from == DeviceStrings::u_xyz) {
+    *res = xyz_to_rgb(nullptr, *res);
+  }
+  else if (from == DeviceStrings::u_xyy) {
+    *res = xyz_to_rgb(nullptr, xyY_to_xyz(res->x, res->y, res->z));
+  }
+}
+
+ccl_device_extern bool osl_transformc(ccl_private ShaderGlobals *sg,
+                                      ccl_private float3 *c_in,
+                                      int c_in_derivs,
+                                      ccl_private float3 *c_out,
+                                      int c_out_derivs,
+                                      DeviceString from,
+                                      DeviceString to)
+{
+  if (!c_out_derivs) {
+    c_in_derivs = false;
+  }
+  else if (!c_in_derivs) {
+    c_out[1] = zero_float3();
+    c_out[2] = zero_float3();
+  }
+
+  float3 rgb;
+
+  for (int i = 0; i < (c_in_derivs ? 3 : 1); ++i) {
+    if (from == DeviceStrings::u_hsv) {
+      rgb = hsv_to_rgb(c_in[i]);
+    }
+    else if (from == DeviceStrings::u_hsl) {
+      rgb = hsl_to_rgb(c_in[i]);
+    }
+    else if (from == DeviceStrings::u_xyz) {
+      rgb = xyz_to_rgb(nullptr, c_in[i]);
+    }
+    else if (from == DeviceStrings::u_xyy) {
+      rgb = xyz_to_rgb(nullptr, xyY_to_xyz(c_in[i].x, c_in[i].y, c_in[i].z));
+    }
+    else if (from == DeviceStrings::u_srgb) {
+      rgb = color_srgb_to_linear_v3(c_in[i]);
+    }
+    else {
+      rgb = c_in[i];
+    }
+
+    if (to == DeviceStrings::u_hsv) {
+      c_out[i] = rgb_to_hsv(rgb);
+    }
+    else if (to == DeviceStrings::u_hsl) {
+      c_out[i] = rgb_to_hsl(rgb);
+    }
+#if 0
+    else if (to == DeviceStrings::u_xyz) {
+      c_out[i] = rgb_to_xyz(nullptr, rgb);
+    }
+    else if (to == DeviceStrings::u_xyy) {
+      c_out[i] = xyz_to_xyY(rgb_to_xyz(nullptr, rgb));
+    }
+#endif
+    else if (to == DeviceStrings::u_srgb) {
+      c_out[i] = color_linear_to_srgb_v3(rgb);
+    }
+    else {
+      c_out[i] = rgb;
+    }
+  }
+}
+
+/* Matrix Utilities */
+
+#include "util/transform.h"
+
+ccl_device_forceinline void copy_matrix(ccl_private float *res, const Transform &tfm)
+{
+  res[0] = tfm.x.x;
+  res[1] = tfm.y.x;
+  res[2] = tfm.z.x;
+  res[3] = 0.0f;
+  res[4] = tfm.x.y;
+  res[5] = tfm.y.y;
+  res[6] = tfm.z.y;
+  res[7] = 0.0f;
+  res[8] = tfm.x.z;
+  res[9] = tfm.y.z;
+  res[10] = tfm.z.z;
+  res[11] = 0.0f;
+  res[12] = tfm.x.w;
+  res[13] = tfm.y.w;
+  res[14] = tfm.z.w;
+  res[15] = 1.0f;
+}
+ccl_device_forceinline void copy_matrix(ccl_private float *res, const ProjectionTransform &tfm)
+{
+  res[0] = tfm.x.x;
+  res[1] = tfm.y.x;
+  res[2] = tfm.z.x;
+  res[3] = tfm.w.x;
+  res[4] = tfm.x.y;
+  res[5] = tfm.y.y;
+  res[6] = tfm.z.y;
+  res[7] = tfm.w.y;
+  res[8] = tfm.x.z;
+  res[9] = tfm.y.z;
+  res[10] = tfm.z.z;
+  res[11] = tfm.w.z;
+  res[12] = tfm.x.w;
+  res[13] = tfm.y.w;
+  res[14] = tfm.z.w;
+  res[15] = tfm.w.w;
+}
+ccl_device_forceinline void copy_identity_matrix(ccl_private float *res)
+{
+  res[0] = 1.0f;
+  res[1] = 0.0f;
+  res[2] = 0.0f;
+  res[3] = 0.0f;
+  res[4] = 0.0f;
+  res[5] = 1.0f;
+  res[6] = 0.0f;
+  res[7] = 0.0f;
+  res[8] = 0.0f;
+  res[9] = 0.0f;
+  res[10] = 1.0f;
+  res[11] = 0.0f;
+  res[12] = 0.0f;
+  res[13] = 0.0f;
+  res[14] = 0.0f;
+  res[15] = 1.0f;
+}
+ccl_device_forceinline Transform convert_transform(ccl_private const float *m)
+{
+  return make_transform(
+      m[0], m[4], m[8], m[12], m[1], m[5], m[9], m[13], m[2], m[6], m[10], m[14]);
+}
+
+ccl_device_extern void osl_mul_mmm(ccl_private float *res,
+                                   ccl_private const float *a,
+                                   ccl_private const float *b)
+{
+  const Transform tfm_a = convert_transform(a);
+  const Transform tfm_b = convert_transform(b);
+  copy_matrix(res, tfm_a * tfm_b);
+}
+
+ccl_device_extern void osl_mul_mmf(ccl_private float *res, ccl_private const float *a, float b)
+{
+  for (int i = 0; i < 16; ++i) {
+    res[i] = a[i] * b;
+  }
+}
+
+ccl_device_extern void osl_div_mmm(ccl_private float *res,
+                                   ccl_private const float *a,
+                                   ccl_private const float *b)
+{
+  const Transform tfm_a = convert_transform(a);
+  const Transform tfm_b = convert_transform(b);
+  copy_matrix(res, tfm_a * transform_inverse(tfm_b));
+}
+
+ccl_device_extern void osl_div_mmf(ccl_private float *res, ccl_private const float *a, float b)
+{
+  for (int i = 0; i < 16; ++i) {
+    res[i] = a[i] / b;
+  }
+}
+
+ccl_device_extern void osl_div_mfm(ccl_private float *res, float a, ccl_private const float *b)
+{
+  const Transform tfm_b = convert_transform(b);
+  copy_matrix(res, transform_inverse(tfm_b));
+  for (int i = 0; i < 16; ++i) {
+    res[i] *= a;
+  }
+}
+
+ccl_device_extern void osl_div_m_ff(ccl_private float *res, float a, float b)
+{
+  float f = (b == 0) ? 0.0f : (a / b);
+  res[0] = f;
+  res[1] = 0.0f;
+  res[2] = 0.0f;
+  res[3] = 0.0f;
+  res[4] = 0.0f;
+  res[5] = f;
+  res[6] = 0.0f;
+  res[7] = 0.0f;
+  res[8] = 0.0f;
+  res[9] = 0.0f;
+  res[10] = f;
+  res[11] = 0.0f;
+  res[12] = 0.0f;
+  res[13] = 0.0f;
+  res[14] = 0.0f;
+  res[15] = f;
+}
+
+ccl_device_extern void osl_transform_vmv(ccl_private float3 *res,
+                                         ccl_private const float *m,
+                                         ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  *res = transform_point(&tfm_m, *v);
+}
+
+ccl_device_extern void osl_transform_dvmdv(ccl_private float3 *res,
+                                           ccl_private const float *m,
+                                           ccl_private const float3 *v)
+{
+  for (int i = 0; i < 3; ++i) {
+    const Transform tfm_m = convert_transform(m + i * 16);
+    res[i] = transform_point(&tfm_m, v[i]);
+  }
+}
+
+ccl_device_extern void osl_transformv_vmv(ccl_private float3 *res,
+                                          ccl_private const float *m,
+                                          ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  *res = transform_direction(&tfm_m, *v);
+}
+
+ccl_device_extern void osl_transformv_dvmdv(ccl_private float3 *res,
+                                            ccl_private const float *m,
+                                            ccl_private const float3 *v)
+{
+  for (int i = 0; i < 3; ++i) {
+    const Transform tfm_m = convert_transform(m + i * 16);
+    res[i] = transform_direction(&tfm_m, v[i]);
+  }
+}
+
+ccl_device_extern void osl_transformn_vmv(ccl_private float3 *res,
+                                          ccl_private const float *m,
+                                          ccl_private const float3 *v)
+{
+  const Transform tfm_m = convert_transform(m);
+  *res = transform_direction(&tfm_m, *v);
+}
+
+ccl_device_extern void osl_transformn_dvmdv(ccl_private float3 *res,
+                                            ccl_private const float *m,
+                                            ccl_private const float3 *v)
+{
+  for (int i = 0; i < 3; ++i) {
+    const Transform tfm_m = convert_transform(m + i * 16);
+    res[i] = transform_direction(&tfm_m, v[i]);
+  }
+}
+
+ccl_device_extern bool osl_get_matrix(ccl_private ShaderGlobals *sg,
+                                      ccl_private float *result,
+                                      DeviceString from)
+{
+  if (from == DeviceStrings::u_ndc) {
+    copy_matrix(result, kernel_data.cam.ndctoworld);
+    return true;
+  }
+  if (from == DeviceStrings::u_raster) {
+    copy_matrix(result, kernel_data.cam.rastertoworld);
+    return true;
+  }
+  if (from == DeviceStrings::u_screen) {
+    copy_matrix(result, kernel_data.cam.screentoworld);
+    return true;
+  }
+  if (from == DeviceStrings::u_camera) {
+    copy_matrix(result, kernel_data.cam.cameratoworld);
+    return true;
+  }
+  if (from == DeviceStrings::u_world) {
+    copy_identity_matrix(result);
+    return true;
+  }
+
+  return false;
+}
+
+ccl_device_extern bool osl_get_inverse_matrix(ccl_private ShaderGlobals *sg,
+                                              ccl_private float *res,
+                                              DeviceString to)
+{
+  if (to == DeviceStrings::u_ndc) {
+    copy_matrix(res, kernel_data.cam.worldtondc);
+    return true;
+  }
+  if (to == DeviceStrings::u_raster) {
+    copy_matrix(res, kernel_data.cam.worldtoraster);
+    return true;
+  }
+  if (to == DeviceStrings::u_screen) {
+    copy_matrix(res, kernel_data.cam.worldtoscreen);
+    return true;
+  }
+  if (to == DeviceStrings::u_camera) {
+    copy_matrix(res, kernel_data.cam.worldtocamera);
+    return true;
+  }
+  if (to == DeviceStrings::u_world) {
+    copy_identity_matrix(res);
+    return true;
+  }
+
+  return false;
+}
+
+ccl_device_extern bool osl_get_from_to_matrix(ccl_private ShaderGlobals *sg,
+                                              ccl_private float *res,
+                                              DeviceString from,
+                                              DeviceString to)
+{
+  float m_from[16], m_to[16];
+  if (osl_get_matrix(sg, m_from, from) && osl_get_inverse_matrix(sg, m_to, to)) {
+    osl_mul_mmm(res, m_from, m_to);
+    return true;
+  }
+
+  return false;
+}
+
+ccl_device_extern void osl_prepend_matrix_from(ccl_private ShaderGlobals *sg,
+                                               ccl_private float *res,
+                                               DeviceString from)
+{
+  float m[16];
+  if (osl_get_matrix(sg, m, from)) {
+    osl_mul_mmm(res, m, res);
+  }
+}
+
+ccl_device_extern bool osl_transform_triple(ccl_private ShaderGlobals *sg,
+                                            ccl_private float3 *p_in,
+                                            int p_in_derivs,
+                                            ccl_private float3 *p_out,
+                                            int p_out_derivs,
+                                            DeviceString from,
+                                            DeviceString to,
+                                            int vectype)
+{
+  if (!p_out_derivs) {
+    p_in_derivs = false;
+  }
+  else if (!p_in_derivs) {
+    p_out[1] = zero_float3();
+    p_out[2] = zero_float3();
+  }
+
+  bool res;
+  float m[16];
+
+  if (from == DeviceStrings::u_common) {
+    res = osl_get_inverse_matrix(sg, m, to);
+  }
+  else if (to == DeviceStrings::u_common) {
+    res = osl_get_matrix(sg, m, from);
+  }
+  else {
+    res = osl_get_from_to_matrix(sg, m, from, to);
+  }
+
+  if (res) {
+    if (vectype == 2 /* TypeDesc::POINT */) {
+      if (p_in_derivs)
+        osl_transform_dvmdv(p_out, m, p_in);
+      else
+        osl_transform_vmv(p_out, m, p_in);
+    }
+    else if (vectype == 3 /* TypeDesc::VECTOR */) {
+      if (p_in_derivs)
+        osl_transformv_dvmdv(p_out, m, p_in);
+      else
+        osl_transformv_vmv(p_out, m, p_in);
+    }
+    else if (vectype == 4 /* TypeDesc::NORMAL */) {
+      if (p_in_derivs)
+        osl_transformn_dvmdv(p_out, m, p_in);
+      else
+        osl_transformn_vmv(p_out, m, p_in);
+    }
+    else {
+      res = false;
+    }
+  }
+  else {
+    p_out[0] = p_in[0];
+    if (p_in_derivs) {
+      p_out[1] = p_in[1];
+      p_out[2] = p_in[2];
+    }
+  }
+
+  return res;
+}
+
+ccl_device_extern bool osl_transform_triple_nonlinear(ccl_private ShaderGlobals *sg,
+                                                      ccl_private float3 *p_in,
+                                                      int p_in_derivs,
+                                                      ccl_private float3 *p_out,
+                                                      int p_out_derivs,
+                                                      DeviceString from,
+                                                      DeviceString to,
+                                                      int vectype)
+{
+  return osl_transform_triple(sg, p_in, p_in_derivs, p_out, p_out_derivs, from, to, vectype);
+}
+
+ccl_device_extern void osl_transpose_mm(ccl_private float *res, ccl_private const float *m)
+{
+  copy_matrix(res, *reinterpret_cast<ccl_private const ProjectionTransform *>(m));
+}
+
+#if 0
+ccl_device_extern float osl_determinant_fm(ccl_private const float *m)
+{
+}
+#endif
+
+/* Attributes */
+
+#include "kernel/geom/geom.h"
+
+typedef long long TypeDesc;
+
+ccl_device_inline bool set_attribute_float(ccl_private float fval[3],
+                                           TypeDesc type,
+                                           bool derivatives,
+                                           ccl_private void *val)
+{
+  const unsigned char type_basetype = type & 0xF;
+  const unsigned char type_aggregate = (type >> 8) & 0xF;
+  const int type_arraylen = type >> 32;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+    if ((type_aggregate == 2 /* TypeDesc::VEC2 */) ||
+        (type_aggregate == 1 && type_arraylen == 2)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i];
+      }
+      return true;
+    }
+    if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+        (type_aggregate == 1 && type_arraylen == 3)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i];
+      }
+      return true;
+    }
+    if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+        (type_aggregate == 1 && type_arraylen == 4)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i];
+        static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+      }
+      return true;
+    }
+    if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i] = fval[i];
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+ccl_device_inline bool set_attribute_float(float f,
+                                           TypeDesc type,
+                                           bool derivatives,
+                                           ccl_private void *val)
+{
+  float fv[3];
+
+  fv[0] = f;
+  fv[1] = 0.0f;
+  fv[2] = 0.0f;
+
+  return set_attribute_float(fv, type, derivatives, val);
+}
+ccl_device_inline bool set_attribute_float2(ccl_private float2 fval[3],
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  const unsigned char type_basetype = type & 0xF;
+  const unsigned char type_aggregate = (type >> 8) & 0xF;
+  const int type_arraylen = type >> 32;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+    if ((type_aggregate == 2 /* TypeDesc::VEC2 */) ||
+        (type_aggregate == 1 && type_arraylen == 2)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 2 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 2 + 1] = fval[i].y;
+      }
+      return true;
+    }
+    if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+        (type_aggregate == 1 && type_arraylen == 3)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y;
+        static_cast<ccl_private float *>(val)[i * 3 + 2] = 0.0f;
+      }
+      return true;
+    }
+    if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+        (type_aggregate == 1 && type_arraylen == 4)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y;
+        static_cast<ccl_private float *>(val)[i * 4 + 2] = 0.0f;
+        static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+      }
+      return true;
+    }
+    if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i] = fval[i].x;
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+ccl_device_inline bool set_attribute_float3(ccl_private float3 fval[3],
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  const unsigned char type_basetype = type & 0xF;
+  const unsigned char type_aggregate = (type >> 8) & 0xF;
+  const int type_arraylen = type >> 32;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+    if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+        (type_aggregate == 1 && type_arraylen == 3)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y;
+        static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i].z;
+      }
+      return true;
+    }
+    if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+        (type_aggregate == 1 && type_arraylen == 4)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y;
+        static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i].z;
+        static_cast<ccl_private float *>(val)[i * 4 + 3] = 1.0f;
+      }
+      return true;
+    }
+    if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i] = average(fval[i]);
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+ccl_device_inline bool set_attribute_float3(float3 f,
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  float3 fv[3];
+
+  fv[0] = f;
+  fv[1] = make_float3(0.0f, 0.0f, 0.0f);
+  fv[2] = make_float3(0.0f, 0.0f, 0.0f);
+
+  return set_attribute_float3(fv, type, derivatives, val);
+}
+ccl_device_inline bool set_attribute_float4(ccl_private float4 fval[3],
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  const unsigned char type_basetype = type & 0xF;
+  const unsigned char type_aggregate = (type >> 8) & 0xF;
+  const int type_arraylen = type >> 32;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */) {
+    if ((type_aggregate == 3 /* TypeDesc::VEC3 */) ||
+        (type_aggregate == 1 && type_arraylen == 3)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 3 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 3 + 1] = fval[i].y;
+        static_cast<ccl_private float *>(val)[i * 3 + 2] = fval[i].z;
+      }
+      return true;
+    }
+    if ((type_aggregate == 4 /* TypeDesc::VEC4 */) ||
+        (type_aggregate == 1 && type_arraylen == 4)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i * 4 + 0] = fval[i].x;
+        static_cast<ccl_private float *>(val)[i * 4 + 1] = fval[i].y;
+        static_cast<ccl_private float *>(val)[i * 4 + 2] = fval[i].z;
+        static_cast<ccl_private float *>(val)[i * 4 + 3] = fval[i].w;
+      }
+      return true;
+    }
+    if ((type_aggregate == 1 /* TypeDesc::SCALAR */)) {
+      for (int i = 0; i < (derivatives ? 3 : 1); ++i) {
+        static_cast<ccl_private float *>(val)[i] = average(float4_to_float3(fval[i]));
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+ccl_device_inline bool set_attribute_matrix(ccl_private const Transform &tfm,
+                                            TypeDesc type,
+                                            ccl_private void *val)
+{
+  const unsigned char type_basetype = type & 0xF;
+  const unsigned char type_aggregate = (type >> 8) & 0xF;
+
+  if (type_basetype == 11 /* TypeDesc::FLOAT */ && type_aggregate == 16 /* TypeDesc::MATRIX44 */) {
+    copy_matrix(static_cast<ccl_private float *>(val), tfm);
+    return true;
+  }
+
+  return false;
+}
+
+ccl_device_inline bool get_background_attribute(KernelGlobals kg,
+                                                ccl_private ShaderData *sd,
+                                                DeviceString name,
+                                                TypeDesc type,
+                                                bool derivatives,
+                                                ccl_private void *val)
+{
+  if (name == DeviceStrings::u_path_ray_length) {
+    /* Ray Length */
+    float f = sd->ray_length;
+    return set_attribute_float(f, type, derivatives, val);
+  }
+
+  return false;
+}
+
+ccl_device_inline bool get_object_attribute(KernelGlobals kg,
+                                            ccl_private ShaderData *sd,
+                                            const AttributeDescriptor &desc,
+                                            TypeDesc type,
+                                            bool derivatives,
+                                            ccl_private void *val)
+{
+  if (desc.type == NODE_ATTR_FLOAT) {
+    float fval[3];
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc))
+      fval[0] = primitive_volume_attribute_float(kg, sd, desc);
+    else
+#endif
+      fval[0] = primitive_surface_attribute_float(
+          kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    return set_attribute_float(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_FLOAT2) {
+    float2 fval[3];
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc))
+      return false;
+    else
+#endif
+      fval[0] = primitive_surface_attribute_float2(
+          kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    return set_attribute_float2(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_FLOAT3) {
+    float3 fval[3];
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc))
+      fval[0] = primitive_volume_attribute_float3(kg, sd, desc);
+    else
+#endif
+      fval[0] = primitive_surface_attribute_float3(
+          kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    return set_attribute_float3(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_FLOAT4 || desc.type == NODE_ATTR_RGBA) {
+    float4 fval[3];
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc))
+      fval[0] = primitive_volume_attribute_float4(kg, sd, desc);
+    else
+#endif
+      fval[0] = primitive_surface_attribute_float4(
+          kg, sd, desc, derivatives ? &fval[1] : nullptr, derivatives ? &fval[2] : nullptr);
+    return set_attribute_float4(fval, type, derivatives, val);
+  }
+  else if (desc.type == NODE_ATTR_MATRIX) {
+    Transform tfm = primitive_attribute_matrix(kg, desc);
+    return set_attribute_matrix(tfm, type, val);
+  }
+
+  return false;
+}
+
+ccl_device_inline bool get_object_standard_attribute(KernelGlobals kg,
+                                                     ccl_private ShaderData *sd,
+                                                     DeviceString name,
+                                                     TypeDesc type,
+                                                     bool derivatives,
+                                                     ccl_private void *val)
+{
+  /* Object attributes */
+  if (name == DeviceStrings::u_object_location) {
+    float3 f = object_location(kg, sd);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_object_color) {
+    float3 f = object_color(kg, sd->object);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_object_alpha) {
+    float f = object_alpha(kg, sd->object);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_object_index) {
+    float f = object_pass_id(kg, sd->object);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_geom_dupli_generated) {
+    float3 f = object_dupli_generated(kg, sd->object);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_geom_dupli_uv) {
+    float3 f = object_dupli_uv(kg, sd->object);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_material_index) {
+    float f = shader_pass_id(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_object_random) {
+    float f = object_random_number(kg, sd->object);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+
+  /* Particle attributes */
+  else if (name == DeviceStrings::u_particle_index) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float f = particle_index(kg, particle_id);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_particle_random) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float f = hash_uint2_to_float(particle_index(kg, particle_id), 0);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+
+  else if (name == DeviceStrings::u_particle_age) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float f = particle_age(kg, particle_id);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_particle_lifetime) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float f = particle_lifetime(kg, particle_id);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_particle_location) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float3 f = particle_location(kg, particle_id);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+#if 0 /* unsupported */
+  else if (name == DeviceStrings::u_particle_rotation) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float4 f = particle_rotation(kg, particle_id);
+    return set_attribute_float4(f, type, derivatives, val);
+  }
+#endif
+  else if (name == DeviceStrings::u_particle_size) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float f = particle_size(kg, particle_id);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_particle_velocity) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float3 f = particle_velocity(kg, particle_id);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_particle_angular_velocity) {
+    int particle_id = object_particle_id(kg, sd->object);
+    float3 f = particle_angular_velocity(kg, particle_id);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+
+  /* Geometry attributes */
+#if 0 /* TODO */
+  else if (name == DeviceStrings::u_geom_numpolyvertices) {
+    return false;
+  }
+  else if (name == DeviceStrings::u_geom_trianglevertices ||
+            name == DeviceStrings::u_geom_polyvertices) {
+    return false;
+  }
+  else if (name == DeviceStrings::u_geom_name) {
+    return false;
+  }
+#endif
+  else if (name == DeviceStrings::u_is_smooth) {
+    float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+
+#ifdef __HAIR__
+  /* Hair attributes */
+  else if (name == DeviceStrings::u_is_curve) {
+    float f = (sd->type & PRIMITIVE_CURVE) != 0;
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_curve_thickness) {
+    float f = curve_thickness(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_curve_tangent_normal) {
+    float3 f = curve_tangent_normal(kg, sd);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_curve_random) {
+    float f = curve_random(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+#endif
+
+#ifdef __POINTCLOUD__
+  /* Point attributes */
+  else if (name == DeviceStrings::u_is_point) {
+    float f = (sd->type & PRIMITIVE_POINT) != 0;
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_point_radius) {
+    float f = point_radius(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_point_position) {
+    float3 f = point_position(kg, sd);
+    return set_attribute_float3(f, type, derivatives, val);
+  }
+  else if (name == DeviceStrings::u_point_random) {
+    float f = point_random(kg, sd);
+    return set_attribute_float(f, type, derivatives, val);
+  }
+#endif
+
+  else if (name == DeviceStrings::u_normal_map_normal) {
+    if (sd->type & PRIMITIVE_TRIANGLE) {
+      float3 f = triangle_smooth_normal_unnormalized(kg, sd, sd->Ng, sd->prim, sd->u, sd->v);
+      return set_attribute_float3(f, type, derivatives, val);
+    }
+    else {
+      return false;
+    }
+  }
+
+  return get_background_attribute(kg, sd, name, type, derivatives, val);
+}
+
+ccl_device_extern bool osl_get_attribute(ccl_private ShaderGlobals *sg,
+                                         int derivatives,
+                                         DeviceString object_name,
+                                         DeviceString name,
+                                         int array_lookup,
+                                         int index,
+                                         TypeDesc type,
+                                         ccl_private void *res)
+{
+  KernelGlobals kg = nullptr;
+  ccl_private ShaderData *const sd = static_cast<ccl_private ShaderData *>(sg->renderstate);
+  int object;
+
+  if (object_name != DeviceStrings::_emptystring_) {
+    /* TODO: Get object index from name */
+    return false;
+  }
+  else {
+    object = sd->object;
+  }
+
+  const uint64_t id = name.hash();
+
+  const AttributeDescriptor desc = find_attribute(kg, object, sd->prim, sd->type, id);
+  if (desc.offset != ATTR_STD_NOT_FOUND) {
+    return get_object_attribute(kg, sd, desc, type, derivatives, res);
+  }
+  else {
+    return get_object_standard_attribute(kg, sd, name, type, derivatives, res);
+  }
+}
+
+#if 0
+ccl_device_extern bool osl_bind_interpolated_param(ccl_private ShaderGlobals *sg,
+                                                       DeviceString name,
+                                                       long long type,
+                                                       int userdata_has_derivs,
+                                                       ccl_private void *userdata_data,
+                                                       int symbol_has_derivs,
+                                                       ccl_private void *symbol_data,
+                                                       int symbol_data_size,
+                                                       ccl_private void *userdata_initialized,
+                                                       int userdata_index)
+{
+  return false;
+}
+#endif
+
+/* Noise */
+
+#include "kernel/svm/noise.h"
+#include "util/hash.h"
+
+ccl_device_extern uint osl_hash_ii(int x)
+{
+  return hash_uint(x);
+}
+
+ccl_device_extern uint osl_hash_if(float x)
+{
+  return hash_uint(__float_as_uint(x));
+}
+
+ccl_device_extern uint osl_hash_iff(float x, float y)
+{
+  return hash_uint2(__float_as_uint(x), __float_as_uint(y));
+}
+
+ccl_device_extern uint osl_hash_iv(ccl_private const float3 *v)
+{
+  return hash_uint3(__float_as_uint(v->x), __float_as_uint(v->y), __float_as_uint(v->z));
+}
+
+ccl_device_extern uint osl_hash_ivf(ccl_private const float3 *v, float w)
+{
+  return hash_uint4(
+      __float_as_uint(v->x), __float_as_uint(v->y), __float_as_uint(v->z), __float_as_uint(w));
+}
+
+ccl_device_extern OSLNoiseOptions *osl_get_noise_options(ccl_private ShaderGlobals *sg)
+{
+  return nullptr;
+}
+
+ccl_device_extern void osl_noiseparams_set_anisotropic(ccl_private OSLNoiseOptions *opt,
+                                                       int anisotropic)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_do_filter(ccl_private OSLNoiseOptions *opt,
+                                                     int do_filter)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_direction(ccl_private OSLNoiseOptions *opt,
+                                                     float3 *direction)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_bandwidth(ccl_private OSLNoiseOptions *opt,
+                                                     float bandwidth)
+{
+}
+
+ccl_device_extern void osl_noiseparams_set_impulses(ccl_private OSLNoiseOptions *opt,
+                                                    float impulses)
+{
+}
+
+#define OSL_NOISE_IMPL(name, op) \
+  ccl_device_extern float name##_ff(float x) \
+  { \
+    return op##_1d(x); \
+  } \
+  ccl_device_extern float name##_fff(float x, float y) \
+  { \
+    return op##_2d(make_float2(x, y)); \
+  } \
+  ccl_device_extern float name##_fv(ccl_private const float3 *v) \
+  { \
+    return op##_3d(*v); \
+  } \
+  ccl_device_extern float name##_fvf(ccl_private const float3 *v, float w) \
+  { \
+    return op##_4d(make_float4(v->x, v->y, v->z, w)); \
+  } \
+  ccl_device_extern void name##_vf(ccl_private float3 *res, float x) \
+  { \
+    /* TODO: This is not correct. Really need to change the hash function inside the noise \
+     * function to spit out a vector instead of a scalar. */ \
+    const float n = name##_ff(x); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  } \
+  ccl_device_extern void name##_vff(ccl_private float3 *res, float x, float y) \
+  { \
+    const float n = name##_fff(x, y); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  } \
+  ccl_device_extern void name##_vv(ccl_private float3 *res, const float3 *v) \
+  { \
+    const float n = name##_fv(v); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  } \
+  ccl_device_extern void name##_vvf(ccl_private float3 *res, const float3 *v, float w) \
+  { \
+    const float n = name##_fvf(v, w); \
+    res->x = n; \
+    res->y = n; \
+    res->z = n; \
+  }
+
+ccl_device_forceinline float hashnoise_1d(float p)
+{
+  const uint x = __float_as_uint(p);
+  return hash_uint(x) / static_cast<float>(~0u);
+}
+ccl_device_forceinline float hashnoise_2d(float2 p)
+{
+  const uint x = __float_as_uint(p.x);
+  const uint y = __float_as_uint(p.y);
+  return hash_uint2(x, y) / static_cast<float>(~0u);
+}
+ccl_device_forceinline float hashnoise_3d(float3 p)
+{
+  const uint x = __float_as_uint(p.x);
+  const uint y = __float_as_uint(p.y);
+  const uint z = __float_as_uint(p.z);
+  return hash_uint3(x, y, z) / static_cast<float>(~0u);
+}
+ccl_device_forceinline float hashnoise_4d(float4 p)
+{
+  const uint x = __float_as_uint(p.x);
+  const uint y = __float_as_uint(p.y);
+  const uint z = __float_as_uint(p.z);
+  const uint w = __float_as_uint(p.w);
+  return hash_uint4(x, y, z, w) / static_cast<float>(~0u);
+}
+
+/* TODO: Implement all noise functions */
+OSL_NOISE_IMPL(osl_hashnoise, hashnoise)
+OSL_NOISE_IMPL(osl_noise, noise)
+OSL_NOISE_IMPL(osl_snoise, snoise)
+
+/* Texturing */
+
+ccl_device_extern ccl_private OSLTextureOptions *osl_get_texture_options(
+    ccl_private ShaderGlobals *sg)
+{
+  return nullptr;
+}
+
+ccl_device_extern void osl_texture_set_firstchannel(ccl_private OSLTextureOptions *opt,
+                                                    int firstchannel)
+{
+}
+
+ccl_device_extern void osl_texture_set_swrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_twrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_rwrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_stwrap_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_sblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_tblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_rblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_stblur(ccl_private OSLTextureOptions *opt, float blur)
+{
+}
+
+ccl_device_extern void osl_texture_set_swidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_twidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_rwidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_stwidth(ccl_private OSLTextureOptions *opt, float width)
+{
+}
+
+ccl_device_extern void osl_texture_set_fill(ccl_private OSLTextureOptions *opt, float fill)
+{
+}
+
+ccl_device_extern void osl_texture_set_time(ccl_private OSLTextureOptions *opt, float time)
+{
+}
+
+ccl_device_extern void osl_texture_set_interp_code(ccl_private OSLTextureOptions *opt, int mode)
+{
+}
+
+ccl_device_extern void osl_texture_set_subimage(ccl_private OSLTextureOptions *opt, int subimage)
+{
+}
+
+ccl_device_extern void osl_texture_set_missingcolor_arena(ccl_private OSLTextureOptions *opt,
+                                                          ccl_private float3 *color)
+{
+}
+
+ccl_device_extern void osl_texture_set_missingcolor_alpha(ccl_private OSLTextureOptions *opt,
+                                                          int nchannels,
+                                                          float alpha)
+{
+}
+
+ccl_device_extern bool osl_texture(ccl_private ShaderGlobals *sg,
+                                   DeviceString filename,
+                                   ccl_private void *texture_handle,
+                                   OSLTextureOptions *opt,
+                                   float s,
+                                   float t,
+                                   float dsdx,
+                                   float dtdx,
+                                   float dsdy,
+                                   float dtdy,
+                                   int nchannels,
+                                   ccl_private float *result,
+                                   ccl_private float *dresultdx,
+                                   ccl_private float *dresultdy,
+                                   ccl_private float *alpha,
+                                   ccl_private float *dalphadx,
+                                   ccl_private float *dalphady,
+                                   ccl_private void *errormessage)
+{
+  if (!texture_handle) {
+    return false;
+  }
+
+  /* Only SVM textures are supported. */
+  int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1);
+
+  const float4 rgba = kernel_tex_image_interp(nullptr, id, s, 1.0f - t);
+
+  result[0] = rgba.x;
+  if (nchannels > 1)
+    result[1] = rgba.y;
+  if (nchannels > 2)
+    result[2] = rgba.z;
+  if (nchannels > 3)
+    result[3] = rgba.w;
+
+  return true;
+}
+
+ccl_device_extern bool osl_texture3d(ccl_private ShaderGlobals *sg,
+                                     DeviceString filename,
+                                     ccl_private void *texture_handle,
+                                     OSLTextureOptions *opt,
+                                     ccl_private const float3 *P,
+                                     ccl_private const float3 *dPdx,
+                                     ccl_private const float3 *dPdy,
+                                     ccl_private const float3 *dPdz,
+                                     int nchannels,
+                                     ccl_private float *result,
+                                     ccl_private float *dresultds,
+                                     ccl_private float *dresultdt,
+                                     ccl_private float *alpha,
+                                     ccl_private float *dalphadx,
+                                     ccl_private float *dalphady,
+                                     ccl_private void *errormessage)
+{
+  if (!texture_handle) {
+    return false;
+  }
+
+  /* Only SVM textures are supported. */
+  int id = static_cast<int>(reinterpret_cast<size_t>(texture_handle) - 1);
+
+  const float4 rgba = kernel_tex_image_interp_3d(nullptr, id, *P, INTERPOLATION_NONE);
+
+  result[0] = rgba.x;
+  if (nchannels > 1)
+    result[1] = rgba.y;
+  if (nchannels > 2)
+    result[2] = rgba.z;
+  if (nchannels > 3)
+    result[3] = rgba.w;
+
+  return true;
+}
+
+ccl_device_extern bool osl_environment(ccl_private ShaderGlobals *sg,
+                                       DeviceString filename,
+                                       ccl_private void *texture_handle,
+                                       OSLTextureOptions *opt,
+                                       ccl_private const float3 *R,
+                                       ccl_private const float3 *dRdx,
+                                       ccl_private const float3 *dRdy,
+                                       int nchannels,
+                                       ccl_private float *result,
+                                       ccl_private float *dresultds,
+                                       ccl_private float *dresultdt,
+                                       ccl_private float *alpha,
+                                       ccl_private float *dalphax,
+                                       ccl_private float *dalphay,
+                                       ccl_private void *errormessage)
+{
+  result[0] = 1.0f;
+  if (nchannels > 1)
+    result[1] = 0.0f;
+  if (nchannels > 2)
+    result[2] = 1.0f;
+  if (nchannels > 3)
+    result[3] = 1.0f;
+
+  return false;
+}
+
+ccl_device_extern bool osl_get_textureinfo(ccl_private ShaderGlobals *sg,
+                                           DeviceString filename,
+                                           ccl_private void *texture_handle,
+                                           DeviceString dataname,
+                                           int basetype,
+                                           int arraylen,
+                                           int aggegrate,
+                                           ccl_private void *data,
+                                           ccl_private void *errormessage)
+{
+  return false;
+}
+
+ccl_device_extern bool osl_get_textureinfo_st(ccl_private ShaderGlobals *sg,
+                                              DeviceString filename,
+                                              ccl_private void *texture_handle,
+                                              float s,
+                                              float t,
+                                              DeviceString dataname,
+                                              int basetype,
+                                              int arraylen,
+                                              int aggegrate,
+                                              ccl_private void *data,
+                                              ccl_private void *errormessage)
+{
+  return osl_get_textureinfo(
+      sg, filename, texture_handle, dataname, basetype, arraylen, aggegrate, data, errormessage);
+}
+
+/* Standard library */
+
+#define OSL_OP_IMPL_II(name, op) \
+  ccl_device_extern int name##_ii(int a) \
+  { \
+    return op(a); \
+  }
+#define OSL_OP_IMPL_IF(name, op) \
+  ccl_device_extern int name##_if(float a) \
+  { \
+    return op(a); \
+  }
+#define OSL_OP_IMPL_FF(name, op) \
+  ccl_device_extern float name##_ff(float a) \
+  { \
+    return op(a); \
+  }
+#define OSL_OP_IMPL_DFDF(name, op) \
+  ccl_device_extern void name##_dfdf(ccl_private float *res, ccl_private const float *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDV(name, op) \
+  ccl_device_extern void name##_dfdv(ccl_private float *res, ccl_private const float3 *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_FV(name, op) \
+  ccl_device_extern float name##_fv(ccl_private const float3 *a) \
+  { \
+    return op(*a); \
+  }
+#define OSL_OP_IMPL_VV(name, op) \
+  ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    *res = op(*a); \
+  }
+#define OSL_OP_IMPL_VV_(name, op) \
+  ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    res->x = op(a->x); \
+    res->y = op(a->y); \
+    res->z = op(a->z); \
+  }
+#define OSL_OP_IMPL_DVDV(name, op) \
+  ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDV_(name, op) \
+  ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *a) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x); \
+      res[i].y = op(a[i].y); \
+      res[i].z = op(a[i].z); \
+    } \
+  }
+
+#define OSL_OP_IMPL_III(name, op) \
+  ccl_device_extern int name##_iii(int a, int b) \
+  { \
+    return op(a, b); \
+  }
+#define OSL_OP_IMPL_FFF(name, op) \
+  ccl_device_extern float name##_fff(float a, float b) \
+  { \
+    return op(a, b); \
+  }
+#define OSL_OP_IMPL_FVV(name, op) \
+  ccl_device_extern float name##_fvv(ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    return op(*a, *b); \
+  }
+#define OSL_OP_IMPL_DFFDF(name, op) \
+  ccl_device_extern void name##_dffdf( \
+      ccl_private float *res, float a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFF(name, op) \
+  ccl_device_extern void name##_dfdff( \
+      ccl_private float *res, ccl_private const float *a, float b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFDF(name, op) \
+  ccl_device_extern void name##_dfdfdf( \
+      ccl_private float *res, ccl_private const float *a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFVDV(name, op) \
+  ccl_device_extern void name##_dfvdv( \
+      ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[0], b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDVV(name, op) \
+  ccl_device_extern void name##_dfdvv( \
+      ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[0]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDVDV(name, op) \
+  ccl_device_extern void name##_dfdvdv( \
+      ccl_private float *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_VVF_(name, op) \
+  ccl_device_extern void name##_vvf( \
+      ccl_private float3 *res, ccl_private const float3 *a, float b) \
+  { \
+    res->x = op(a->x, b); \
+    res->y = op(a->y, b); \
+    res->z = op(a->z, b); \
+  }
+#define OSL_OP_IMPL_VVV(name, op) \
+  ccl_device_extern void name##_vvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    *res = op(*a, *b); \
+  }
+#define OSL_OP_IMPL_VVV_(name, op) \
+  ccl_device_extern void name##_vvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    res->x = op(a->x, b->x); \
+    res->y = op(a->y, b->y); \
+    res->z = op(a->z, b->z); \
+  }
+#define OSL_OP_IMPL_DVVDF_(name, op) \
+  ccl_device_extern void name##_dvvdf( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[0].x, b[i]); \
+      res[i].y = op(a[0].y, b[i]); \
+      res[i].z = op(a[0].z, b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVF_(name, op) \
+  ccl_device_extern void name##_dvdvf( \
+      ccl_private float3 *res, ccl_private const float3 *a, float b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b); \
+      res[i].y = op(a[i].y, b); \
+      res[i].z = op(a[i].z, b); \
+    } \
+  }
+#define OSL_OP_IMPL_DVVDV(name, op) \
+  ccl_device_extern void name##_dvvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[0], b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVVDV_(name, op) \
+  ccl_device_extern void name##_dvvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[0].x, b[i].x); \
+      res[i].y = op(a[0].y, b[i].y); \
+      res[i].z = op(a[0].z, b[i].z); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVV(name, op) \
+  ccl_device_extern void name##_dvdvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[0]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVV_(name, op) \
+  ccl_device_extern void name##_dvdvv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b[0].x); \
+      res[i].y = op(a[i].y, b[0].y); \
+      res[i].z = op(a[i].z, b[0].z); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVDF_(name, op) \
+  ccl_device_extern void name##_dvdvdf( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b[i]); \
+      res[i].y = op(a[i].y, b[i]); \
+      res[i].z = op(a[i].z, b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVDV(name, op) \
+  ccl_device_extern void name##_dvdvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DVDVDV_(name, op) \
+  ccl_device_extern void name##_dvdvdv( \
+      ccl_private float3 *res, ccl_private const float3 *a, ccl_private const float3 *b) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i].x = op(a[i].x, b[i].x); \
+      res[i].y = op(a[i].y, b[i].y); \
+      res[i].z = op(a[i].z, b[i].z); \
+    } \
+  }
+
+#define OSL_OP_IMPL_FFFF(name, op) \
+  ccl_device_extern float name##_ffff(float a, float b, float c) \
+  { \
+    return op(a, b, c); \
+  }
+#define OSL_OP_IMPL_DFFFDF(name, op) \
+  ccl_device_extern void name##_dfffdf( \
+      ccl_private float *res, float a, float b, ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b, c[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFFDFF(name, op) \
+  ccl_device_extern void name##_dffdff( \
+      ccl_private float *res, float a, ccl_private const float *b, float c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b[i], c); \
+    } \
+  }
+#define OSL_OP_IMPL_DFFDFDF(name, op) \
+  ccl_device_extern void name##_dffdfdf( \
+      ccl_private float *res, float a, ccl_private const float *b, ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a, b[i], c[i]); \
+    } \
+  }
+
+#define OSL_OP_IMPL_DFDFFF(name, op) \
+  ccl_device_extern void name##_dfdfff( \
+      ccl_private float *res, ccl_private const float *a, float b, float c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b, c); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFFDF(name, op) \
+  ccl_device_extern void name##_dfdffdf( \
+      ccl_private float *res, ccl_private const float *a, float b, ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b, c[i]); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFDFF(name, op) \
+  ccl_device_extern void name##_dfdfdff( \
+      ccl_private float *res, ccl_private const float *a, ccl_private const float *b, float c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i], c); \
+    } \
+  }
+#define OSL_OP_IMPL_DFDFDFDF(name, op) \
+  ccl_device_extern void name##_dfdfdfdf(ccl_private float *res, \
+                                         ccl_private const float *a, \
+                                         ccl_private const float *b, \
+                                         ccl_private const float *c) \
+  { \
+    for (int i = 0; i < 3; ++i) { \
+      res[i] = op(a[i], b[i], c[i]); \
+    } \
+  }
+
+#define OSL_OP_IMPL_XX(name, op) \
+  OSL_OP_IMPL_FF(name, op) \
+  OSL_OP_IMPL_DFDF(name, op) \
+  OSL_OP_IMPL_VV_(name, op) \
+  OSL_OP_IMPL_DVDV_(name, op)
+
+#define OSL_OP_IMPL_XXX(name, op) \
+  OSL_OP_IMPL_FFF(name, op) \
+  OSL_OP_IMPL_DFFDF(name, op) \
+  OSL_OP_IMPL_DFDFF(name, op) \
+  OSL_OP_IMPL_DFDFDF(name, op) \
+  OSL_OP_IMPL_VVV_(name, op) \
+  OSL_OP_IMPL_DVVDV_(name, op) \
+  OSL_OP_IMPL_DVDVV_(name, op) \
+  OSL_OP_IMPL_DVDVDV_(name, op)
+
+OSL_OP_IMPL_XX(osl_acos, acosf)
+OSL_OP_IMPL_XX(osl_asin, asinf)
+OSL_OP_IMPL_XX(osl_atan, atanf)
+OSL_OP_IMPL_XXX(osl_atan2, atan2f)
+OSL_OP_IMPL_XX(osl_cos, cosf)
+OSL_OP_IMPL_XX(osl_sin, sinf)
+OSL_OP_IMPL_XX(osl_tan, tanf)
+OSL_OP_IMPL_XX(osl_cosh, coshf)
+OSL_OP_IMPL_XX(osl_sinh, sinhf)
+OSL_OP_IMPL_XX(osl_tanh, tanhf)
+
+ccl_device_forceinline int safe_divide(int a, int b)
+{
+  return (b != 0) ? a / b : 0;
+}
+ccl_device_forceinline int safe_modulo(int a, int b)
+{
+  return (b != 0) ? a % b : 0;
+}
+
+OSL_OP_IMPL_III(osl_safe_div, safe_divide)
+OSL_OP_IMPL_FFF(osl_safe_div, safe_divide)
+OSL_OP_IMPL_III(osl_safe_mod, safe_modulo)
+
+ccl_device_extern void osl_sincos_fff(float a, ccl_private float *b, ccl_private float *c)
+{
+  sincos(a, b, c);
+}
+ccl_device_extern void osl_sincos_dfdff(ccl_private const float *a,
+                                        ccl_private float *b,
+                                        ccl_private float *c)
+{
+  for (int i = 0; i < 3; ++i)
+    sincos(a[i], b + i, c);
+}
+ccl_device_extern void osl_sincos_dffdf(ccl_private const float *a,
+                                        ccl_private float *b,
+                                        ccl_private float *c)
+{
+  for (int i = 0; i < 3; ++i)
+    sincos(a[i], b, c + i);
+}
+ccl_device_extern void osl_sincos_dfdfdf(ccl_private const float *a,
+                                         ccl_private float *b,
+                                         ccl_private float *c)
+{
+  for (int i = 0; i < 3; ++i)
+    sincos(a[i], b + i, c + i);
+}
+ccl_device_extern void osl_sincos_vvv(ccl_private const float3 *a,
+                                      ccl_private float3 *b,
+                                      ccl_private float3 *c)
+{
+  sincos(a->x, &b->x, &c->x);
+  sincos(a->y, &b->y, &c->y);
+  sincos(a->z, &b->z, &c->z);
+}
+ccl_device_extern void osl_sincos_dvdvv(ccl_private const float3 *a,
+                                        ccl_private float3 *b,
+                                        ccl_private float3 *c)
+{
+  for (int i = 0; i < 3; ++i) {
+    sincos(a[i].x, &b[i].x, &c->x);
+    sincos(a[i].y, &b[i].y, &c->y);
+    sincos(a[i].z, &b[i].z, &c->z);
+  }
+}
+ccl_device_extern void osl_sincos_dvvdv(ccl_private const float3 *a,
+                                        ccl_private float3 *b,
+                                        ccl_private float3 *c)
+{
+  for (int i = 0; i < 3; ++i) {
+    sincos(a[i].x, &b->x, &c[i].x);
+    sincos(a[i].y, &b->y, &c[i].y);
+    sincos(a[i].z, &b->z, &c[i].z);
+  }
+}
+ccl_device_extern void osl_sincos_dvdvdv(ccl_private const float3 *a,
+                                         ccl_private float3 *b,
+                                         ccl_private float3 *c)
+{
+  for (int i = 0; i < 3; ++i) {
+    sincos(a[i].x, &b[i].x, &c[i].x);
+    sincos(a[i].y, &b[i].y, &c[i].y);
+    sincos(a[i].z, &b[i].z, &c[i].z);
+  }
+}
+
+OSL_OP_IMPL_XX(osl_log, logf)
+OSL_OP_IMPL_XX(osl_log2, log2f)
+OSL_OP_IMPL_XX(osl_log10, log10f)
+OSL_OP_IMPL_XX(osl_exp, expf)
+OSL_OP_IMPL_XX(osl_exp2, exp2f)
+OSL_OP_IMPL_XX(osl_expm1, expm1f)
+OSL_OP_IMPL_XX(osl_erf, erff)
+OSL_OP_IMPL_XX(osl_erfc, erfcf)
+
+OSL_OP_IMPL_XXX(osl_pow, safe_powf)
+OSL_OP_IMPL_VVF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVVDF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVDVF_(osl_pow, safe_powf)
+OSL_OP_IMPL_DVDVDF_(osl_pow, safe_powf)
+
+OSL_OP_IMPL_XX(osl_sqrt, sqrtf)
+OSL_OP_IMPL_XX(osl_inversesqrt, 1.0f / sqrtf)
+OSL_OP_IMPL_XX(osl_cbrt, cbrtf)
+
+OSL_OP_IMPL_FF(osl_logb, logbf)
+OSL_OP_IMPL_VV_(osl_logb, logbf)
+
+OSL_OP_IMPL_FF(osl_floor, floorf)
+OSL_OP_IMPL_VV_(osl_floor, floorf)
+OSL_OP_IMPL_FF(osl_ceil, ceilf)
+OSL_OP_IMPL_VV_(osl_ceil, ceilf)
+OSL_OP_IMPL_FF(osl_round, roundf)
+OSL_OP_IMPL_VV_(osl_round, roundf)
+OSL_OP_IMPL_FF(osl_trunc, truncf)
+OSL_OP_IMPL_VV_(osl_trunc, truncf)
+
+ccl_device_forceinline float step_impl(float edge, float x)
+{
+  return x < edge ? 0.0f : 1.0f;
+}
+
+OSL_OP_IMPL_FF(osl_sign, compatible_signf)
+OSL_OP_IMPL_VV_(osl_sign, compatible_signf)
+OSL_OP_IMPL_FFF(osl_step, step_impl)
+OSL_OP_IMPL_VVV_(osl_step, step_impl)
+
+OSL_OP_IMPL_IF(osl_isnan, isnan)
+OSL_OP_IMPL_IF(osl_isinf, isinf)
+OSL_OP_IMPL_IF(osl_isfinite, isfinite)
+
+OSL_OP_IMPL_II(osl_abs, abs)
+OSL_OP_IMPL_XX(osl_abs, fabsf)
+OSL_OP_IMPL_II(osl_fabs, abs)
+OSL_OP_IMPL_XX(osl_fabs, fabsf)
+OSL_OP_IMPL_XXX(osl_fmod, safe_modulo)
+
+OSL_OP_IMPL_FFFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFDFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFFDFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFFDF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFDFF(osl_smoothstep, smoothstep)
+OSL_OP_IMPL_DFDFDFDF(osl_smoothstep, smoothstep)
+
+OSL_OP_IMPL_FVV(osl_dot, dot)
+OSL_OP_IMPL_DFDVV(osl_dot, dot)
+OSL_OP_IMPL_DFVDV(osl_dot, dot)
+OSL_OP_IMPL_DFDVDV(osl_dot, dot)
+OSL_OP_IMPL_VVV(osl_cross, cross)
+OSL_OP_IMPL_DVDVV(osl_cross, cross)
+OSL_OP_IMPL_DVVDV(osl_cross, cross)
+OSL_OP_IMPL_DVDVDV(osl_cross, cross)
+OSL_OP_IMPL_FV(osl_length, len)
+OSL_OP_IMPL_DFDV(osl_length, len)
+OSL_OP_IMPL_FVV(osl_distance, distance)
+OSL_OP_IMPL_DFDVV(osl_distance, distance)
+OSL_OP_IMPL_DFVDV(osl_distance, distance)
+OSL_OP_IMPL_DFDVDV(osl_distance, distance)
+OSL_OP_IMPL_VV(osl_normalize, safe_normalize)
+OSL_OP_IMPL_DVDV(osl_normalize, safe_normalize)
+
+ccl_device_extern void osl_calculatenormal(ccl_private float3 *res,
+                                           ccl_private ShaderGlobals *sg,
+                                           ccl_private const float3 *p)
+{
+  if (sg->flipHandedness)
+    *res = cross(p[2], p[1]);
+  else
+    *res = cross(p[1], p[2]);
+}
+
+ccl_device_extern float osl_area(ccl_private const float3 *p)
+{
+  return len(cross(p[2], p[1]));
+}
+
+ccl_device_extern float osl_filterwidth_fdf(ccl_private const float *x)
+{
+  return sqrtf(x[1] * x[1] + x[2] * x[2]);
+}
+
+ccl_device_extern void osl_filterwidth_vdv(ccl_private float *res, ccl_private const float *x)
+{
+  for (int i = 0; i < 3; ++i)
+    res[i] = osl_filterwidth_fdf(x + i);
+}
+
+ccl_device_extern bool osl_raytype_bit(ccl_private ShaderGlobals *sg, int bit)
+{
+  return (sg->raytype & bit) != 0;
+}
diff --git a/intern/cycles/kernel/osl/services_optix.cu b/intern/cycles/kernel/osl/services_optix.cu
new file mode 100644
index 00000000000..2a43a89a956
--- /dev/null
+++ b/intern/cycles/kernel/osl/services_optix.cu
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+// clang-format off
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */
+
+#include "kernel/osl/services_gpu.h"
+// clang-format on
+
+extern "C" __device__ void __direct_callable__dummy_services()
+{
+}
diff --git a/intern/cycles/kernel/osl/types.h b/intern/cycles/kernel/osl/types.h
index 46e06114360..717306a3d07 100644
--- a/intern/cycles/kernel/osl/types.h
+++ b/intern/cycles/kernel/osl/types.h
@@ -5,9 +5,53 @@
 
 CCL_NAMESPACE_BEGIN
 
+struct DeviceString {
+#if defined(__KERNEL_GPU__)
+  /* Strings are represented by their hashes in CUDA and OptiX. */
+  size_t str_;
+
+  ccl_device_inline_method uint64_t hash() const
+  {
+    return str_;
+  }
+#elif defined(OPENIMAGEIO_USTRING_H)
+  ustring str_;
+
+  ccl_device_inline_method uint64_t hash() const
+  {
+    return str_.hash();
+  }
+#else
+  const char *str_;
+#endif
+
+  ccl_device_inline_method bool operator==(DeviceString b) const
+  {
+    return str_ == b.str_;
+  }
+  ccl_device_inline_method bool operator!=(DeviceString b) const
+  {
+    return str_ != b.str_;
+  }
+};
+
+ccl_device_inline DeviceString make_string(const char *str, size_t hash)
+{
+#if defined(__KERNEL_GPU__)
+  (void)str;
+  return {hash};
+#elif defined(OPENIMAGEIO_USTRING_H)
+  (void)hash;
+  return {ustring(str)};
+#else
+  (void)hash;
+  return {str};
+#endif
+}
+
 /* Closure */
 
-enum ClosureTypeOSL {
+enum OSLClosureType {
   OSL_CLOSURE_MUL_ID = -1,
   OSL_CLOSURE_ADD_ID = -2,
 
@@ -17,4 +61,60 @@ enum ClosureTypeOSL {
 #include "closures_template.h"
 };
 
+struct OSLClosure {
+  OSLClosureType id;
+};
+
+struct ccl_align(8) OSLClosureMul : public OSLClosure
+{
+  packed_float3 weight;
+  ccl_private const OSLClosure *closure;
+};
+
+struct ccl_align(8) OSLClosureAdd : public OSLClosure
+{
+  ccl_private const OSLClosure *closureA;
+  ccl_private const OSLClosure *closureB;
+};
+
+struct ccl_align(8) OSLClosureComponent : public OSLClosure
+{
+  packed_float3 weight;
+};
+
+/* Globals */
+
+struct ShaderGlobals {
+  packed_float3 P, dPdx, dPdy;
+  packed_float3 dPdz;
+  packed_float3 I, dIdx, dIdy;
+  packed_float3 N;
+  packed_float3 Ng;
+  float u, dudx, dudy;
+  float v, dvdx, dvdy;
+  packed_float3 dPdu, dPdv;
+  float time;
+  float dtime;
+  packed_float3 dPdtime;
+  packed_float3 Ps, dPsdx, dPsdy;
+  ccl_private void *renderstate;
+  ccl_private void *tracedata;
+  ccl_private void *objdata;
+  void *context;
+  void *renderer;
+  ccl_private void *object2common;
+  ccl_private void *shader2common;
+  ccl_private OSLClosure *Ci;
+  float surfacearea;
+  int raytype;
+  int flipHandedness;
+  int backfacing;
+};
+
+struct OSLNoiseOptions {
+};
+
+struct OSLTextureOptions {
+};
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 24c5a6a4540..a6f8914a9b8 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -75,10 +75,14 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME__
 
 /* Device specific features */
-#ifndef __KERNEL_GPU__
-#  ifdef WITH_OSL
-#    define __OSL__
+#ifdef WITH_OSL
+#  define __OSL__
+#  ifdef __KERNEL_OPTIX__
+/* Kernels with OSL support are built separately in OptiX and don't need SVM. */
+#    undef __SVM__
 #  endif
+#endif
+#ifndef __KERNEL_GPU__
 #  ifdef WITH_PATH_GUIDING
 #    define __PATH_GUIDING__
 #  endif
@@ -917,9 +921,13 @@ typedef struct ccl_align(16) ShaderData
   float ray_dP;
 
 #ifdef __OSL__
+#  ifdef __KERNEL_GPU__
+  ccl_private uint8_t *osl_closure_pool;
+#  else
   const struct KernelGlobalsCPU *osl_globals;
   const struct IntegratorStateCPU *osl_path_state;
   const struct IntegratorShadowStateCPU *osl_shadow_path_state;
+#  endif
 #endif
 
   /* LCG state for closures that require additional random numbers. */
@@ -1529,6 +1537,9 @@ enum KernelFeatureFlag : uint32_t {
 
   /* Path guiding. */
   KERNEL_FEATURE_PATH_GUIDING = (1U << 26U),
+
+  /* OSL. */
+  KERNEL_FEATURE_OSL = (1U << 27U),
 };
 
 /* Shader node feature mask, to specialize shader evaluation for kernels. */
diff --git a/intern/cycles/scene/osl.cpp b/intern/cycles/scene/osl.cpp
index 93839facdbe..3ea406b6935 100644
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -38,16 +38,17 @@ OSL::TextureSystem *OSLShaderManager::ts_shared = NULL;
 int OSLShaderManager::ts_shared_users = 0;
 thread_mutex OSLShaderManager::ts_shared_mutex;
 
-OSL::ShadingSystem *OSLShaderManager::ss_shared = NULL;
-OSLRenderServices *OSLShaderManager::services_shared = NULL;
+OSL::ErrorHandler OSLShaderManager::errhandler;
+map<int, OSL::ShadingSystem *> OSLShaderManager::ss_shared;
 int OSLShaderManager::ss_shared_users = 0;
 thread_mutex OSLShaderManager::ss_shared_mutex;
 thread_mutex OSLShaderManager::ss_mutex;
+
 int OSLCompiler::texture_shared_unique_id = 0;
 
 /* Shader Manager */
 
-OSLShaderManager::OSLShaderManager()
+OSLShaderManager::OSLShaderManager(Device *device) : device_(device)
 {
   texture_system_init();
   shading_system_init();
@@ -107,11 +108,12 @@ void OSLShaderManager::device_update_specific(Device *device,
 
   device_free(device, dscene, scene);
 
-  /* set texture system */
-  scene->image_manager->set_osl_texture_system((void *)ts);
+  /* set texture system (only on CPU devices, since GPU devices cannot use OIIO) */
+  if (device->info.type == DEVICE_CPU) {
+    scene->image_manager->set_osl_texture_system((void *)ts_shared);
+  }
 
   /* create shaders */
-  OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
   Shader *background_shader = scene->background->get_shader(scene);
 
   foreach (Shader *shader, scene->shaders) {
@@ -125,22 +127,34 @@ void OSLShaderManager::device_update_specific(Device *device,
      * compile shaders alternating */
     thread_scoped_lock lock(ss_mutex);
 
-    OSLCompiler compiler(this, services, ss, scene);
-    compiler.background = (shader == background_shader);
-    compiler.compile(og, shader);
+    device->foreach_device(
+        [this, scene, shader, background = (shader == background_shader)](Device *sub_device) {
+          OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+          OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+          OSLCompiler compiler(this, ss, scene);
+          compiler.background = background;
+          compiler.compile(og, shader);
+        });
 
     if (shader->get_use_mis() && shader->has_surface_emission)
       scene->light_manager->tag_update(scene, LightManager::SHADER_COMPILED);
   }
 
   /* setup shader engine */
-  og->ss = ss;
-  og->ts = ts;
-  og->services = services;
-
   int background_id = scene->shader_manager->get_shader_id(background_shader);
-  og->background_state = og->surface_state[background_id & SHADER_MASK];
-  og->use = true;
+
+  device->foreach_device([background_id](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+    og->ss = ss;
+    og->ts = ts_shared;
+    og->services = static_cast<OSLRenderServices *>(ss->renderer());
+
+    og->background_state = og->surface_state[background_id & SHADER_MASK];
+    og->use = true;
+  });
 
   foreach (Shader *shader, scene->shaders)
     shader->clear_modified();
@@ -148,8 +162,12 @@ void OSLShaderManager::device_update_specific(Device *device,
   update_flags = UPDATE_NONE;
 
   /* add special builtin texture types */
-  services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
-  services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+  for (const auto &[device_type, ss] : ss_shared) {
+    OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
+
+    services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
+    services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+  }
 
   device_update_common(device, dscene, scene, progress);
 
@@ -166,26 +184,35 @@ void OSLShaderManager::device_update_specific(Device *device,
      * is being freed after the Session is freed.
      */
     thread_scoped_lock lock(ss_shared_mutex);
-    ss->optimize_all_groups();
+    for (const auto &[device_type, ss] : ss_shared) {
+      ss->optimize_all_groups();
+    }
+  }
+
+  /* load kernels */
+  if (!device->load_osl_kernels()) {
+    progress.set_error(device->error_message());
   }
 }
 
 void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
 {
-  OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
-
   device_free_common(device, dscene, scene);
 
   /* clear shader engine */
-  og->use = false;
-  og->ss = NULL;
-  og->ts = NULL;
-
-  og->surface_state.clear();
-  og->volume_state.clear();
-  og->displacement_state.clear();
-  og->bump_state.clear();
-  og->background_state.reset();
+  device->foreach_device([](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+
+    og->use = false;
+    og->ss = NULL;
+    og->ts = NULL;
+
+    og->surface_state.clear();
+    og->volume_state.clear();
+    og->displacement_state.clear();
+    og->bump_state.clear();
+    og->background_state.reset();
+  });
 }
 
 void OSLShaderManager::texture_system_init()
@@ -193,7 +220,7 @@ void OSLShaderManager::texture_system_init()
   /* create texture system, shared between different renders to reduce memory usage */
   thread_scoped_lock lock(ts_shared_mutex);
 
-  if (ts_shared_users == 0) {
+  if (ts_shared_users++ == 0) {
     ts_shared = TextureSystem::create(true);
 
     ts_shared->attribute("automip", 1);
@@ -203,24 +230,18 @@ void OSLShaderManager::texture_system_init()
     /* effectively unlimited for now, until we support proper mipmap lookups */
     ts_shared->attribute("max_memory_MB", 16384);
   }
-
-  ts = ts_shared;
-  ts_shared_users++;
 }
 
 void OSLShaderManager::texture_system_free()
 {
   /* shared texture system decrease users and destroy if no longer used */
   thread_scoped_lock lock(ts_shared_mutex);
-  ts_shared_users--;
 
-  if (ts_shared_users == 0) {
+  if (--ts_shared_users == 0) {
     ts_shared->invalidate_all(true);
     OSL::TextureSystem::destroy(ts_shared);
     ts_shared = NULL;
   }
-
-  ts = NULL;
 }
 
 void OSLShaderManager::shading_system_init()
@@ -228,101 +249,105 @@ void OSLShaderManager::shading_system_init()
   /* create shading system, shared between different renders to reduce memory usage */
   thread_scoped_lock lock(ss_shared_mutex);
 
-  if (ss_shared_users == 0) {
-    /* Must use aligned new due to concurrent hash map. */
-    services_shared = util_aligned_new<OSLRenderServices>(ts_shared);
+  device_->foreach_device([](Device *sub_device) {
+    const DeviceType device_type = sub_device->info.type;
 
-    string shader_path = path_get("shader");
+    if (ss_shared_users++ == 0 || ss_shared.find(device_type) == ss_shared.end()) {
+      /* Must use aligned new due to concurrent hash map. */
+      OSLRenderServices *services = util_aligned_new<OSLRenderServices>(ts_shared, device_type);
+
+      string shader_path = path_get("shader");
 #  ifdef _WIN32
-    /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
-     * operate with file paths with any character. This requires to use wide
-     * char functions, but OSL uses old fashioned ANSI functions which means:
-     *
-     * - We have to convert our paths to ANSI before passing to OSL
-     * - OSL can't be used when there's a multi-byte character in the path
-     *   to the shaders folder.
-     */
-    shader_path = string_to_ansi(shader_path);
+      /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
+       * operate with file paths with any character. This requires to use wide
+       * char functions, but OSL uses old fashioned ANSI functions which means:
+       *
+       * - We have to convert our paths to ANSI before passing to OSL
+       * - OSL can't be used when there's a multi-byte character in the path
+       *   to the shaders folder.
+       */
+      shader_path = string_to_ansi(shader_path);
 #  endif
 
-    ss_shared = new OSL::ShadingSystem(services_shared, ts_shared, &errhandler);
-    ss_shared->attribute("lockgeom", 1);
-    ss_shared->attribute("commonspace", "world");
-    ss_shared->attribute("searchpath:shader", shader_path);
-    ss_shared->attribute("greedyjit", 1);
-
-    VLOG_INFO << "Using shader search path: " << shader_path;
-
-    /* our own ray types */
-    static const char *raytypes[] = {
-        "camera",         /* PATH_RAY_CAMERA */
-        "reflection",     /* PATH_RAY_REFLECT */
-        "refraction",     /* PATH_RAY_TRANSMIT */
-        "diffuse",        /* PATH_RAY_DIFFUSE */
-        "glossy",         /* PATH_RAY_GLOSSY */
-        "singular",       /* PATH_RAY_SINGULAR */
-        "transparent",    /* PATH_RAY_TRANSPARENT */
-        "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
-
-        "shadow", /* PATH_RAY_SHADOW_OPAQUE */
-        "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
-
-        "__unused__", /* PATH_RAY_NODE_UNALIGNED */
-        "__unused__", /* PATH_RAY_MIS_SKIP */
-
-        "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
-
-        /* Remaining irrelevant bits up to 32. */
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-    };
-
-    const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
-    ss_shared->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
-
-    OSLRenderServices::register_closures(ss_shared);
-
-    loaded_shaders.clear();
-  }
+      OSL::ShadingSystem *ss = new OSL::ShadingSystem(services, ts_shared, &errhandler);
+      ss->attribute("lockgeom", 1);
+      ss->attribute("commonspace", "world");
+      ss->attribute("searchpath:shader", shader_path);
+      ss->attribute("greedyjit", 1);
+
+      VLOG_INFO << "Using shader search path: " << shader_path;
+
+      /* our own ray types */
+      static const char *raytypes[] = {
+          "camera",         /* PATH_RAY_CAMERA */
+          "reflection",     /* PATH_RAY_REFLECT */
+          "refraction",     /* PATH_RAY_TRANSMIT */
+          "diffuse",        /* PATH_RAY_DIFFUSE */
+          "glossy",         /* PATH_RAY_GLOSSY */
+          "singular",       /* PATH_RAY_SINGULAR */
+          "transparent",    /* PATH_RAY_TRANSPARENT */
+          "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
+
+          "shadow", /* PATH_RAY_SHADOW_OPAQUE */
+          "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
+
+          "__unused__", /* PATH_RAY_NODE_UNALIGNED */
+          "__unused__", /* PATH_RAY_MIS_SKIP */
+
+          "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
+
+          /* Remaining irrelevant bits up to 32. */
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+      };
+
+      const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
+      ss->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
+
+      OSLRenderServices::register_closures(ss);
+
+      ss_shared[device_type] = ss;
+    }
+  });
 
-  ss = ss_shared;
-  services = services_shared;
-  ss_shared_users++;
+  loaded_shaders.clear();
 }
 
 void OSLShaderManager::shading_system_free()
 {
   /* shared shading system decrease users and destroy if no longer used */
   thread_scoped_lock lock(ss_shared_mutex);
-  ss_shared_users--;
 
-  if (ss_shared_users == 0) {
-    delete ss_shared;
-    ss_shared = NULL;
+  device_->foreach_device([](Device * /*sub_device*/) {
+    if (--ss_shared_users == 0) {
+      for (const auto &[device_type, ss] : ss_shared) {
+        OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
 
-    util_aligned_delete(services_shared);
-    services_shared = NULL;
-  }
+        delete ss;
+
+        util_aligned_delete(services);
+      }
 
-  ss = NULL;
-  services = NULL;
+      ss_shared.clear();
+    }
+  });
 }
 
 bool OSLShaderManager::osl_compile(const string &inputfile, const string &outputfile)
@@ -447,7 +472,9 @@ const char *OSLShaderManager::shader_load_filepath(string filepath)
 
 const char *OSLShaderManager::shader_load_bytecode(const string &hash, const string &bytecode)
 {
-  ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+  for (const auto &[device_type, ss] : ss_shared) {
+    ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+  }
 
   OSLShaderInfo info;
 
@@ -599,11 +626,11 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
 
 /* Graph Compiler */
 
-OSLCompiler::OSLCompiler(OSLShaderManager *manager,
-                         OSLRenderServices *services,
-                         OSL::ShadingSystem *ss,
-                         Scene *scene)
-    : scene(scene), manager(manager), services(services), ss(ss)
+OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
+    : scene(scene),
+      manager(manager),
+      services(static_cast<OSLRenderServices *>(ss->renderer())),
+      ss(ss)
 {
   current_type = SHADER_TYPE_SURFACE;
   current_shader = NULL;
@@ -1105,7 +1132,12 @@ OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph
 {
   current_type = type;
 
-  OSL::ShaderGroupRef group = ss->ShaderGroupBegin(shader->name.c_str());
+  string name = shader->name.string();
+  /* Replace invalid characters. */
+  for (size_t i; (i = name.find_first_of(" .,:;+-*/#")) != string::npos;)
+    name.replace(i, 1, "_");
+
+  OSL::ShaderGroupRef group = ss->ShaderGroupBegin(name);
 
   ShaderNode *output = graph->output();
   ShaderNodeSet dependencies;
diff --git a/intern/cycles/scene/osl.h b/intern/cycles/scene/osl.h
index 76c6bd96ce1..c0e82a9dc8d 100644
--- a/intern/cycles/scene/osl.h
+++ b/intern/cycles/scene/osl.h
@@ -54,7 +54,7 @@ struct OSLShaderInfo {
 
 class OSLShaderManager : public ShaderManager {
  public:
-  OSLShaderManager();
+  OSLShaderManager(Device *device);
   ~OSLShaderManager();
 
   static void free_memory();
@@ -92,25 +92,22 @@ class OSLShaderManager : public ShaderManager {
                            const std::string &bytecode_hash = "",
                            const std::string &bytecode = "");
 
- protected:
+ private:
   void texture_system_init();
   void texture_system_free();
 
   void shading_system_init();
   void shading_system_free();
 
-  OSL::ShadingSystem *ss;
-  OSL::TextureSystem *ts;
-  OSLRenderServices *services;
-  OSL::ErrorHandler errhandler;
+  Device *device_;
   map<string, OSLShaderInfo> loaded_shaders;
 
   static OSL::TextureSystem *ts_shared;
   static thread_mutex ts_shared_mutex;
   static int ts_shared_users;
 
-  static OSL::ShadingSystem *ss_shared;
-  static OSLRenderServices *services_shared;
+  static OSL::ErrorHandler errhandler;
+  static map<int, OSL::ShadingSystem *> ss_shared;
   static thread_mutex ss_shared_mutex;
   static thread_mutex ss_mutex;
   static int ss_shared_users;
@@ -123,10 +120,7 @@ class OSLShaderManager : public ShaderManager {
 class OSLCompiler {
  public:
 #ifdef WITH_OSL
-  OSLCompiler(OSLShaderManager *manager,
-              OSLRenderServices *services,
-              OSL::ShadingSystem *shadingsys,
-              Scene *scene);
+  OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *shadingsys, Scene *scene);
 #endif
   void compile(OSLGlobals *og, Shader *shader);
 
diff --git a/intern/cycles/scene/scene.cpp b/intern/cycles/scene/scene.cpp
index 3a05bede7a3..d5be86e1db9 100644
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -99,11 +99,8 @@ Scene::Scene(const SceneParams &params_, Device *device)
 {
   memset((void *)&dscene.data, 0, sizeof(dscene.data));
 
-  /* OSL only works on the CPU */
-  if (device->info.has_osl)
-    shader_manager = ShaderManager::create(params.shadingsystem);
-  else
-    shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM);
+  shader_manager = ShaderManager::create(
+      device->info.has_osl ? params.shadingsystem : SHADINGSYSTEM_SVM, device);
 
   light_manager = new LightManager();
   geometry_manager = new GeometryManager();
diff --git a/intern/cycles/scene/shader.cpp b/intern/cycles/scene/shader.cpp
index 56670c6e4e3..f176c19ec95 100644
--- a/intern/cycles/scene/shader.cpp
+++ b/intern/cycles/scene/shader.cpp
@@ -395,15 +395,16 @@ ShaderManager::~ShaderManager()
 {
 }
 
-ShaderManager *ShaderManager::create(int shadingsystem)
+ShaderManager *ShaderManager::create(int shadingsystem, Device *device)
 {
   ShaderManager *manager;
 
   (void)shadingsystem; /* Ignored when built without OSL. */
+  (void)device;
 
 #ifdef WITH_OSL
   if (shadingsystem == SHADINGSYSTEM_OSL) {
-    manager = new OSLShaderManager();
+    manager = new OSLShaderManager(device);
   }
   else
 #endif
@@ -722,6 +723,10 @@ uint ShaderManager::get_kernel_features(Scene *scene)
     }
   }
 
+  if (use_osl()) {
+    kernel_features |= KERNEL_FEATURE_OSL;
+  }
+
   return kernel_features;
 }
 
diff --git a/intern/cycles/scene/shader.h b/intern/cycles/scene/shader.h
index 2670776aca4..69b22d2ad19 100644
--- a/intern/cycles/scene/shader.h
+++ b/intern/cycles/scene/shader.h
@@ -170,7 +170,7 @@ class ShaderManager {
     UPDATE_NONE = 0u,
   };
 
-  static ShaderManager *create(int shadingsystem);
+  static ShaderManager *create(int shadingsystem, Device *device);
   virtual ~ShaderManager();
 
   virtual void reset(Scene *scene) = 0;
diff --git a/intern/cycles/scene/shader_nodes.h b/intern/cycles/scene/shader_nodes.h
index cc3a71a0697..a3a931bb0b3 100644
--- a/intern/cycles/scene/shader_nodes.h
+++ b/intern/cycles/scene/shader_nodes.h
@@ -1542,6 +1542,10 @@ class OSLNode final : public ShaderNode {
   {
     return true;
   }
+  virtual int get_feature()
+  {
+    return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_RAYTRACE;
+  }
 
   virtual bool equals(const ShaderNode & /*other*/)
   {
diff --git a/intern/cycles/util/defines.h b/intern/cycles/util/defines.h
index 1969529eff0..d5be14c8eba 100644
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -23,6 +23,7 @@
 /* Leave inlining decisions to compiler for these, the inline keyword here
  * is not about performance but including function definitions in headers. */
 #  define ccl_device static inline
+#  define ccl_device_extern extern "C"
 #  define ccl_device_noinline static inline
 #  define ccl_device_noinline_cpu ccl_device_noinline
 
diff --git a/intern/cycles/util/transform.h b/intern/cycles/util/transform.h
index d7f95b7f296..0c39901a63c 100644
--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -196,14 +196,7 @@ ccl_device_inline Transform make_transform_frame(float3 N)
   return make_transform(dx.x, dx.y, dx.z, 0.0f, dy.x, dy.y, dy.z, 0.0f, N.x, N.y, N.z, 0.0f);
 }
 
-#ifndef __KERNEL_GPU__
-
-ccl_device_inline Transform transform_zero()
-{
-  Transform zero = {zero_float4(), zero_float4(), zero_float4()};
-  return zero;
-}
-
+#if !defined(__KERNEL_METAL__)
 ccl_device_inline Transform operator*(const Transform a, const Transform b)
 {
   float4 c_x = make_float4(b.x.x, b.y.x, b.z.x, 0.0f);
@@ -218,6 +211,15 @@ ccl_device_inline Transform operator*(const Transform a, const Transform b)
 
   return t;
 }
+#endif
+
+#ifndef __KERNEL_GPU__
+
+ccl_device_inline Transform transform_zero()
+{
+  Transform zero = {zero_float4(), zero_float4(), zero_float4()};
+  return zero;
+}
 
 ccl_device_inline void print_transform(const char *label, const Transform &t)
 {
author	Patrick Mours <pmours@nvidia.com>	2022-11-09 16:25:32 +0300
committer	Patrick Mours <pmours@nvidia.com>	2022-11-09 17:30:21 +0300
commit	e6b38deb9dbb58118f6ee644409ce52f06eac5e5 (patch)
tree	aa1c384db146094482f24c94f704742c6624db00
parent	efe073f57c34b438d21750795e97458a3d007be7 (diff)