185 files changed, 12104 insertions, 9588 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 473bdb67920..a89c5679b27 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -37,7 +37,12 @@ set(SRC_KERNEL_DEVICE_OPTIX
   device/optix/kernel_shader_raytrace.cu
 )
 
+set(SRC_KERNEL_DEVICE_ONEAPI
+  device/oneapi/kernel.cpp
+)
+
 set(SRC_KERNEL_DEVICE_CPU_HEADERS
+  device/cpu/bvh.h
   device/cpu/compat.h
   device/cpu/image.h
   device/cpu/globals.h
@@ -67,17 +72,30 @@ set(SRC_KERNEL_DEVICE_HIP_HEADERS
 )
 
 set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
+  device/optix/bvh.h
   device/optix/compat.h
   device/optix/globals.h
 )
 
 set(SRC_KERNEL_DEVICE_METAL_HEADERS
+  device/metal/bvh.h
   device/metal/compat.h
   device/metal/context_begin.h
   device/metal/context_end.h
+  device/metal/function_constants.h
   device/metal/globals.h
 )
 
+set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS
+  device/oneapi/compat.h
+  device/oneapi/context_begin.h
+  device/oneapi/context_end.h
+  device/oneapi/globals.h
+  device/oneapi/image.h
+  device/oneapi/kernel.h
+  device/oneapi/kernel_templates.h
+)
+
 set(SRC_KERNEL_CLOSURE_HEADERS
   closure/alloc.h
   closure/bsdf.h
@@ -140,6 +158,7 @@ set(SRC_KERNEL_SVM_HEADERS
   svm/math_util.h
   svm/mix.h
   svm/musgrave.h
+  svm/node_types_template.h
   svm/noise.h
   svm/noisetex.h
   svm/normal.h
@@ -198,8 +217,6 @@ set(SRC_KERNEL_BVH_HEADERS
   bvh/util.h
   bvh/volume.h
   bvh/volume_all.h
-  bvh/embree.h
-  bvh/metal.h
 )
 
 set(SRC_KERNEL_CAMERA_HEADERS
@@ -208,15 +225,18 @@ set(SRC_KERNEL_CAMERA_HEADERS
 )
 
 set(SRC_KERNEL_FILM_HEADERS
-  film/accumulate.h
   film/adaptive_sampling.h
-  film/id_passes.h
-  film/passes.h
+  film/aov_passes.h
+  film/data_passes.h
+  film/denoising_passes.h
+  film/cryptomatte_passes.h
+  film/light_passes.h
   film/read.h
-  film/write_passes.h
+  film/write.h
 )
 
 set(SRC_KERNEL_INTEGRATOR_HEADERS
+  integrator/displacement_shader.h
   integrator/init_from_bake.h
   integrator/init_from_camera.h
   integrator/intersect_closest.h
@@ -228,7 +248,6 @@ set(SRC_KERNEL_INTEGRATOR_HEADERS
   integrator/path_state.h
   integrator/shade_background.h
   integrator/shade_light.h
-  integrator/shader_eval.h
   integrator/shade_shadow.h
   integrator/shade_surface.h
   integrator/shade_volume.h
@@ -241,6 +260,8 @@ set(SRC_KERNEL_INTEGRATOR_HEADERS
   integrator/subsurface_disk.h
   integrator/subsurface.h
   integrator/subsurface_random_walk.h
+  integrator/surface_shader.h
+  integrator/volume_shader.h
   integrator/volume_stack.h
 )
 
@@ -257,6 +278,8 @@ set(SRC_KERNEL_SAMPLE_HEADERS
   sample/mapping.h
   sample/mis.h
   sample/pattern.h
+  sample/sobol_burley.h
+  sample/util.h
 )
 
 set(SRC_KERNEL_UTIL_HEADERS
@@ -267,8 +290,9 @@ set(SRC_KERNEL_UTIL_HEADERS
 )
 
 set(SRC_KERNEL_TYPES_HEADERS
+  data_arrays.h
+  data_template.h
   tables.h
-  textures.h
   types.h
 )
 
@@ -299,6 +323,7 @@ set(SRC_UTIL_HEADERS
   ../util/math_float2.h
   ../util/math_float3.h
   ../util/math_float4.h
+  ../util/math_float8.h
   ../util/math_int2.h
   ../util/math_int3.h
   ../util/math_int4.h
@@ -307,6 +332,7 @@ set(SRC_UTIL_HEADERS
   ../util/rect.h
   ../util/static_assert.h
   ../util/transform.h
+  ../util/transform_inverse.h
   ../util/texture.h
   ../util/types.h
   ../util/types_float2.h
@@ -323,6 +349,7 @@ set(SRC_UTIL_HEADERS
   ../util/types_int3_impl.h
   ../util/types_int4.h
   ../util/types_int4_impl.h
+  ../util/types_spectrum.h
   ../util/types_uchar2.h
   ../util/types_uchar2_impl.h
   ../util/types_uchar3.h
@@ -336,8 +363,6 @@ set(SRC_UTIL_HEADERS
   ../util/types_uint4.h
   ../util/types_uint4_impl.h
   ../util/types_ushort4.h
-  ../util/types_vector3.h
-  ../util/types_vector3_impl.h
 )
 
 set(LIB
@@ -519,8 +544,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
   cycles_set_solution_folder(cycles_kernel_cuda)
 endif()
 
-####################################################### START
-
 # HIP module
 
 if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
@@ -595,7 +618,6 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
   cycles_set_solution_folder(cycles_kernel_hip)
 endif()
 
-####################################################### END
 # OptiX PTX modules
 
 if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
@@ -687,6 +709,201 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
   cycles_set_solution_folder(cycles_kernel_optix)
 endif()
 
+# oneAPI module
+
+if(WITH_CYCLES_DEVICE_ONEAPI)
+  if(WIN32)
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll)
+  else()
+    set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so)
+  endif()
+
+  set(cycles_oneapi_kernel_sources
+    ${SRC_KERNEL_DEVICE_ONEAPI}
+    ${SRC_KERNEL_HEADERS}
+    ${SRC_KERNEL_DEVICE_GPU_HEADERS}
+    ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}
+    ${SRC_UTIL_HEADERS}
+  )
+
+  # SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options
+  set(sycl_compiler_flags
+      ${CMAKE_CURRENT_SOURCE_DIR}/${SRC_KERNEL_DEVICE_ONEAPI}
+      -fsycl
+      -fsycl-unnamed-lambda
+      -fdelayed-template-parsing
+      -mllvm -inlinedefault-threshold=300
+      -mllvm -inlinehint-threshold=400
+      -shared
+      -DWITH_ONEAPI
+      -ffast-math
+      -DNDEBUG
+      -O2
+      -o ${cycles_kernel_oneapi_lib}
+      -I${CMAKE_CURRENT_SOURCE_DIR}/..
+      ${SYCL_CPP_FLAGS}
+      )
+
+
+  if(WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED)
+    list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED)
+  endif()
+
+  # Set defaults for the spir64 and spir64_gen backend options, unless they are already defined.
+  if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
+    set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
+  endif()
+  if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen)
+    set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target")
+  endif()
+  # Enable zebin, a graphics binary format with improved compatibility, then prepend the device list.
+  string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ")
+  string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ")
+
+  if(WITH_CYCLES_ONEAPI_BINARIES)
+    # Pass all targets as one comma-separated list, then add the backend options defined per target.
+    list(JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string)
+    list(APPEND sycl_compiler_flags -fsycl-targets=${targets_string})
+    foreach(target ${CYCLES_ONEAPI_SYCL_TARGETS})
+      if(DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_${target})
+        list(APPEND sycl_compiler_flags -Xsycl-target-backend=${target} "${CYCLES_ONEAPI_SYCL_OPTIONS_${target}}")
+      endif()
+    endforeach()
+  else()
+    # If AOT is disabled, build for spir64 only.
+    list(APPEND sycl_compiler_flags
+      -fsycl-targets=spir64
+      -Xsycl-target-backend=spir64 "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}")
+  endif()
+
+  if(WITH_NANOVDB)
+    list(APPEND sycl_compiler_flags
+      -DWITH_NANOVDB
+      -I"${NANOVDB_INCLUDE_DIR}")
+  endif()
+
+  if(WITH_CYCLES_DEBUG)
+    list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG)
+  endif()
+
+  get_filename_component(sycl_compiler_root ${SYCL_COMPILER} DIRECTORY)
+  get_filename_component(sycl_compiler_compiler_name ${SYCL_COMPILER} NAME_WE)
+
+  if(UNIX AND NOT APPLE)
+    if(NOT WITH_CXX11_ABI)
+      check_library_exists(sycl
+        _ZN4sycl3_V17handler22verifyUsedKernelBundleERKSs ${sycl_compiler_root}/../lib SYCL_NO_CXX11_ABI)
+      if(SYCL_NO_CXX11_ABI)
+        list(APPEND sycl_compiler_flags -D_GLIBCXX_USE_CXX11_ABI=0)
+      endif()
+    endif()
+  endif()
+
+  if(WIN32)
+    list(APPEND sycl_compiler_flags
+    -fms-extensions
+    -fms-compatibility
+    -D_WINDLL
+    -D_MBCS
+    -DWIN32
+    -D_WINDOWS
+    -D_CRT_NONSTDC_NO_DEPRECATE
+    -D_CRT_SECURE_NO_DEPRECATE
+    -DONEAPI_EXPORT)
+
+    if(sycl_compiler_compiler_name MATCHES "dpcpp")
+      # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables.
+      add_custom_command(
+        OUTPUT ${cycles_kernel_oneapi_lib}
+        COMMAND "${sycl_compiler_root}/../../env/vars.bat"
+        COMMAND ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}
+        DEPENDS ${cycles_oneapi_kernel_sources})
+    else()
+      # The open source SYCL compiler just goes by clang++ and does not have such a script.
+      # Set the variables manually.
+      string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR})
+      if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows
+        get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY)
+        string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir})
+        get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE)
+      else()
+        set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION})
+      endif()
+      list(APPEND sycl_compiler_flags
+                  -L "${MSVC_TOOLS_DIR}/lib/x64"
+                  -L "${WINDOWS_KIT_DIR}/um/x64"
+                  -L "${WINDOWS_KIT_DIR}/ucrt/x64")
+      add_custom_command(
+        OUTPUT ${cycles_kernel_oneapi_lib}
+        COMMAND ${CMAKE_COMMAND} -E env
+                "LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib
+                "PATH=${OCLOC_INSTALL_DIR};${sycl_compiler_root}"
+                ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}
+        DEPENDS ${cycles_oneapi_kernel_sources})
+    endif()
+  else()
+    list(APPEND sycl_compiler_flags -fPIC)
+
+    # Avoid having __FAST_MATH__ defined when building on CentOS 7, until the compilation crash
+    # it triggers at either the AoT or the JIT stage is fixed.
+    list(APPEND sycl_compiler_flags -fhonor-nans)
+
+    # Add $ORIGIN to the rpath of cycles_kernel_oneapi.so, so that libsycl.so and
+    # libpi_level_zero.so can be placed next to it and still be found.
+    list(APPEND sycl_compiler_flags -Wl,-rpath,'$$ORIGIN')
+
+    # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables.
+    if(sycl_compiler_compiler_name MATCHES "dpcpp")
+      add_custom_command(
+        OUTPUT ${cycles_kernel_oneapi_lib}
+        COMMAND bash -c \"source ${sycl_compiler_root}/../../env/vars.sh&&${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}\"
+        DEPENDS ${cycles_oneapi_kernel_sources})
+    else()
+      # The open source SYCL compiler just goes by clang++ and does not have such a script.
+      # Set the variables manually.
+      if(NOT IGC_INSTALL_DIR)
+        get_filename_component(IGC_INSTALL_DIR "${sycl_compiler_root}/../lib/igc" ABSOLUTE)
+      endif()
+      add_custom_command(
+        OUTPUT ${cycles_kernel_oneapi_lib}
+        COMMAND ${CMAKE_COMMAND} -E env
+                "LD_LIBRARY_PATH=${sycl_compiler_root}/../lib:${OCLOC_INSTALL_DIR}/lib:${IGC_INSTALL_DIR}/lib"
+                "PATH=${OCLOC_INSTALL_DIR}/bin:${sycl_compiler_root}:$ENV{PATH}" # env PATH is for compiler to find ld
+                ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}
+        DEPENDS ${cycles_oneapi_kernel_sources})
+    endif()
+  endif()
+
+  # install dynamic libraries required at runtime
+  if(WIN32)
+    set(SYCL_RUNTIME_DEPENDENCIES
+        sycl.dll
+        pi_level_zero.dll
+    )
+    if(NOT WITH_BLENDER)
+      # For the Cycles standalone put libraries next to the Cycles application.
+      delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH})
+    else()
+      # For Blender put the libraries next to the Blender executable.
+      #
+      # Note that the installation path in the delayed_install is relative to the versioned folder,
+      # which means we need to go one level up.
+      delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../")
+    endif()
+  elseif(UNIX AND NOT APPLE)
+    file(GLOB SYCL_RUNTIME_DEPENDENCIES
+              ${sycl_compiler_root}/../lib/libsycl.so
+              ${sycl_compiler_root}/../lib/libsycl.so.[0-9]
+              ${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9]
+    )
+    list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so)
+    delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib)
+  endif()
+
+  delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib)
+  add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib})
+endif()
+
 # OSL module
 
 if(WITH_CYCLES_OSL)
@@ -752,6 +969,7 @@ cycles_add_library(cycles_kernel "${LIB}"
   ${SRC_KERNEL_DEVICE_HIP_HEADERS}
   ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
   ${SRC_KERNEL_DEVICE_METAL_HEADERS}
+  ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}
 )
 
 source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -764,6 +982,7 @@ source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
 source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
 source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
 source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
+source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS})
 source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
 source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -782,6 +1001,9 @@ endif()
 if(WITH_CYCLES_HIP)
   add_dependencies(cycles_kernel cycles_kernel_hip)
 endif()
+if(WITH_CYCLES_DEVICE_ONEAPI)
+  add_dependencies(cycles_kernel cycles_kernel_oneapi)
+endif()
 
 # Install kernel source for runtime compilation
 
diff --git a/intern/cycles/kernel/bake/bake.h b/intern/cycles/kernel/bake/bake.h
index 544a8217bef..384ca9168f0 100644
--- a/intern/cycles/kernel/bake/bake.h
+++ b/intern/cycles/kernel/bake/bake.h
@@ -4,10 +4,13 @@
 #pragma once
 
 #include "kernel/camera/projection.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/displacement_shader.h"
+#include "kernel/integrator/surface_shader.h"
 
 #include "kernel/geom/geom.h"
 
+#include "kernel/util/color.h"
+
 CCL_NAMESPACE_BEGIN
 
 ccl_device void kernel_displace_evaluate(KernelGlobals kg,
@@ -23,20 +26,20 @@ ccl_device void kernel_displace_evaluate(KernelGlobals kg,
 
   /* Evaluate displacement shader. */
   const float3 P = sd.P;
-  shader_eval_displacement(kg, INTEGRATOR_STATE_NULL, &sd);
+  displacement_shader_eval(kg, INTEGRATOR_STATE_NULL, &sd);
   float3 D = sd.P - P;
 
   object_inverse_dir_transform(kg, &sd, &D);
 
 #ifdef __KERNEL_DEBUG_NAN__
-  if (!isfinite3_safe(D)) {
+  if (!isfinite_safe(D)) {
     kernel_assert(!"Cycles displacement with non-finite value detected");
   }
 #endif
 
   /* Ensure finite displacement, preventing BVH from becoming degenerate and avoiding possible
    * traversal issues caused by non-finite math. */
-  D = ensure_finite3(D);
+  D = ensure_finite(D);
 
   /* Write output. */
   output[offset * 3 + 0] += D.x;
@@ -62,24 +65,26 @@ ccl_device void kernel_background_evaluate(KernelGlobals kg,
   /* Evaluate shader.
    * This is being evaluated for all BSDFs, so path flag does not contain a specific type. */
   const uint32_t path_flag = PATH_RAY_EMISSION;
-  shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT &
+  surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT &
                       ~(KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_NODE_LIGHT_PATH)>(
       kg, INTEGRATOR_STATE_NULL, &sd, NULL, path_flag);
-  float3 color = shader_background_eval(&sd);
+  Spectrum color = surface_shader_background(&sd);
 
 #ifdef __KERNEL_DEBUG_NAN__
-  if (!isfinite3_safe(color)) {
+  if (!isfinite_safe(color)) {
     kernel_assert(!"Cycles background with non-finite value detected");
   }
 #endif
 
   /* Ensure finite color, avoiding possible numerical instabilities in the path tracing kernels. */
-  color = ensure_finite3(color);
+  color = ensure_finite(color);
+
+  float3 color_rgb = spectrum_to_rgb(color);
 
   /* Write output. */
-  output[offset * 3 + 0] += color.x;
-  output[offset * 3 + 1] += color.y;
-  output[offset * 3 + 2] += color.z;
+  output[offset * 3 + 0] += color_rgb.x;
+  output[offset * 3 + 1] += color_rgb.y;
+  output[offset * 3 + 2] += color_rgb.z;
 }
 
 ccl_device void kernel_curve_shadow_transparency_evaluate(
@@ -95,12 +100,12 @@ ccl_device void kernel_curve_shadow_transparency_evaluate(
   shader_setup_from_curve(kg, &sd, in.object, in.prim, __float_as_int(in.v), in.u);
 
   /* Evaluate transparency. */
-  shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW &
+  surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW &
                       ~(KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_NODE_LIGHT_PATH)>(
       kg, INTEGRATOR_STATE_NULL, &sd, NULL, PATH_RAY_SHADOW);
 
   /* Write output. */
-  output[offset] = clamp(average(shader_bsdf_transparency(kg, &sd)), 0.0f, 1.0f);
+  output[offset] = clamp(average(surface_shader_transparency(kg, &sd)), 0.0f, 1.0f);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 04ccb7ceff5..29789a15b28 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -1,40 +1,47 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-/* BVH
- *
- * Bounding volume hierarchy for ray tracing. We compile different variations
- * of the same BVH traversal function for faster rendering when some types of
- * primitives are not needed, using #includes to work around the lack of
- * C++ templates in OpenCL.
- *
- * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
- * the code has been extended and modified to support more primitives and work
- * with CPU/CUDA/OpenCL. */
-
 #pragma once
 
-#ifdef __EMBREE__
-#  include "kernel/bvh/embree.h"
-#endif
-
-#ifdef __METALRT__
-#  include "kernel/bvh/metal.h"
-#endif
-
 #include "kernel/bvh/types.h"
 #include "kernel/bvh/util.h"
 
 #include "kernel/integrator/state_util.h"
 
+/* Device specific acceleration structures for ray tracing. */
+
+#if defined(__EMBREE__)
+#  include "kernel/device/cpu/bvh.h"
+#  define __BVH2__
+#elif defined(__METALRT__)
+#  include "kernel/device/metal/bvh.h"
+#elif defined(__KERNEL_OPTIX__)
+#  include "kernel/device/optix/bvh.h"
+#else
+#  define __BVH2__
+#endif
+
 CCL_NAMESPACE_BEGIN
 
-#if !defined(__KERNEL_GPU_RAYTRACING__)
+#ifdef __BVH2__
 
-/* Regular BVH traversal */
+/* BVH2
+ *
+ * Bounding volume hierarchy for ray tracing, when no native acceleration
+ * structure is available for the device.
+ *
+ * We compile different variations of the same BVH traversal function for
+ * faster rendering when some types of primitives are not needed, using #includes
+ * to work around the lack of C++ templates in OpenCL.
+ *
+ * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
+ * the code has been extended and modified to support more primitives and work
+ * with CPU and various GPU kernel languages. */
 
 #  include "kernel/bvh/nodes.h"
 
+/* Regular BVH traversal */
+
 #  define BVH_FUNCTION_NAME bvh_intersect
 #  define BVH_FUNCTION_FEATURES BVH_POINTCLOUD
 #  include "kernel/bvh/traversal.h"
@@ -57,260 +64,20 @@ CCL_NAMESPACE_BEGIN
 #    include "kernel/bvh/traversal.h"
 #  endif
 
-/* Subsurface scattering BVH traversal */
-
-#  if defined(__BVH_LOCAL__)
-#    define BVH_FUNCTION_NAME bvh_intersect_local
-#    define BVH_FUNCTION_FEATURES BVH_HAIR
-#    include "kernel/bvh/local.h"
-
-#    if defined(__OBJECT_MOTION__)
-#      define BVH_FUNCTION_NAME bvh_intersect_local_motion
-#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
-#      include "kernel/bvh/local.h"
-#    endif
-#  endif /* __BVH_LOCAL__ */
-
-/* Volume BVH traversal */
-
-#  if defined(__VOLUME__)
-#    define BVH_FUNCTION_NAME bvh_intersect_volume
-#    define BVH_FUNCTION_FEATURES BVH_HAIR
-#    include "kernel/bvh/volume.h"
-
-#    if defined(__OBJECT_MOTION__)
-#      define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
-#      include "kernel/bvh/volume.h"
-#    endif
-#  endif /* __VOLUME__ */
-
-/* Record all intersections - Shadow BVH traversal */
-
-#  if defined(__SHADOW_RECORD_ALL__)
-#    define BVH_FUNCTION_NAME bvh_intersect_shadow_all
-#    define BVH_FUNCTION_FEATURES BVH_POINTCLOUD
-#    include "kernel/bvh/shadow_all.h"
-
-#    if defined(__HAIR__)
-#      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
-#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_POINTCLOUD
-#      include "kernel/bvh/shadow_all.h"
-#    endif
-
-#    if defined(__OBJECT_MOTION__)
-#      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
-#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_POINTCLOUD
-#      include "kernel/bvh/shadow_all.h"
-#    endif
-
-#    if defined(__HAIR__) && defined(__OBJECT_MOTION__)
-#      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
-#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION | BVH_POINTCLOUD
-#      include "kernel/bvh/shadow_all.h"
-#    endif
-
-#  endif /* __SHADOW_RECORD_ALL__ */
-
-/* Record all intersections - Volume BVH traversal. */
-
-#  if defined(__VOLUME_RECORD_ALL__)
-#    define BVH_FUNCTION_NAME bvh_intersect_volume_all
-#    define BVH_FUNCTION_FEATURES BVH_HAIR
-#    include "kernel/bvh/volume_all.h"
-
-#    if defined(__OBJECT_MOTION__)
-#      define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
-#      include "kernel/bvh/volume_all.h"
-#    endif
-#  endif /* __VOLUME_RECORD_ALL__ */
-
-#  undef BVH_FEATURE
-#  undef BVH_NAME_JOIN
-#  undef BVH_NAME_EVAL
-#  undef BVH_FUNCTION_FULL_NAME
-
-#endif /* !defined(__KERNEL_GPU_RAYTRACING__) */
-
-ccl_device_inline bool scene_intersect_valid(ccl_private const Ray *ray)
-{
-  /* NOTE: Due to some vectorization code  non-finite origin point might
-   * cause lots of false-positive intersections which will overflow traversal
-   * stack.
-   * This code is a quick way to perform early output, to avoid crashes in
-   * such cases.
-   * From production scenes so far it seems it's enough to test first element
-   * only.
-   * Scene intersection may also called with empty rays for conditional trace
-   * calls that evaluate to false, so filter those out.
-   */
-  return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
-}
-
 ccl_device_intersect bool scene_intersect(KernelGlobals kg,
                                           ccl_private const Ray *ray,
                                           const uint visibility,
                                           ccl_private Intersection *isect)
 {
-#ifdef __KERNEL_OPTIX__
-  uint p0 = 0;
-  uint p1 = 0;
-  uint p2 = 0;
-  uint p3 = 0;
-  uint p4 = visibility;
-  uint p5 = PRIMITIVE_NONE;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  uint ray_mask = visibility & 0xFF;
-  uint ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
-  }
-
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
-             ray->P,
-             ray->D,
-             0.0f,
-             ray->t,
-             ray->time,
-             ray_mask,
-             ray_flags,
-             0, /* SBT offset for PG_HITD */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  isect->t = __uint_as_float(p0);
-  isect->u = __uint_as_float(p1);
-  isect->v = __uint_as_float(p2);
-  isect->prim = p3;
-  isect->object = p4;
-  isect->type = p5;
-
-  return p5 != PRIMITIVE_NONE;
-#elif defined(__METALRT__)
-
-  if (!scene_intersect_valid(ray)) {
-    isect->t = ray->t;
-    isect->type = PRIMITIVE_NONE;
-    return false;
-  }
-
-#  if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    isect->t = ray->t;
-    isect->type = PRIMITIVE_NONE;
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
-    return false;
-  }
-
-  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
-    isect->t = ray->t;
-    isect->type = PRIMITIVE_NONE;
-    kernel_assert(!"Invalid ift_default");
-    return false;
-  }
-#  endif
-
-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
-  metalrt_intersector_type metalrt_intersect;
-
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
-
-  MetalRTIntersectionPayload payload;
-  payload.self = ray->self;
-  payload.u = 0.0f;
-  payload.v = 0.0f;
-  payload.visibility = visibility;
-
-  typename metalrt_intersector_type::result_type intersection;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-    /* No further intersector setup required: Default MetalRT behavior is any-hit. */
-  }
-  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    /* No further intersector setup required: Shadow ray early termination is controlled by the
-     * intersection handler */
-  }
-
-#  if defined(__METALRT_MOTION__)
-  payload.time = ray->time;
-  intersection = metalrt_intersect.intersect(r,
-                                             metal_ancillaries->accel_struct,
-                                             ray_mask,
-                                             ray->time,
-                                             metal_ancillaries->ift_default,
-                                             payload);
-#  else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
-#  endif
-
-  if (intersection.type == intersection_type::none) {
-    isect->t = ray->t;
-    isect->type = PRIMITIVE_NONE;
-
-    return false;
-  }
-
-  isect->t = intersection.distance;
-
-  isect->prim = payload.prim;
-  isect->type = payload.type;
-  isect->object = intersection.user_instance_id;
-
-  isect->t = intersection.distance;
-  if (intersection.type == intersection_type::triangle) {
-    isect->u = 1.0f - intersection.triangle_barycentric_coord.y -
-               intersection.triangle_barycentric_coord.x;
-    isect->v = intersection.triangle_barycentric_coord.x;
-  }
-  else {
-    isect->u = payload.u;
-    isect->v = payload.v;
-  }
-
-  return isect->type != PRIMITIVE_NONE;
-
-#else
-
-  if (!scene_intersect_valid(ray)) {
+  if (!intersection_ray_valid(ray)) {
     return false;
   }
 
 #  ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
-    isect->t = ray->t;
-    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
-    IntersectContext rtc_ctx(&ctx);
-    RTCRayHit ray_hit;
-    ctx.ray = ray;
-    kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
-    rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit);
-    if (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID &&
-        ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) {
-      kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect);
-      return true;
-    }
-    return false;
+  if (kernel_data.device_bvh) {
+    return kernel_embree_intersect(kg, ray, visibility, isect);
   }
-#  endif /* __EMBREE__ */
+#  endif
 
 #  ifdef __OBJECT_MOTION__
   if (kernel_data.bvh.have_motion) {
@@ -322,7 +89,7 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
 
     return bvh_intersect_motion(kg, ray, isect, visibility);
   }
-#  endif   /* __OBJECT_MOTION__ */
+#  endif /* __OBJECT_MOTION__ */
 
 #  ifdef __HAIR__
   if (kernel_data.bvh.have_curves) {
@@ -331,10 +98,22 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
 #  endif /* __HAIR__ */
 
   return bvh_intersect(kg, ray, isect, visibility);
-#endif   /* __KERNEL_OPTIX__ */
 }
 
-#ifdef __BVH_LOCAL__
+/* Single object BVH traversal, for SSS/AO/bevel. */
+
+#  ifdef __BVH_LOCAL__
+
+#    define BVH_FUNCTION_NAME bvh_intersect_local
+#    define BVH_FUNCTION_FEATURES BVH_HAIR
+#    include "kernel/bvh/local.h"
+
+#    if defined(__OBJECT_MOTION__)
+#      define BVH_FUNCTION_NAME bvh_intersect_local_motion
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+#      include "kernel/bvh/local.h"
+#    endif
+
 ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
                                                 ccl_private const Ray *ray,
                                                 ccl_private LocalIntersection *local_isect,
@@ -342,108 +121,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
                                                 ccl_private uint *lcg_state,
                                                 int max_hits)
 {
-#  ifdef __KERNEL_OPTIX__
-  uint p0 = pointer_pack_to_uint_0(lcg_state);
-  uint p1 = pointer_pack_to_uint_1(lcg_state);
-  uint p2 = pointer_pack_to_uint_0(local_isect);
-  uint p3 = pointer_pack_to_uint_1(local_isect);
-  uint p4 = local_object;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
-  uint p5 = max_hits;
-
-  if (local_isect) {
-    local_isect->num_hits = 0; /* Initialize hit count to zero. */
-  }
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
-             ray->P,
-             ray->D,
-             0.0f,
-             ray->t,
-             ray->time,
-             0xFF,
-             /* Need to always call into __anyhit__kernel_optix_local_hit. */
-             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             2, /* SBT offset for PG_HITL */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  return p5;
-#  elif defined(__METALRT__)
-  if (!scene_intersect_valid(ray)) {
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    return false;
-  }
-
-#    if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
-    return false;
-  }
-
-  if (is_null_intersection_function_table(metal_ancillaries->ift_local)) {
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    kernel_assert(!"Invalid ift_local");
-    return false;
-  }
-#    endif
-
-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
-
-  MetalRTIntersectionLocalPayload payload;
-  payload.self = ray->self;
-  payload.local_object = local_object;
-  payload.max_hits = max_hits;
-  payload.local_isect.num_hits = 0;
-  if (lcg_state) {
-    payload.has_lcg_state = true;
-    payload.lcg_state = *lcg_state;
-  }
-  payload.result = false;
-
-  typename metalrt_intersector_type::result_type intersection;
-
-#    if defined(__METALRT_MOTION__)
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
-#    else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
-#    endif
-
-  if (lcg_state) {
-    *lcg_state = payload.lcg_state;
-  }
-  *local_isect = payload.local_isect;
-
-  return payload.result;
-
-#  else
-
-  if (!scene_intersect_valid(ray)) {
+  if (!intersection_ray_valid(ray)) {
     if (local_isect) {
       local_isect->num_hits = 0;
     }
@@ -451,59 +129,10 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
   }
 
 #    ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
-    const bool has_bvh = !(kernel_tex_fetch(__object_flag, local_object) &
-                           SD_OBJECT_TRANSFORM_APPLIED);
-    CCLIntersectContext ctx(
-        kg, has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
-    ctx.lcg_state = lcg_state;
-    ctx.max_hits = max_hits;
-    ctx.ray = ray;
-    ctx.local_isect = local_isect;
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    ctx.local_object_id = local_object;
-    IntersectContext rtc_ctx(&ctx);
-    RTCRay rtc_ray;
-    kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
-
-    /* If this object has its own BVH, use it. */
-    if (has_bvh) {
-      RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2);
-      if (geom) {
-        float3 P = ray->P;
-        float3 dir = ray->D;
-        float3 idir = ray->D;
-        Transform ob_itfm;
-        rtc_ray.tfar = ray->t *
-                       bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
-        /* bvh_instance_motion_push() returns the inverse transform but
-         * it's not needed here. */
-        (void)ob_itfm;
-
-        rtc_ray.org_x = P.x;
-        rtc_ray.org_y = P.y;
-        rtc_ray.org_z = P.z;
-        rtc_ray.dir_x = dir.x;
-        rtc_ray.dir_y = dir.y;
-        rtc_ray.dir_z = dir.z;
-        RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
-        kernel_assert(scene);
-        if (scene) {
-          rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
-        }
-      }
-    }
-    else {
-      rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
-    }
-
-    /* rtcOccluded1 sets tfar to -inf if a hit was found. */
-    return (local_isect && local_isect->num_hits > 0) || (rtc_ray.tfar < 0);
-    ;
+  if (kernel_data.device_bvh) {
+    return kernel_embree_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
   }
-#    endif /* __EMBREE__ */
+#    endif
 
 #    ifdef __OBJECT_MOTION__
   if (kernel_data.bvh.have_motion) {
@@ -511,144 +140,55 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
   }
 #    endif /* __OBJECT_MOTION__ */
   return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
-#  endif   /* __KERNEL_OPTIX__ */
 }
-#endif
+#  endif
 
-#ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
-                                                     IntegratorShadowState state,
-                                                     ccl_private const Ray *ray,
-                                                     uint visibility,
-                                                     uint max_hits,
-                                                     ccl_private uint *num_recorded_hits,
-                                                     ccl_private float *throughput)
-{
-#  ifdef __KERNEL_OPTIX__
-  uint p0 = state;
-  uint p1 = __float_as_uint(1.0f); /* Throughput. */
-  uint p2 = 0;                     /* Number of hits. */
-  uint p3 = max_hits;
-  uint p4 = visibility;
-  uint p5 = false;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
+/* Transparent shadow BVH traversal, recording multiple intersections. */
 
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
-             ray->P,
-             ray->D,
-             0.0f,
-             ray->t,
-             ray->time,
-             ray_mask,
-             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
-             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             1, /* SBT offset for PG_HITS */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  *num_recorded_hits = uint16_unpack_from_uint_0(p2);
-  *throughput = __uint_as_float(p1);
-
-  return p5;
-#  elif defined(__METALRT__)
-
-  if (!scene_intersect_valid(ray)) {
-    return false;
-  }
+#  ifdef __SHADOW_RECORD_ALL__
 
-#    if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
-    return false;
-  }
+#    define BVH_FUNCTION_NAME bvh_intersect_shadow_all
+#    define BVH_FUNCTION_FEATURES BVH_POINTCLOUD
+#    include "kernel/bvh/shadow_all.h"
 
-  if (is_null_intersection_function_table(metal_ancillaries->ift_shadow)) {
-    kernel_assert(!"Invalid ift_shadow");
-    return false;
-  }
+#    if defined(__HAIR__)
+#      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
+#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_POINTCLOUD
+#      include "kernel/bvh/shadow_all.h"
 #    endif
 
-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
-
-  MetalRTIntersectionShadowPayload payload;
-  payload.self = ray->self;
-  payload.visibility = visibility;
-  payload.max_hits = max_hits;
-  payload.num_hits = 0;
-  payload.num_recorded_hits = 0;
-  payload.throughput = 1.0f;
-  payload.result = false;
-  payload.state = state;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-
-  typename metalrt_intersector_type::result_type intersection;
-
-#    if defined(__METALRT_MOTION__)
-  payload.time = ray->time;
-  intersection = metalrt_intersect.intersect(r,
-                                             metal_ancillaries->accel_struct,
-                                             ray_mask,
-                                             ray->time,
-                                             metal_ancillaries->ift_shadow,
-                                             payload);
-#    else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_shadow, payload);
+#    if defined(__OBJECT_MOTION__)
+#      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_POINTCLOUD
+#      include "kernel/bvh/shadow_all.h"
 #    endif
 
-  *num_recorded_hits = payload.num_recorded_hits;
-  *throughput = payload.throughput;
-
-  return payload.result;
+#    if defined(__HAIR__) && defined(__OBJECT_MOTION__)
+#      define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
+#      define BVH_FUNCTION_FEATURES BVH_HAIR | BVH_MOTION | BVH_POINTCLOUD
+#      include "kernel/bvh/shadow_all.h"
+#    endif
 
-#  else
-  if (!scene_intersect_valid(ray)) {
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
+                                                     ccl_private const Ray *ray,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     ccl_private uint *num_recorded_hits,
+                                                     ccl_private float *throughput)
+{
+  if (!intersection_ray_valid(ray)) {
     *num_recorded_hits = 0;
     *throughput = 1.0f;
     return false;
   }
 
 #    ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
-    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
-    Intersection *isect_array = (Intersection *)state->shadow_isect;
-    ctx.isect_s = isect_array;
-    ctx.max_hits = max_hits;
-    ctx.ray = ray;
-    IntersectContext rtc_ctx(&ctx);
-    RTCRay rtc_ray;
-    kernel_embree_setup_ray(*ray, rtc_ray, visibility);
-    rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
-
-    *num_recorded_hits = ctx.num_recorded_hits;
-    *throughput = ctx.throughput;
-    return ctx.opaque_hit;
+  if (kernel_data.device_bvh) {
+    return kernel_embree_intersect_shadow_all(
+        kg, state, ray, visibility, max_hits, num_recorded_hits, throughput);
   }
-#    endif /* __EMBREE__ */
+#    endif /* __EMBREE__ */
 
 #    ifdef __OBJECT_MOTION__
   if (kernel_data.bvh.have_motion) {
@@ -662,7 +202,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
     return bvh_intersect_shadow_all_motion(
         kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
   }
-#    endif   /* __OBJECT_MOTION__ */
+#    endif /* __OBJECT_MOTION__ */
 
 #    ifdef __HAIR__
   if (kernel_data.bvh.have_curves) {
@@ -673,180 +213,89 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
 
   return bvh_intersect_shadow_all(
       kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
-#  endif   /* __KERNEL_OPTIX__ */
 }
-#endif /* __SHADOW_RECORD_ALL__ */
+#  endif /* __SHADOW_RECORD_ALL__ */
+
+/* Volume BVH traversal, for initializing or updating the volume stack. */
+
+#  if defined(__VOLUME__) && !defined(__VOLUME_RECORD_ALL__)
+
+#    define BVH_FUNCTION_NAME bvh_intersect_volume
+#    define BVH_FUNCTION_FEATURES BVH_HAIR
+#    include "kernel/bvh/volume.h"
+
+#    if defined(__OBJECT_MOTION__)
+#      define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+#      include "kernel/bvh/volume.h"
+#    endif
 
-#ifdef __VOLUME__
 ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
                                                  ccl_private const Ray *ray,
                                                  ccl_private Intersection *isect,
                                                  const uint visibility)
 {
-#  ifdef __KERNEL_OPTIX__
-  uint p0 = 0;
-  uint p1 = 0;
-  uint p2 = 0;
-  uint p3 = 0;
-  uint p4 = visibility;
-  uint p5 = PRIMITIVE_NONE;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
-             ray->P,
-             ray->D,
-             0.0f,
-             ray->t,
-             ray->time,
-             ray_mask,
-             /* Need to always call into __anyhit__kernel_optix_volume_test. */
-             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             3, /* SBT offset for PG_HITV */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  isect->t = __uint_as_float(p0);
-  isect->u = __uint_as_float(p1);
-  isect->v = __uint_as_float(p2);
-  isect->prim = p3;
-  isect->object = p4;
-  isect->type = p5;
-
-  return p5 != PRIMITIVE_NONE;
-#  elif defined(__METALRT__)
-
-  if (!scene_intersect_valid(ray)) {
-    return false;
-  }
-#    if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+  if (!intersection_ray_valid(ray)) {
     return false;
   }
 
-  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
-    kernel_assert(!"Invalid ift_default");
-    return false;
+#    ifdef __OBJECT_MOTION__
+  if (kernel_data.bvh.have_motion) {
+    return bvh_intersect_volume_motion(kg, ray, isect, visibility);
   }
-#    endif
-
-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
-  metalrt_intersector_type metalrt_intersect;
+#    endif /* __OBJECT_MOTION__ */
 
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
+  return bvh_intersect_volume(kg, ray, isect, visibility);
+}
+#  endif /* defined(__VOLUME__) && !defined(__VOLUME_RECORD_ALL__) */
 
-  MetalRTIntersectionPayload payload;
-  payload.self = ray->self;
-  payload.visibility = visibility;
+/* Volume BVH traversal, for initializing or updating the volume stack.
+ * Variation that records multiple intersections at once. */
 
-  typename metalrt_intersector_type::result_type intersection;
+#  if defined(__VOLUME__) && defined(__VOLUME_RECORD_ALL__)
 
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
+#    define BVH_FUNCTION_NAME bvh_intersect_volume_all
+#    define BVH_FUNCTION_FEATURES BVH_HAIR
+#    include "kernel/bvh/volume_all.h"
 
-#    if defined(__METALRT_MOTION__)
-  payload.time = ray->time;
-  intersection = metalrt_intersect.intersect(r,
-                                             metal_ancillaries->accel_struct,
-                                             ray_mask,
-                                             ray->time,
-                                             metal_ancillaries->ift_default,
-                                             payload);
-#    else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
+#    if defined(__OBJECT_MOTION__)
+#      define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+#      include "kernel/bvh/volume_all.h"
 #    endif
 
-  if (intersection.type == intersection_type::none) {
+ccl_device_intersect uint scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint max_hits,
+                                                 const uint visibility)
+{
+  if (!intersection_ray_valid(ray)) {
-    return false;
+    return 0; /* Result is uint, not bool; an invalid ray yields zero. */
   }
 
-  isect->prim = payload.prim;
-  isect->type = payload.type;
-  isect->object = intersection.user_instance_id;
-
-  isect->t = intersection.distance;
-  if (intersection.type == intersection_type::triangle) {
-    isect->u = 1.0f - intersection.triangle_barycentric_coord.y -
-               intersection.triangle_barycentric_coord.x;
-    isect->v = intersection.triangle_barycentric_coord.x;
-  }
-  else {
-    isect->u = payload.u;
-    isect->v = payload.v;
-  }
-
-  return isect->type != PRIMITIVE_NONE;
-
-#  else
-  if (!scene_intersect_valid(ray)) {
-    return false;
+#    ifdef __EMBREE__
+  if (kernel_data.device_bvh) {
+    return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility);
   }
+#    endif /* __EMBREE__ */
 
 #    ifdef __OBJECT_MOTION__
   if (kernel_data.bvh.have_motion) {
-    return bvh_intersect_volume_motion(kg, ray, isect, visibility);
+    return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
   }
 #    endif /* __OBJECT_MOTION__ */
 
-  return bvh_intersect_volume(kg, ray, isect, visibility);
-#  endif   /* __KERNEL_OPTIX__ */
+  return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
 }
-#endif /* __VOLUME__ */
 
-#ifdef __VOLUME_RECORD_ALL__
-ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals kg,
-                                                     ccl_private const Ray *ray,
-                                                     ccl_private Intersection *isect,
-                                                     const uint max_hits,
-                                                     const uint visibility)
-{
-  if (!scene_intersect_valid(ray)) {
-    return false;
-  }
+#  endif /* defined(__VOLUME__) && defined(__VOLUME_RECORD_ALL__) */
 
-#  ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
-    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
-    ctx.isect_s = isect;
-    ctx.max_hits = max_hits;
-    ctx.num_hits = 0;
-    ctx.ray = ray;
-    IntersectContext rtc_ctx(&ctx);
-    RTCRay rtc_ray;
-    kernel_embree_setup_ray(*ray, rtc_ray, visibility);
-    rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
-    return ctx.num_hits;
-  }
-#  endif /* __EMBREE__ */
-
-#  ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
-    return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
-  }
-#  endif /* __OBJECT_MOTION__ */
+#  undef BVH_FEATURE
+#  undef BVH_NAME_JOIN
+#  undef BVH_NAME_EVAL
+#  undef BVH_FUNCTION_FULL_NAME
 
-  return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
-}
-#endif /* __VOLUME_RECORD_ALL__ */
+#endif /* __BVH2__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/embree.h b/intern/cycles/kernel/bvh/embree.h
deleted file mode 100644
index 4f7e6435daf..00000000000
--- a/intern/cycles/kernel/bvh/embree.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2018-2022 Blender Foundation. */
-
-#pragma once
-
-#include <embree3/rtcore_ray.h>
-#include <embree3/rtcore_scene.h>
-
-#include "kernel/device/cpu/compat.h"
-#include "kernel/device/cpu/globals.h"
-
-#include "kernel/bvh/util.h"
-
-#include "util/vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct CCLIntersectContext {
-  typedef enum {
-    RAY_REGULAR = 0,
-    RAY_SHADOW_ALL = 1,
-    RAY_LOCAL = 2,
-    RAY_SSS = 3,
-    RAY_VOLUME_ALL = 4,
-  } RayType;
-
-  KernelGlobals kg;
-  RayType type;
-
-  /* For avoiding self intersections */
-  const Ray *ray;
-
-  /* for shadow rays */
-  Intersection *isect_s;
-  uint max_hits;
-  uint num_hits;
-  uint num_recorded_hits;
-  float throughput;
-  float max_t;
-  bool opaque_hit;
-
-  /* for SSS Rays: */
-  LocalIntersection *local_isect;
-  int local_object_id;
-  uint *lcg_state;
-
-  CCLIntersectContext(KernelGlobals kg_, RayType type_)
-  {
-    kg = kg_;
-    type = type_;
-    ray = NULL;
-    max_hits = 1;
-    num_hits = 0;
-    num_recorded_hits = 0;
-    throughput = 1.0f;
-    max_t = FLT_MAX;
-    opaque_hit = false;
-    isect_s = NULL;
-    local_isect = NULL;
-    local_object_id = -1;
-    lcg_state = NULL;
-  }
-};
-
-class IntersectContext {
- public:
-  IntersectContext(CCLIntersectContext *ctx)
-  {
-    rtcInitIntersectContext(&context);
-    userRayExt = ctx;
-  }
-  RTCIntersectContext context;
-  CCLIntersectContext *userRayExt;
-};
-
-ccl_device_inline void kernel_embree_setup_ray(const Ray &ray,
-                                               RTCRay &rtc_ray,
-                                               const uint visibility)
-{
-  rtc_ray.org_x = ray.P.x;
-  rtc_ray.org_y = ray.P.y;
-  rtc_ray.org_z = ray.P.z;
-  rtc_ray.dir_x = ray.D.x;
-  rtc_ray.dir_y = ray.D.y;
-  rtc_ray.dir_z = ray.D.z;
-  rtc_ray.tnear = 0.0f;
-  rtc_ray.tfar = ray.t;
-  rtc_ray.time = ray.time;
-  rtc_ray.mask = visibility;
-}
-
-ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
-                                                  RTCRayHit &rayhit,
-                                                  const uint visibility)
-{
-  kernel_embree_setup_ray(ray, rayhit.ray, visibility);
-  rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-  rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-}
-
-ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg,
-                                                          const RTCHit *hit,
-                                                          const Ray *ray)
-{
-  bool status = false;
-  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
-    const int oID = hit->instID[0] / 2;
-    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
-      RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-          rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0]));
-      const int pID = hit->primID +
-                      (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
-      status = intersection_skip_self_shadow(ray->self, oID, pID);
-    }
-  }
-  else {
-    const int oID = hit->geomID / 2;
-    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
-      const int pID = hit->primID + (intptr_t)rtcGetGeometryUserData(
-                                        rtcGetGeometry(kernel_data.bvh.scene, hit->geomID));
-      status = intersection_skip_self_shadow(ray->self, oID, pID);
-    }
-  }
-
-  return status;
-}
-
-ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg,
-                                                 const RTCRay *ray,
-                                                 const RTCHit *hit,
-                                                 Intersection *isect)
-{
-  isect->t = ray->tfar;
-  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
-    RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-        rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0]));
-    isect->prim = hit->primID +
-                  (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
-    isect->object = hit->instID[0] / 2;
-  }
-  else {
-    isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(
-                                    rtcGetGeometry(kernel_data.bvh.scene, hit->geomID));
-    isect->object = hit->geomID / 2;
-  }
-
-  const bool is_hair = hit->geomID & 1;
-  if (is_hair) {
-    const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, isect->prim);
-    isect->type = segment.type;
-    isect->prim = segment.prim;
-    isect->u = hit->u;
-    isect->v = hit->v;
-  }
-  else {
-    isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type;
-    isect->u = 1.0f - hit->v - hit->u;
-    isect->v = hit->u;
-  }
-}
-
-ccl_device_inline void kernel_embree_convert_sss_hit(
-    KernelGlobals kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int object)
-{
-  isect->u = 1.0f - hit->v - hit->u;
-  isect->v = hit->u;
-  isect->t = ray->tfar;
-  RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-      rtcGetGeometry(kernel_data.bvh.scene, object * 2));
-  isect->prim = hit->primID +
-                (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
-  isect->object = object;
-  isect->type = kernel_tex_fetch(__objects, object).primitive_type;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/local.h b/intern/cycles/kernel/bvh/local.h
index 0d05e09d75f..add61adc126 100644
--- a/intern/cycles/kernel/bvh/local.h
+++ b/intern/cycles/kernel/bvh/local.h
@@ -41,27 +41,27 @@ ccl_device_inline
 
   /* traversal variables in registers */
   int stack_ptr = 0;
-  int node_addr = kernel_tex_fetch(__object_node, local_object);
+  int node_addr = kernel_data_fetch(object_node, local_object);
 
   /* ray parameters in registers */
   float3 P = ray->P;
   float3 dir = bvh_clamp_direction(ray->D);
   float3 idir = bvh_inverse_direction(dir);
+  float tmin = ray->tmin;
   int object = OBJECT_NONE;
-  float isect_t = ray->t;
+  float isect_t = ray->tmax;
 
   if (local_isect != NULL) {
     local_isect->num_hits = 0;
   }
   kernel_assert((local_isect == NULL) == (max_hits == 0));
 
-  const int object_flag = kernel_tex_fetch(__object_flag, local_object);
+  const int object_flag = kernel_data_fetch(object_flag, local_object);
   if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
-    Transform ob_itfm;
-    isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
+    bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir);
 #else
-    isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
+    bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
 #endif
     object = local_object;
   }
@@ -73,7 +73,7 @@ ccl_device_inline
       while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
         int node_addr_child1, traverse_mask;
         float dist[2];
-        float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+        float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
@@ -81,6 +81,7 @@ ccl_device_inline
                                        dir,
 #endif
                                        idir,
+                                       tmin,
                                        isect_t,
                                        node_addr,
                                        PATH_RAY_ALL_VISIBILITY,
@@ -117,7 +118,7 @@ ccl_device_inline
 
       /* if node is leaf, fetch triangle list */
       if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
+        float4 leaf = kernel_data_fetch(bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
         const int prim_addr2 = __float_as_int(leaf.y);
@@ -132,18 +133,18 @@ ccl_device_inline
           case PRIMITIVE_TRIANGLE: {
             /* intersect ray against primitive */
             for (; prim_addr < prim_addr2; prim_addr++) {
-              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+              kernel_assert(kernel_data_fetch(prim_type, prim_addr) == type);
 
               /* Only intersect with matching object, for instanced objects we
                * already know we are only intersecting the right object. */
               if (object == OBJECT_NONE) {
-                if (kernel_tex_fetch(__prim_object, prim_addr) != local_object) {
+                if (kernel_data_fetch(prim_object, prim_addr) != local_object) {
                   continue;
                 }
               }
 
               /* Skip self intersection. */
-              const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+              const int prim = kernel_data_fetch(prim_index, prim_addr);
               if (intersection_skip_self_local(ray->self, prim)) {
                 continue;
               }
@@ -155,6 +156,7 @@ ccl_device_inline
                                            local_object,
                                            prim,
                                            prim_addr,
+                                           tmin,
                                            isect_t,
                                            lcg_state,
                                            max_hits)) {
@@ -167,18 +169,18 @@ ccl_device_inline
           case PRIMITIVE_MOTION_TRIANGLE: {
             /* intersect ray against primitive */
             for (; prim_addr < prim_addr2; prim_addr++) {
-              kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+              kernel_assert(kernel_data_fetch(prim_type, prim_addr) == type);
 
               /* Only intersect with matching object, for instanced objects we
                * already know we are only intersecting the right object. */
               if (object == OBJECT_NONE) {
-                if (kernel_tex_fetch(__prim_object, prim_addr) != local_object) {
+                if (kernel_data_fetch(prim_object, prim_addr) != local_object) {
                   continue;
                 }
               }
 
               /* Skip self intersection. */
-              const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+              const int prim = kernel_data_fetch(prim_index, prim_addr);
               if (intersection_skip_self_local(ray->self, prim)) {
                 continue;
               }
@@ -191,6 +193,7 @@ ccl_device_inline
                                                   local_object,
                                                   prim,
                                                   prim_addr,
+                                                  tmin,
                                                   isect_t,
                                                   lcg_state,
                                                   max_hits)) {
diff --git a/intern/cycles/kernel/bvh/metal.h b/intern/cycles/kernel/bvh/metal.h
deleted file mode 100644
index 04289e259a7..00000000000
--- a/intern/cycles/kernel/bvh/metal.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2021-2022 Blender Foundation */
-
-struct MetalRTIntersectionPayload {
-  RaySelfPrimitives self;
-  uint visibility;
-  float u, v;
-  int prim;
-  int type;
-#if defined(__METALRT_MOTION__)
-  float time;
-#endif
-};
-
-struct MetalRTIntersectionLocalPayload {
-  RaySelfPrimitives self;
-  uint local_object;
-  uint lcg_state;
-  short max_hits;
-  bool has_lcg_state;
-  bool result;
-  LocalIntersection local_isect;
-};
-
-struct MetalRTIntersectionShadowPayload {
-  RaySelfPrimitives self;
-  uint visibility;
-#if defined(__METALRT_MOTION__)
-  float time;
-#endif
-  int state;
-  float throughput;
-  short max_hits;
-  short num_hits;
-  short num_recorded_hits;
-  bool result;
-};
diff --git a/intern/cycles/kernel/bvh/nodes.h b/intern/cycles/kernel/bvh/nodes.h
index fd475dcd5e9..e02841fad16 100644
--- a/intern/cycles/kernel/bvh/nodes.h
+++ b/intern/cycles/kernel/bvh/nodes.h
@@ -9,16 +9,17 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals kg
 {
   Transform space;
   const int child_addr = node_addr + child * 3;
-  space.x = kernel_tex_fetch(__bvh_nodes, child_addr + 1);
-  space.y = kernel_tex_fetch(__bvh_nodes, child_addr + 2);
-  space.z = kernel_tex_fetch(__bvh_nodes, child_addr + 3);
+  space.x = kernel_data_fetch(bvh_nodes, child_addr + 1);
+  space.y = kernel_data_fetch(bvh_nodes, child_addr + 2);
+  space.z = kernel_data_fetch(bvh_nodes, child_addr + 3);
   return space;
 }
 
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
                                                       const float3 P,
                                                       const float3 idir,
-                                                      const float t,
+                                                      const float tmin,
+                                                      const float tmax,
                                                       const int node_addr,
                                                       const uint visibility,
                                                       float dist[2])
@@ -26,11 +27,11 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
 
   /* fetch node data */
 #ifdef __VISIBILITY_FLAG__
-  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+  float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 #endif
-  float4 node0 = kernel_tex_fetch(__bvh_nodes, node_addr + 1);
-  float4 node1 = kernel_tex_fetch(__bvh_nodes, node_addr + 2);
-  float4 node2 = kernel_tex_fetch(__bvh_nodes, node_addr + 3);
+  float4 node0 = kernel_data_fetch(bvh_nodes, node_addr + 1);
+  float4 node1 = kernel_data_fetch(bvh_nodes, node_addr + 2);
+  float4 node2 = kernel_data_fetch(bvh_nodes, node_addr + 3);
 
   /* intersect ray against child nodes */
   float c0lox = (node0.x - P.x) * idir.x;
@@ -39,8 +40,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
   float c0hiy = (node1.z - P.y) * idir.y;
   float c0loz = (node2.x - P.z) * idir.z;
   float c0hiz = (node2.z - P.z) * idir.z;
-  float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
-  float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
+  float c0min = max4(tmin, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+  float c0max = min4(tmax, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
 
   float c1lox = (node0.y - P.x) * idir.x;
   float c1hix = (node0.w - P.x) * idir.x;
@@ -48,8 +49,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
   float c1hiy = (node1.w - P.y) * idir.y;
   float c1loz = (node2.y - P.z) * idir.z;
   float c1hiz = (node2.w - P.z) * idir.z;
-  float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
-  float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
+  float c1min = max4(tmin, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+  float c1max = min4(tmax, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
 
   dist[0] = c0min;
   dist[1] = c1min;
@@ -66,7 +67,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
 ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals kg,
                                                                const float3 P,
                                                                const float3 dir,
-                                                               const float t,
+                                                               const float tmin,
+                                                               const float tmax,
                                                                int node_addr,
                                                                int child,
                                                                float dist[2])
@@ -83,8 +85,8 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals kg,
   const float far_x = max(lower_xyz.x, upper_xyz.x);
   const float far_y = max(lower_xyz.y, upper_xyz.y);
   const float far_z = max(lower_xyz.z, upper_xyz.z);
-  const float tnear = max4(0.0f, near_x, near_y, near_z);
-  const float tfar = min4(t, far_x, far_y, far_z);
+  const float tnear = max4(tmin, near_x, near_y, near_z);
+  const float tfar = min4(tmax, far_x, far_y, far_z);
   *dist = tnear;
   return tnear <= tfar;
 }
@@ -93,16 +95,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg,
                                                         const float3 P,
                                                         const float3 dir,
                                                         const float3 idir,
-                                                        const float t,
+                                                        const float tmin,
+                                                        const float tmax,
                                                         const int node_addr,
                                                         const uint visibility,
                                                         float dist[2])
 {
   int mask = 0;
 #ifdef __VISIBILITY_FLAG__
-  float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+  float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 #endif
-  if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
+  if (bvh_unaligned_node_intersect_child(kg, P, dir, tmin, tmax, node_addr, 0, &dist[0])) {
 #ifdef __VISIBILITY_FLAG__
     if ((__float_as_uint(cnodes.x) & visibility))
 #endif
@@ -110,7 +113,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg,
       mask |= 1;
     }
   }
-  if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
+  if (bvh_unaligned_node_intersect_child(kg, P, dir, tmin, tmax, node_addr, 1, &dist[1])) {
 #ifdef __VISIBILITY_FLAG__
     if ((__float_as_uint(cnodes.y) & visibility))
 #endif
@@ -125,16 +128,17 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals kg,
                                               const float3 P,
                                               const float3 dir,
                                               const float3 idir,
-                                              const float t,
+                                              const float tmin,
+                                              const float tmax,
                                               const int node_addr,
                                               const uint visibility,
                                               float dist[2])
 {
-  float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
+  float4 node = kernel_data_fetch(bvh_nodes, node_addr);
   if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-    return bvh_unaligned_node_intersect(kg, P, dir, idir, t, node_addr, visibility, dist);
+    return bvh_unaligned_node_intersect(kg, P, dir, idir, tmin, tmax, node_addr, visibility, dist);
   }
   else {
-    return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist);
+    return bvh_aligned_node_intersect(kg, P, idir, tmin, tmax, node_addr, visibility, dist);
   }
 }
diff --git a/intern/cycles/kernel/bvh/shadow_all.h b/intern/cycles/kernel/bvh/shadow_all.h
index 2f58929c1e5..2ffe1496c72 100644
--- a/intern/cycles/kernel/bvh/shadow_all.h
+++ b/intern/cycles/kernel/bvh/shadow_all.h
@@ -49,26 +49,15 @@ ccl_device_inline
   float3 P = ray->P;
   float3 dir = bvh_clamp_direction(ray->D);
   float3 idir = bvh_inverse_direction(dir);
+  float tmin = ray->tmin;
   int object = OBJECT_NONE;
   uint num_hits = 0;
 
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
   /* Max distance in world space. May be dynamically reduced when max number of
    * recorded hits is exceeded and we no longer need to find hits beyond the max
    * distance found. */
-  float t_max_world = ray->t;
-
-  /* Current maximum distance to the intersection.
-   * Is calculated as a ray length, transformed to an object space when entering
-   * instance node. */
-  float t_max_current = ray->t;
-
-  /* Conversion from world to local space for the current instance if any, 1.0
-   * otherwise. */
-  float t_world_to_instance = 1.0f;
+  const float tmax = ray->tmax;
+  float tmax_hits = tmax;
 
   *r_num_recorded_hits = 0;
   *r_throughput = 1.0f;
@@ -80,7 +69,7 @@ ccl_device_inline
       while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
         int node_addr_child1, traverse_mask;
         float dist[2];
-        float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+        float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
@@ -88,7 +77,8 @@ ccl_device_inline
                                        dir,
 #endif
                                        idir,
-                                       t_max_current,
+                                       tmin,
+                                       tmax,
                                        node_addr,
                                        visibility,
                                        dist);
@@ -124,7 +114,7 @@ ccl_device_inline
 
       /* if node is leaf, fetch triangle list */
       if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
+        float4 leaf = kernel_data_fetch(bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
         if (prim_addr >= 0) {
@@ -137,7 +127,7 @@ ccl_device_inline
 
           /* primitive intersection */
           for (; prim_addr < prim_addr2; prim_addr++) {
-            kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) ==
+            kernel_assert((kernel_data_fetch(prim_type, prim_addr) & PRIMITIVE_ALL) ==
                           (type & PRIMITIVE_ALL));
             bool hit;
 
@@ -147,9 +137,9 @@ ccl_device_inline
             Intersection isect ccl_optional_struct_init;
 
             const int prim_object = (object == OBJECT_NONE) ?
-                                        kernel_tex_fetch(__prim_object, prim_addr) :
+                                        kernel_data_fetch(prim_object, prim_addr) :
                                         object;
-            const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+            const int prim = kernel_data_fetch(prim_index, prim_addr);
             if (intersection_skip_self_shadow(ray->self, prim_object, prim)) {
               continue;
             }
@@ -157,7 +147,7 @@ ccl_device_inline
             switch (type & PRIMITIVE_ALL) {
               case PRIMITIVE_TRIANGLE: {
                 hit = triangle_intersect(
-                    kg, &isect, P, dir, t_max_current, visibility, prim_object, prim, prim_addr);
+                    kg, &isect, P, dir, tmin, tmax, visibility, prim_object, prim, prim_addr);
                 break;
               }
 #if BVH_FEATURE(BVH_MOTION)
@@ -166,7 +156,8 @@ ccl_device_inline
                                                 &isect,
                                                 P,
                                                 dir,
-                                                t_max_current,
+                                                tmin,
+                                                tmax,
                                                 ray->time,
                                                 visibility,
                                                 prim_object,
@@ -181,16 +172,16 @@ ccl_device_inline
               case PRIMITIVE_CURVE_RIBBON:
               case PRIMITIVE_MOTION_CURVE_RIBBON: {
                 if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
-                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
+                  const float2 prim_time = kernel_data_fetch(prim_time, prim_addr);
                   if (ray->time < prim_time.x || ray->time > prim_time.y) {
                     hit = false;
                     break;
                   }
                 }
 
-                const int curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+                const int curve_type = kernel_data_fetch(prim_type, prim_addr);
                 hit = curve_intersect(
-                    kg, &isect, P, dir, t_max_current, prim_object, prim, ray->time, curve_type);
+                    kg, &isect, P, dir, tmin, tmax, prim_object, prim, ray->time, curve_type);
 
                 break;
               }
@@ -199,16 +190,16 @@ ccl_device_inline
               case PRIMITIVE_POINT:
               case PRIMITIVE_MOTION_POINT: {
                 if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
-                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
+                  const float2 prim_time = kernel_data_fetch(prim_time, prim_addr);
                   if (ray->time < prim_time.x || ray->time > prim_time.y) {
                     hit = false;
                     break;
                   }
                 }
 
-                const int point_type = kernel_tex_fetch(__prim_type, prim_addr);
+                const int point_type = kernel_data_fetch(prim_type, prim_addr);
                 hit = point_intersect(
-                    kg, &isect, P, dir, t_max_current, prim_object, prim, ray->time, point_type);
+                    kg, &isect, P, dir, tmin, tmax, prim_object, prim, ray->time, point_type);
                 break;
               }
 #endif /* BVH_FEATURE(BVH_POINTCLOUD) */
@@ -220,9 +211,6 @@ ccl_device_inline
 
             /* shadow ray early termination */
             if (hit) {
-              /* Convert intersection distance to world space. */
-              isect.t /= t_world_to_instance;
-
               /* detect if this surface has a shader with transparent shadows */
               /* todo: optimize so primitive visibility flag indicates if
                * the primitive has a transparent shadow shader? */
@@ -254,7 +242,7 @@ ccl_device_inline
               if (record_intersection) {
                 /* Test if we need to record this transparent intersection. */
                 const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
-                if (*r_num_recorded_hits < max_record_hits || isect.t < t_max_world) {
+                if (*r_num_recorded_hits < max_record_hits || isect.t < tmax_hits) {
                   /* If maximum number of hits was reached, replace the intersection with the
                    * highest distance. We want to find the N closest intersections. */
                   const uint num_recorded_hits = min(*r_num_recorded_hits, max_record_hits);
@@ -276,7 +264,7 @@ ccl_device_inline
                     }
 
                     /* Limit the ray distance and stop counting hits beyond this. */
-                    t_max_world = max(isect.t, max_t);
+                    tmax_hits = max(isect.t, max_t);
                   }
 
                   integrator_state_write_shadow_isect(state, &isect, isect_index);
@@ -291,23 +279,19 @@ ccl_device_inline
         }
         else {
           /* instance push */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
+          object = kernel_data_fetch(prim_object, -prim_addr - 1);
 
 #if BVH_FEATURE(BVH_MOTION)
-          t_world_to_instance = bvh_instance_motion_push(
-              kg, object, ray, &P, &dir, &idir, &ob_itfm);
+          bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-          t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+          bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif
 
-          /* Convert intersection to object space. */
-          t_max_current *= t_world_to_instance;
-
           ++stack_ptr;
           kernel_assert(stack_ptr < BVH_STACK_SIZE);
           traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
 
-          node_addr = kernel_tex_fetch(__object_node, object);
+          node_addr = kernel_data_fetch(object_node, object);
         }
       }
     } while (node_addr != ENTRYPOINT_SENTINEL);
@@ -316,17 +300,9 @@ ccl_device_inline
       kernel_assert(object != OBJECT_NONE);
 
       /* Instance pop. */
-#if BVH_FEATURE(BVH_MOTION)
-      bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
-      bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
-
-      /* Restore world space ray length. */
-      t_max_current = ray->t;
+      bvh_instance_pop(ray, &P, &dir, &idir);
 
       object = OBJECT_NONE;
-      t_world_to_instance = 1.0f;
       node_addr = traversal_stack[stack_ptr];
       --stack_ptr;
     }
diff --git a/intern/cycles/kernel/bvh/traversal.h b/intern/cycles/kernel/bvh/traversal.h
index 1181d4bfdee..f3744aca5c0 100644
--- a/intern/cycles/kernel/bvh/traversal.h
+++ b/intern/cycles/kernel/bvh/traversal.h
@@ -43,13 +43,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
   float3 P = ray->P;
   float3 dir = bvh_clamp_direction(ray->D);
   float3 idir = bvh_inverse_direction(dir);
+  const float tmin = ray->tmin;
   int object = OBJECT_NONE;
 
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  isect->t = ray->t;
+  isect->t = ray->tmax;
   isect->u = 0.0f;
   isect->v = 0.0f;
   isect->prim = PRIM_NONE;
@@ -62,7 +59,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
       while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
         int node_addr_child1, traverse_mask;
         float dist[2];
-        float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+        float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 
         {
           traverse_mask = NODE_INTERSECT(kg,
@@ -71,6 +68,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                                          dir,
 #endif
                                          idir,
+                                         tmin,
                                          isect->t,
                                          node_addr,
                                          visibility,
@@ -108,7 +106,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
 
       /* if node is leaf, fetch triangle list */
       if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
+        float4 leaf = kernel_data_fetch(bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
         if (prim_addr >= 0) {
@@ -121,20 +119,28 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
 
           /* primitive intersection */
           for (; prim_addr < prim_addr2; prim_addr++) {
-            kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+            kernel_assert(kernel_data_fetch(prim_type, prim_addr) == type);
 
             const int prim_object = (object == OBJECT_NONE) ?
-                                        kernel_tex_fetch(__prim_object, prim_addr) :
+                                        kernel_data_fetch(prim_object, prim_addr) :
                                         object;
-            const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+            const int prim = kernel_data_fetch(prim_index, prim_addr);
             if (intersection_skip_self_shadow(ray->self, prim_object, prim)) {
               continue;
             }
 
             switch (type & PRIMITIVE_ALL) {
               case PRIMITIVE_TRIANGLE: {
-                if (triangle_intersect(
-                        kg, isect, P, dir, isect->t, visibility, prim_object, prim, prim_addr)) {
+                if (triangle_intersect(kg,
+                                       isect,
+                                       P,
+                                       dir,
+                                       tmin,
+                                       isect->t,
+                                       visibility,
+                                       prim_object,
+                                       prim,
+                                       prim_addr)) {
                   /* shadow ray early termination */
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
                     return true;
@@ -147,6 +153,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                                               isect,
                                               P,
                                               dir,
+                                              tmin,
                                               isect->t,
                                               ray->time,
                                               visibility,
@@ -166,15 +173,15 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
               case PRIMITIVE_CURVE_RIBBON:
               case PRIMITIVE_MOTION_CURVE_RIBBON: {
                 if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
-                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
+                  const float2 prim_time = kernel_data_fetch(prim_time, prim_addr);
                   if (ray->time < prim_time.x || ray->time > prim_time.y) {
                     break;
                   }
                 }
 
-                const int curve_type = kernel_tex_fetch(__prim_type, prim_addr);
+                const int curve_type = kernel_data_fetch(prim_type, prim_addr);
                 const bool hit = curve_intersect(
-                    kg, isect, P, dir, isect->t, prim_object, prim, ray->time, curve_type);
+                    kg, isect, P, dir, tmin, isect->t, prim_object, prim, ray->time, curve_type);
                 if (hit) {
                   /* shadow ray early termination */
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -187,15 +194,15 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
               case PRIMITIVE_POINT:
               case PRIMITIVE_MOTION_POINT: {
                 if ((type & PRIMITIVE_MOTION) && kernel_data.bvh.use_bvh_steps) {
-                  const float2 prim_time = kernel_tex_fetch(__prim_time, prim_addr);
+                  const float2 prim_time = kernel_data_fetch(prim_time, prim_addr);
                   if (ray->time < prim_time.x || ray->time > prim_time.y) {
                     break;
                   }
                 }
 
-                const int point_type = kernel_tex_fetch(__prim_type, prim_addr);
+                const int point_type = kernel_data_fetch(prim_type, prim_addr);
                 const bool hit = point_intersect(
-                    kg, isect, P, dir, isect->t, prim_object, prim, ray->time, point_type);
+                    kg, isect, P, dir, tmin, isect->t, prim_object, prim, ray->time, point_type);
                 if (hit) {
                   /* shadow ray early termination */
                   if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -209,19 +216,19 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
         }
         else {
           /* instance push */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
+          object = kernel_data_fetch(prim_object, -prim_addr - 1);
 
 #if BVH_FEATURE(BVH_MOTION)
-          isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
+          bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-          isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+          bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif
 
           ++stack_ptr;
           kernel_assert(stack_ptr < BVH_STACK_SIZE);
           traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
 
-          node_addr = kernel_tex_fetch(__object_node, object);
+          node_addr = kernel_data_fetch(object_node, object);
         }
       }
     } while (node_addr != ENTRYPOINT_SENTINEL);
@@ -230,11 +237,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
       kernel_assert(object != OBJECT_NONE);
 
       /* instance pop */
-#if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#endif
+      bvh_instance_pop(ray, &P, &dir, &idir);
 
       object = OBJECT_NONE;
       node_addr = traversal_stack[stack_ptr];
diff --git a/intern/cycles/kernel/bvh/util.h b/intern/cycles/kernel/bvh/util.h
index d53198f97a3..a57703a8b8c 100644
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -5,7 +5,59 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if defined(__KERNEL_CPU__)
+ccl_device_inline bool intersection_ray_valid(ccl_private const Ray *ray)
+{
+  /* NOTE: Because of some vectorized code, a non-finite origin point might
+   * cause lots of false-positive intersections, which would overflow the
+   * traversal stack. Bail out early in such cases to avoid crashes. From
+   * production scenes so far it seems sufficient to test only the first
+   * component of the origin and of the direction. */
+  if (!isfinite_safe(ray->P.x) || !isfinite_safe(ray->D.x)) {
+    return false;
+  }
+  /* Scene intersection may also be called with empty rays for conditional
+   * trace calls that evaluate to false, so filter those out. */
+  return len_squared(ray->D) != 0.0f;
+}
+
+/* Offset intersection distance by the smallest possible amount, to skip
+ * intersections at this distance. This works in cases where the ray start
+ * position is unchanged and only tmin is updated, since for self
+ * intersection we'll be comparing against the exact same distances. */
+ccl_device_forceinline float intersection_t_offset(const float t)
+{
+  /* Simplified stand-in for `nextafterf(t, FLT_MAX)` that only has to handle
+   * non-negative and finite t. */
+  kernel_assert(t >= 0.0f && isfinite_safe(t));
+  const uint32_t next_bits = (t != 0.0f) ? __float_as_uint(t) + 1 : 1;
+  return __uint_as_float(next_bits);
+}
+
+/* Ray offset to avoid self intersection.
+ *
+ * This function can be used to compute a modified ray start position for rays
+ * leaving from a surface. This is from:
+ * "A Fast and Robust Method for Avoiding Self-Intersection"
+ * Ray Tracing Gems, chapter 6.
+ */
+ccl_device_inline float3 ray_offset(const float3 P, const float3 Ng)
+{
+  const float origin = 1.0f / 32.0f;
+  const float float_scale = 1.0f / 65536.0f;
+  const float int_scale = 256.0f;
+  /* Integer offsets from Ng, applied to the bit pattern of P (negated for negative P). */
+  const int3 offset_i = make_int3(
+      (int)(int_scale * Ng.x), (int)(int_scale * Ng.y), (int)(int_scale * Ng.z));
+  const float3 P_shifted = make_float3(
+      __int_as_float(__float_as_int(P.x) + ((P.x < 0) ? -offset_i.x : offset_i.x)),
+      __int_as_float(__float_as_int(P.y) + ((P.y < 0) ? -offset_i.y : offset_i.y)),
+      __int_as_float(__float_as_int(P.z) + ((P.z < 0) ? -offset_i.z : offset_i.z)));
+  return make_float3(fabsf(P.x) < origin ? P.x + float_scale * Ng.x : P_shifted.x,
+                     fabsf(P.y) < origin ? P.y + float_scale * Ng.y : P_shifted.y,
+                     fabsf(P.z) < origin ? P.z + float_scale * Ng.z : P_shifted.z);
+}
+
+#ifndef __KERNEL_GPU__
 ccl_device int intersections_compare(const void *a, const void *b)
 {
   const Intersection *isect_a = (const Intersection *)a;
@@ -53,20 +105,20 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals kg,
   int shader = 0;
 
   if (type & PRIMITIVE_TRIANGLE) {
-    shader = kernel_tex_fetch(__tri_shader, prim);
+    shader = kernel_data_fetch(tri_shader, prim);
   }
 #ifdef __POINTCLOUD__
   else if (type & PRIMITIVE_POINT) {
-    shader = kernel_tex_fetch(__points_shader, prim);
+    shader = kernel_data_fetch(points_shader, prim);
   }
 #endif
 #ifdef __HAIR__
   else if (type & PRIMITIVE_CURVE) {
-    shader = kernel_tex_fetch(__curves, prim).shader_id;
+    shader = kernel_data_fetch(curves, prim).shader_id;
   }
 #endif
 
-  return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+  return kernel_data_fetch(shaders, (shader & SHADER_MASK)).flags;
 }
 
 ccl_device_forceinline int intersection_get_shader_from_isect_prim(KernelGlobals kg,
@@ -76,16 +128,16 @@ ccl_device_forceinline int intersection_get_shader_from_isect_prim(KernelGlobals
   int shader = 0;
 
   if (isect_type & PRIMITIVE_TRIANGLE) {
-    shader = kernel_tex_fetch(__tri_shader, prim);
+    shader = kernel_data_fetch(tri_shader, prim);
   }
 #ifdef __POINTCLOUD__
   else if (isect_type & PRIMITIVE_POINT) {
-    shader = kernel_tex_fetch(__points_shader, prim);
+    shader = kernel_data_fetch(points_shader, prim);
   }
 #endif
 #ifdef __HAIR__
   else if (isect_type & PRIMITIVE_CURVE) {
-    shader = kernel_tex_fetch(__curves, prim).shader_id;
+    shader = kernel_data_fetch(curves, prim).shader_id;
   }
 #endif
 
@@ -101,7 +153,7 @@ ccl_device_forceinline int intersection_get_shader(
 ccl_device_forceinline int intersection_get_object_flags(
     KernelGlobals kg, ccl_private const Intersection *ccl_restrict isect)
 {
-  return kernel_tex_fetch(__object_flag, isect->object);
+  return kernel_data_fetch(object_flag, isect->object);
 }
 
 /* TODO: find a better (faster) solution for this. Maybe store offset per object for
@@ -110,8 +162,8 @@ ccl_device_inline int intersection_find_attribute(KernelGlobals kg,
                                                   const int object,
                                                   const uint id)
 {
-  uint attr_offset = kernel_tex_fetch(__objects, object).attribute_map_offset;
-  AttributeMap attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+  uint attr_offset = kernel_data_fetch(objects, object).attribute_map_offset;
+  AttributeMap attr_map = kernel_data_fetch(attributes_map, attr_offset);
 
   while (attr_map.id != id) {
     if (UNLIKELY(attr_map.id == ATTR_STD_NONE)) {
@@ -126,7 +178,7 @@ ccl_device_inline int intersection_find_attribute(KernelGlobals kg,
     else {
       attr_offset += ATTR_PRIM_TYPES;
     }
-    attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+    attr_map = kernel_data_fetch(attributes_map, attr_offset);
   }
 
   /* return result */
@@ -151,12 +203,12 @@ ccl_device_inline float intersection_curve_shadow_transparency(KernelGlobals kg,
   }
 
   /* Interpolate transparency between curve keys. */
-  const KernelCurve kcurve = kernel_tex_fetch(__curves, prim);
+  const KernelCurve kcurve = kernel_data_fetch(curves, prim);
   const int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(kcurve.type);
   const int k1 = k0 + 1;
 
-  const float f0 = kernel_tex_fetch(__attributes_float, offset + k0);
-  const float f1 = kernel_tex_fetch(__attributes_float, offset + k1);
+  const float f0 = kernel_data_fetch(attributes_float, offset + k0);
+  const float f1 = kernel_data_fetch(attributes_float, offset + k1);
 
   return (1.0f - u) * f0 + u * f1;
 }
diff --git a/intern/cycles/kernel/bvh/volume.h b/intern/cycles/kernel/bvh/volume.h
index d711b3abbf4..664c692dd3d 100644
--- a/intern/cycles/kernel/bvh/volume.h
+++ b/intern/cycles/kernel/bvh/volume.h
@@ -46,13 +46,10 @@ ccl_device_inline
   float3 P = ray->P;
   float3 dir = bvh_clamp_direction(ray->D);
   float3 idir = bvh_inverse_direction(dir);
+  const float tmin = ray->tmin;
   int object = OBJECT_NONE;
 
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
-  isect->t = ray->t;
+  isect->t = ray->tmax;
   isect->u = 0.0f;
   isect->v = 0.0f;
   isect->prim = PRIM_NONE;
@@ -65,7 +62,7 @@ ccl_device_inline
       while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
         int node_addr_child1, traverse_mask;
         float dist[2];
-        float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+        float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
@@ -73,6 +70,7 @@ ccl_device_inline
                                        dir,
 #endif
                                        idir,
+                                       tmin,
                                        isect->t,
                                        node_addr,
                                        visibility,
@@ -109,7 +107,7 @@ ccl_device_inline
 
       /* if node is leaf, fetch triangle list */
       if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
+        float4 leaf = kernel_data_fetch(bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
         if (prim_addr >= 0) {
@@ -125,22 +123,22 @@ ccl_device_inline
             case PRIMITIVE_TRIANGLE: {
               /* intersect ray against primitive */
               for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                kernel_assert(kernel_data_fetch(prim_type, prim_addr) == type);
                 /* only primitives from volume object */
                 const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
+                                            kernel_data_fetch(prim_object, prim_addr) :
                                             object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                const int prim = kernel_data_fetch(prim_index, prim_addr);
                 if (intersection_skip_self(ray->self, prim_object, prim)) {
                   continue;
                 }
 
-                int object_flag = kernel_tex_fetch(__object_flag, prim_object);
+                int object_flag = kernel_data_fetch(object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                   continue;
                 }
                 triangle_intersect(
-                    kg, isect, P, dir, isect->t, visibility, prim_object, prim, prim_addr);
+                    kg, isect, P, dir, tmin, isect->t, visibility, prim_object, prim, prim_addr);
               }
               break;
             }
@@ -148,16 +146,16 @@ ccl_device_inline
             case PRIMITIVE_MOTION_TRIANGLE: {
               /* intersect ray against primitive */
               for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                kernel_assert(kernel_data_fetch(prim_type, prim_addr) == type);
                 /* only primitives from volume object */
                 const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
+                                            kernel_data_fetch(prim_object, prim_addr) :
                                             object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                const int prim = kernel_data_fetch(prim_index, prim_addr);
                 if (intersection_skip_self(ray->self, prim_object, prim)) {
                   continue;
                 }
-                int object_flag = kernel_tex_fetch(__object_flag, prim_object);
+                int object_flag = kernel_data_fetch(object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                   continue;
                 }
@@ -165,6 +163,7 @@ ccl_device_inline
                                           isect,
                                           P,
                                           dir,
+                                          tmin,
                                           isect->t,
                                           ray->time,
                                           visibility,
@@ -182,20 +181,20 @@ ccl_device_inline
         }
         else {
           /* instance push */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-          int object_flag = kernel_tex_fetch(__object_flag, object);
+          object = kernel_data_fetch(prim_object, -prim_addr - 1);
+          int object_flag = kernel_data_fetch(object_flag, object);
           if (object_flag & SD_OBJECT_HAS_VOLUME) {
 #if BVH_FEATURE(BVH_MOTION)
-            isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
+            bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-            isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+            bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif
 
             ++stack_ptr;
             kernel_assert(stack_ptr < BVH_STACK_SIZE);
             traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
 
-            node_addr = kernel_tex_fetch(__object_node, object);
+            node_addr = kernel_data_fetch(object_node, object);
           }
           else {
             /* pop */
@@ -211,11 +210,7 @@ ccl_device_inline
       kernel_assert(object != OBJECT_NONE);
 
       /* instance pop */
-#if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#endif
+      bvh_instance_pop(ray, &P, &dir, &idir);
 
       object = OBJECT_NONE;
       node_addr = traversal_stack[stack_ptr];
diff --git a/intern/cycles/kernel/bvh/volume_all.h b/intern/cycles/kernel/bvh/volume_all.h
index a969bae14a1..721eb555d4d 100644
--- a/intern/cycles/kernel/bvh/volume_all.h
+++ b/intern/cycles/kernel/bvh/volume_all.h
@@ -44,21 +44,17 @@ ccl_device_inline
   int node_addr = kernel_data.bvh.root;
 
   /* ray parameters in registers */
-  const float tmax = ray->t;
   float3 P = ray->P;
   float3 dir = bvh_clamp_direction(ray->D);
   float3 idir = bvh_inverse_direction(dir);
+  const float tmin = ray->tmin;
   int object = OBJECT_NONE;
-  float isect_t = tmax;
-
-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
+  float isect_t = ray->tmax;
 
   int num_hits_in_instance = 0;
 
   uint num_hits = 0;
-  isect_array->t = tmax;
+  isect_array->t = ray->tmax;
 
   /* traversal loop */
   do {
@@ -67,7 +63,7 @@ ccl_device_inline
       while (node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
         int node_addr_child1, traverse_mask;
         float dist[2];
-        float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr + 0);
+        float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 
         traverse_mask = NODE_INTERSECT(kg,
                                        P,
@@ -75,6 +71,7 @@ ccl_device_inline
                                        dir,
 #endif
                                        idir,
+                                       tmin,
                                        isect_t,
                                        node_addr,
                                        visibility,
@@ -111,7 +108,7 @@ ccl_device_inline
 
       /* if node is leaf, fetch triangle list */
       if (node_addr < 0) {
-        float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr - 1));
+        float4 leaf = kernel_data_fetch(bvh_leaf_nodes, (-node_addr - 1));
         int prim_addr = __float_as_int(leaf.x);
 
         if (prim_addr >= 0) {
@@ -128,21 +125,29 @@ ccl_device_inline
             case PRIMITIVE_TRIANGLE: {
               /* intersect ray against primitive */
               for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                kernel_assert(kernel_data_fetch(prim_type, prim_addr) == type);
                 /* only primitives from volume object */
                 const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
+                                            kernel_data_fetch(prim_object, prim_addr) :
                                             object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                const int prim = kernel_data_fetch(prim_index, prim_addr);
                 if (intersection_skip_self(ray->self, prim_object, prim)) {
                   continue;
                 }
-                int object_flag = kernel_tex_fetch(__object_flag, prim_object);
+                int object_flag = kernel_data_fetch(object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                   continue;
                 }
-                hit = triangle_intersect(
-                    kg, isect_array, P, dir, isect_t, visibility, prim_object, prim, prim_addr);
+                hit = triangle_intersect(kg,
+                                         isect_array,
+                                         P,
+                                         dir,
+                                         tmin,
+                                         isect_t,
+                                         visibility,
+                                         prim_object,
+                                         prim,
+                                         prim_addr);
                 if (hit) {
                   /* Move on to next entry in intersections array. */
                   isect_array++;
@@ -150,18 +155,6 @@ ccl_device_inline
                   num_hits_in_instance++;
                   isect_array->t = isect_t;
                   if (num_hits == max_hits) {
-                    if (object != OBJECT_NONE) {
-#if BVH_FEATURE(BVH_MOTION)
-                      float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#else
-                      Transform itfm = object_fetch_transform(
-                          kg, object, OBJECT_INVERSE_TRANSFORM);
-                      float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#endif
-                      for (int i = 0; i < num_hits_in_instance; i++) {
-                        (isect_array - i - 1)->t *= t_fac;
-                      }
-                    }
                     return num_hits;
                   }
                 }
@@ -172,16 +165,16 @@ ccl_device_inline
             case PRIMITIVE_MOTION_TRIANGLE: {
               /* intersect ray against primitive */
               for (; prim_addr < prim_addr2; prim_addr++) {
-                kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
+                kernel_assert(kernel_data_fetch(prim_type, prim_addr) == type);
                 /* only primitives from volume object */
                 const int prim_object = (object == OBJECT_NONE) ?
-                                            kernel_tex_fetch(__prim_object, prim_addr) :
+                                            kernel_data_fetch(prim_object, prim_addr) :
                                             object;
-                const int prim = kernel_tex_fetch(__prim_index, prim_addr);
+                const int prim = kernel_data_fetch(prim_index, prim_addr);
                 if (intersection_skip_self(ray->self, prim_object, prim)) {
                   continue;
                 }
-                int object_flag = kernel_tex_fetch(__object_flag, prim_object);
+                int object_flag = kernel_data_fetch(object_flag, prim_object);
                 if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                   continue;
                 }
@@ -189,6 +182,7 @@ ccl_device_inline
                                                 isect_array,
                                                 P,
                                                 dir,
+                                                tmin,
                                                 isect_t,
                                                 ray->time,
                                                 visibility,
@@ -202,18 +196,6 @@ ccl_device_inline
                   num_hits_in_instance++;
                   isect_array->t = isect_t;
                   if (num_hits == max_hits) {
-                    if (object != OBJECT_NONE) {
-#  if BVH_FEATURE(BVH_MOTION)
-                      float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#  else
-                      Transform itfm = object_fetch_transform(
-                          kg, object, OBJECT_INVERSE_TRANSFORM);
-                      float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#  endif
-                      for (int i = 0; i < num_hits_in_instance; i++) {
-                        (isect_array - i - 1)->t *= t_fac;
-                      }
-                    }
                     return num_hits;
                   }
                 }
@@ -228,13 +210,13 @@ ccl_device_inline
         }
         else {
           /* instance push */
-          object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
-          int object_flag = kernel_tex_fetch(__object_flag, object);
+          object = kernel_data_fetch(prim_object, -prim_addr - 1);
+          int object_flag = kernel_data_fetch(object_flag, object);
           if (object_flag & SD_OBJECT_HAS_VOLUME) {
 #if BVH_FEATURE(BVH_MOTION)
-            isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
+            bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-            isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+            bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif
 
             num_hits_in_instance = 0;
@@ -244,7 +226,7 @@ ccl_device_inline
             kernel_assert(stack_ptr < BVH_STACK_SIZE);
             traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
 
-            node_addr = kernel_tex_fetch(__object_node, object);
+            node_addr = kernel_data_fetch(object_node, object);
           }
           else {
             /* pop */
@@ -260,28 +242,7 @@ ccl_device_inline
       kernel_assert(object != OBJECT_NONE);
 
       /* Instance pop. */
-      if (num_hits_in_instance) {
-        float t_fac;
-#if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#else
-        bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#endif
-        /* Scale isect->t to adjust for instancing. */
-        for (int i = 0; i < num_hits_in_instance; i++) {
-          (isect_array - i - 1)->t *= t_fac;
-        }
-      }
-      else {
-#if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
-        bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
-      }
-
-      isect_t = tmax;
-      isect_array->t = isect_t;
+      bvh_instance_pop(ray, &P, &dir, &idir);
 
       object = OBJECT_NONE;
       node_addr = traversal_stack[stack_ptr];
diff --git a/intern/cycles/kernel/camera/camera.h b/intern/cycles/kernel/camera/camera.h
index aad68e527ac..27876677281 100644
--- a/intern/cycles/kernel/camera/camera.h
+++ b/intern/cycles/kernel/camera/camera.h
@@ -45,7 +45,6 @@ ccl_device void camera_sample_perspective(KernelGlobals kg,
   float3 raster = make_float3(raster_x, raster_y, 0.0f);
   float3 Pcamera = transform_perspective(&rastertocamera, raster);
 
-#ifdef __CAMERA_MOTION__
   if (kernel_data.cam.have_perspective_motion) {
     /* TODO(sergey): Currently we interpolate projected coordinate which
      * gives nice looking result and which is simple, but is in fact a bit
@@ -63,7 +62,6 @@ ccl_device void camera_sample_perspective(KernelGlobals kg,
       Pcamera = interp(Pcamera, Pcamera_post, (ray->time - 0.5f) * 2.0f);
     }
   }
-#endif
 
   float3 P = zero_float3();
   float3 D = Pcamera;
@@ -87,14 +85,12 @@ ccl_device void camera_sample_perspective(KernelGlobals kg,
   /* transform ray from camera to world */
   Transform cameratoworld = kernel_data.cam.cameratoworld;
 
-#ifdef __CAMERA_MOTION__
   if (kernel_data.cam.num_motion_steps) {
     transform_motion_array_interpolate(&cameratoworld,
-                                       kernel_tex_array(__camera_motion),
+                                       kernel_data_array(camera_motion),
                                        kernel_data.cam.num_motion_steps,
                                        ray->time);
   }
-#endif
 
   P = transform_point(&cameratoworld, P);
   D = normalize(transform_direction(&cameratoworld, D));
@@ -159,16 +155,13 @@ ccl_device void camera_sample_perspective(KernelGlobals kg,
 #endif
   }
 
-#ifdef __CAMERA_CLIPPING__
   /* clipping */
   float z_inv = 1.0f / normalize(Pcamera).z;
   float nearclip = kernel_data.cam.nearclip * z_inv;
   ray->P += nearclip * ray->D;
   ray->dP += nearclip * ray->dD;
-  ray->t = kernel_data.cam.cliplength * z_inv;
-#else
-  ray->t = FLT_MAX;
-#endif
+  ray->tmin = 0.0f;
+  ray->tmax = kernel_data.cam.cliplength * z_inv;
 }
 
 /* Orthographic Camera */
@@ -207,14 +200,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals kg,
   /* transform ray from camera to world */
   Transform cameratoworld = kernel_data.cam.cameratoworld;
 
-#ifdef __CAMERA_MOTION__
   if (kernel_data.cam.num_motion_steps) {
     transform_motion_array_interpolate(&cameratoworld,
-                                       kernel_tex_array(__camera_motion),
+                                       kernel_data_array(camera_motion),
                                        kernel_data.cam.num_motion_steps,
                                        ray->time);
   }
-#endif
 
   ray->P = transform_point(&cameratoworld, P);
   ray->D = normalize(transform_direction(&cameratoworld, D));
@@ -229,20 +220,15 @@ ccl_device void camera_sample_orthographic(KernelGlobals kg,
   ray->dD = differential_zero_compact();
 #endif
 
-#ifdef __CAMERA_CLIPPING__
   /* clipping */
-  ray->t = kernel_data.cam.cliplength;
-#else
-  ray->t = FLT_MAX;
-#endif
+  ray->tmin = 0.0f;
+  ray->tmax = kernel_data.cam.cliplength;
 }
 
 /* Panorama Camera */
 
 ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
-#ifdef __CAMERA_MOTION__
                                               ccl_global const DecomposedTransform *cam_motion,
-#endif
                                               float raster_x,
                                               float raster_y,
                                               float lens_u,
@@ -258,7 +244,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
 
   /* indicates ray should not receive any light, outside of the lens */
   if (is_zero(D)) {
-    ray->t = 0.0f;
+    ray->tmax = 0.0f;
     return;
   }
 
@@ -286,12 +272,10 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
   /* transform ray from camera to world */
   Transform cameratoworld = cam->cameratoworld;
 
-#ifdef __CAMERA_MOTION__
   if (cam->num_motion_steps) {
     transform_motion_array_interpolate(
         &cameratoworld, cam_motion, cam->num_motion_steps, ray->time);
   }
-#endif
 
   /* Stereo transform */
   bool use_stereo = cam->interocular_offset != 0.0f;
@@ -344,15 +328,12 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
   ray->dP = differential_make_compact(dP);
 #endif
 
-#ifdef __CAMERA_CLIPPING__
   /* clipping */
   float nearclip = cam->nearclip;
   ray->P += nearclip * ray->D;
   ray->dP += nearclip * ray->dD;
-  ray->t = cam->cliplength;
-#else
-  ray->t = FLT_MAX;
-#endif
+  ray->tmin = 0.0f;
+  ray->tmax = cam->cliplength;
 }
 
 /* Common */
@@ -368,11 +349,10 @@ ccl_device_inline void camera_sample(KernelGlobals kg,
                                      ccl_private Ray *ray)
 {
   /* pixel filter */
-  int filter_table_offset = kernel_data.film.filter_table_offset;
+  int filter_table_offset = kernel_data.tables.filter_table_offset;
   float raster_x = x + lookup_table_read(kg, filter_u, filter_table_offset, FILTER_TABLE_SIZE);
   float raster_y = y + lookup_table_read(kg, filter_v, filter_table_offset, FILTER_TABLE_SIZE);
 
-#ifdef __CAMERA_MOTION__
   /* motion blur */
   if (kernel_data.cam.shuttertime == -1.0f) {
     ray->time = 0.5f;
@@ -410,7 +390,6 @@ ccl_device_inline void camera_sample(KernelGlobals kg,
       }
     }
   }
-#endif
 
   /* sample */
   if (kernel_data.cam.type == CAMERA_PERSPECTIVE) {
@@ -420,12 +399,8 @@ ccl_device_inline void camera_sample(KernelGlobals kg,
     camera_sample_orthographic(kg, raster_x, raster_y, lens_u, lens_v, ray);
   }
   else {
-#ifdef __CAMERA_MOTION__
-    ccl_global const DecomposedTransform *cam_motion = kernel_tex_array(__camera_motion);
+    ccl_global const DecomposedTransform *cam_motion = kernel_data_array(camera_motion);
     camera_sample_panorama(&kernel_data.cam, cam_motion, raster_x, raster_y, lens_u, lens_v, ray);
-#else
-    camera_sample_panorama(&kernel_data.cam, raster_x, raster_y, lens_u, lens_v, ray);
-#endif
   }
 }
 
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index a6975a63d5d..1cf06614f3b 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -8,7 +8,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device ccl_private ShaderClosure *closure_alloc(ccl_private ShaderData *sd,
                                                     int size,
                                                     ClosureType type,
-                                                    float3 weight)
+                                                    Spectrum weight)
 {
   kernel_assert(size <= sizeof(ShaderClosure));
 
@@ -49,9 +49,9 @@ ccl_device ccl_private void *closure_alloc_extra(ccl_private ShaderData *sd, int
 
 ccl_device_inline ccl_private ShaderClosure *bsdf_alloc(ccl_private ShaderData *sd,
                                                         int size,
-                                                        float3 weight)
+                                                        Spectrum weight)
 {
-  kernel_assert(isfinite3_safe(weight));
+  kernel_assert(isfinite_safe(weight));
 
   const float sample_weight = fabsf(average(weight));
 
@@ -59,39 +59,10 @@ ccl_device_inline ccl_private ShaderClosure *bsdf_alloc(ccl_private ShaderData *
    * we will not allocate new closure. */
   if (sample_weight >= CLOSURE_WEIGHT_CUTOFF) {
     ccl_private ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
-    if (sc == NULL) {
-      return NULL;
-    }
-
-    sc->sample_weight = sample_weight;
-
-    return sc;
-  }
-
-  return NULL;
-}
-
-#ifdef __OSL__
-ccl_device_inline ShaderClosure *bsdf_alloc_osl(ShaderData *sd,
-                                                int size,
-                                                float3 weight,
-                                                void *data)
-{
-  kernel_assert(isfinite3_safe(weight));
-
-  const float sample_weight = fabsf(average(weight));
-
-  /* Use comparison this way to help dealing with non-finite weight: if the average is not finite
-   * we will not allocate new closure. */
-  if (sample_weight >= CLOSURE_WEIGHT_CUTOFF) {
-    ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
     if (!sc) {
       return NULL;
     }
 
-    memcpy((void *)sc, data, size);
-
-    sc->weight = weight;
     sc->sample_weight = sample_weight;
 
     return sc;
@@ -99,6 +70,5 @@ ccl_device_inline ShaderClosure *bsdf_alloc_osl(ShaderData *sd,
 
   return NULL;
 }
-#endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 011155cdf5f..f0b28ff77c4 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -103,9 +103,8 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
                                   ccl_private const ShaderClosure *sc,
                                   float randu,
                                   float randv,
-                                  ccl_private float3 *eval,
+                                  ccl_private Spectrum *eval,
                                   ccl_private float3 *omega_in,
-                                  ccl_private differential3 *domega_in,
                                   ccl_private float *pdf)
 {
   /* For curves use the smooth normal, particularly for ribbons the geometric
@@ -115,306 +114,80 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
 
   switch (sc->type) {
     case CLOSURE_BSDF_DIFFUSE_ID:
-      label = bsdf_diffuse_sample(sc,
-                                  Ng,
-                                  sd->I,
-                                  sd->dI.dx,
-                                  sd->dI.dy,
-                                  randu,
-                                  randv,
-                                  eval,
-                                  omega_in,
-                                  &domega_in->dx,
-                                  &domega_in->dy,
-                                  pdf);
+      label = bsdf_diffuse_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
-#ifdef __SVM__
+#if defined(__SVM__) || defined(__OSL__)
     case CLOSURE_BSDF_OREN_NAYAR_ID:
-      label = bsdf_oren_nayar_sample(sc,
-                                     Ng,
-                                     sd->I,
-                                     sd->dI.dx,
-                                     sd->dI.dy,
-                                     randu,
-                                     randv,
-                                     eval,
-                                     omega_in,
-                                     &domega_in->dx,
-                                     &domega_in->dy,
-                                     pdf);
+      label = bsdf_oren_nayar_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
 #  ifdef __OSL__
     case CLOSURE_BSDF_PHONG_RAMP_ID:
-      label = bsdf_phong_ramp_sample(sc,
-                                     Ng,
-                                     sd->I,
-                                     sd->dI.dx,
-                                     sd->dI.dy,
-                                     randu,
-                                     randv,
-                                     eval,
-                                     omega_in,
-                                     &domega_in->dx,
-                                     &domega_in->dy,
-                                     pdf);
+      label = bsdf_phong_ramp_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-      label = bsdf_diffuse_ramp_sample(sc,
-                                       Ng,
-                                       sd->I,
-                                       sd->dI.dx,
-                                       sd->dI.dy,
-                                       randu,
-                                       randv,
-                                       eval,
-                                       omega_in,
-                                       &domega_in->dx,
-                                       &domega_in->dy,
-                                       pdf);
+      label = bsdf_diffuse_ramp_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
 #  endif
     case CLOSURE_BSDF_TRANSLUCENT_ID:
-      label = bsdf_translucent_sample(sc,
-                                      Ng,
-                                      sd->I,
-                                      sd->dI.dx,
-                                      sd->dI.dy,
-                                      randu,
-                                      randv,
-                                      eval,
-                                      omega_in,
-                                      &domega_in->dx,
-                                      &domega_in->dy,
-                                      pdf);
+      label = bsdf_translucent_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_REFLECTION_ID:
-      label = bsdf_reflection_sample(sc,
-                                     Ng,
-                                     sd->I,
-                                     sd->dI.dx,
-                                     sd->dI.dy,
-                                     randu,
-                                     randv,
-                                     eval,
-                                     omega_in,
-                                     &domega_in->dx,
-                                     &domega_in->dy,
-                                     pdf);
+      label = bsdf_reflection_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_REFRACTION_ID:
-      label = bsdf_refraction_sample(sc,
-                                     Ng,
-                                     sd->I,
-                                     sd->dI.dx,
-                                     sd->dI.dy,
-                                     randu,
-                                     randv,
-                                     eval,
-                                     omega_in,
-                                     &domega_in->dx,
-                                     &domega_in->dy,
-                                     pdf);
+      label = bsdf_refraction_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_TRANSPARENT_ID:
-      label = bsdf_transparent_sample(sc,
-                                      Ng,
-                                      sd->I,
-                                      sd->dI.dx,
-                                      sd->dI.dy,
-                                      randu,
-                                      randv,
-                                      eval,
-                                      omega_in,
-                                      &domega_in->dx,
-                                      &domega_in->dy,
-                                      pdf);
+      label = bsdf_transparent_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_MICROFACET_GGX_ID:
     case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
     case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
     case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-      label = bsdf_microfacet_ggx_sample(kg,
-                                         sc,
-                                         Ng,
-                                         sd->I,
-                                         sd->dI.dx,
-                                         sd->dI.dy,
-                                         randu,
-                                         randv,
-                                         eval,
-                                         omega_in,
-                                         &domega_in->dx,
-                                         &domega_in->dy,
-                                         pdf);
+      label = bsdf_microfacet_ggx_sample(kg, sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-      label = bsdf_microfacet_multi_ggx_sample(kg,
-                                               sc,
-                                               Ng,
-                                               sd->I,
-                                               sd->dI.dx,
-                                               sd->dI.dy,
-                                               randu,
-                                               randv,
-                                               eval,
-                                               omega_in,
-                                               &domega_in->dx,
-                                               &domega_in->dy,
-                                               pdf,
-                                               &sd->lcg_state);
+      label = bsdf_microfacet_multi_ggx_sample(
+          kg, sc, Ng, sd->I, randu, randv, eval, omega_in, pdf, &sd->lcg_state);
       break;
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
-      label = bsdf_microfacet_multi_ggx_glass_sample(kg,
-                                                     sc,
-                                                     Ng,
-                                                     sd->I,
-                                                     sd->dI.dx,
-                                                     sd->dI.dy,
-                                                     randu,
-                                                     randv,
-                                                     eval,
-                                                     omega_in,
-                                                     &domega_in->dx,
-                                                     &domega_in->dy,
-                                                     pdf,
-                                                     &sd->lcg_state);
+      label = bsdf_microfacet_multi_ggx_glass_sample(
+          kg, sc, Ng, sd->I, randu, randv, eval, omega_in, pdf, &sd->lcg_state);
       break;
     case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
     case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-      label = bsdf_microfacet_beckmann_sample(kg,
-                                              sc,
-                                              Ng,
-                                              sd->I,
-                                              sd->dI.dx,
-                                              sd->dI.dy,
-                                              randu,
-                                              randv,
-                                              eval,
-                                              omega_in,
-                                              &domega_in->dx,
-                                              &domega_in->dy,
-                                              pdf);
+      label = bsdf_microfacet_beckmann_sample(
+          kg, sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
-      label = bsdf_ashikhmin_shirley_sample(sc,
-                                            Ng,
-                                            sd->I,
-                                            sd->dI.dx,
-                                            sd->dI.dy,
-                                            randu,
-                                            randv,
-                                            eval,
-                                            omega_in,
-                                            &domega_in->dx,
-                                            &domega_in->dy,
-                                            pdf);
+      label = bsdf_ashikhmin_shirley_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-      label = bsdf_ashikhmin_velvet_sample(sc,
-                                           Ng,
-                                           sd->I,
-                                           sd->dI.dx,
-                                           sd->dI.dy,
-                                           randu,
-                                           randv,
-                                           eval,
-                                           omega_in,
-                                           &domega_in->dx,
-                                           &domega_in->dy,
-                                           pdf);
+      label = bsdf_ashikhmin_velvet_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-      label = bsdf_diffuse_toon_sample(sc,
-                                       Ng,
-                                       sd->I,
-                                       sd->dI.dx,
-                                       sd->dI.dy,
-                                       randu,
-                                       randv,
-                                       eval,
-                                       omega_in,
-                                       &domega_in->dx,
-                                       &domega_in->dy,
-                                       pdf);
+      label = bsdf_diffuse_toon_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_GLOSSY_TOON_ID:
-      label = bsdf_glossy_toon_sample(sc,
-                                      Ng,
-                                      sd->I,
-                                      sd->dI.dx,
-                                      sd->dI.dy,
-                                      randu,
-                                      randv,
-                                      eval,
-                                      omega_in,
-                                      &domega_in->dx,
-                                      &domega_in->dy,
-                                      pdf);
+      label = bsdf_glossy_toon_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-      label = bsdf_hair_reflection_sample(sc,
-                                          Ng,
-                                          sd->I,
-                                          sd->dI.dx,
-                                          sd->dI.dy,
-                                          randu,
-                                          randv,
-                                          eval,
-                                          omega_in,
-                                          &domega_in->dx,
-                                          &domega_in->dy,
-                                          pdf);
+      label = bsdf_hair_reflection_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-      label = bsdf_hair_transmission_sample(sc,
-                                            Ng,
-                                            sd->I,
-                                            sd->dI.dx,
-                                            sd->dI.dy,
-                                            randu,
-                                            randv,
-                                            eval,
-                                            omega_in,
-                                            &domega_in->dx,
-                                            &domega_in->dy,
-                                            pdf);
+      label = bsdf_hair_transmission_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
-      label = bsdf_principled_hair_sample(
-          kg, sc, sd, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+      label = bsdf_principled_hair_sample(kg, sc, sd, randu, randv, eval, omega_in, pdf);
       break;
-#  ifdef __PRINCIPLED__
     case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
-      label = bsdf_principled_diffuse_sample(sc,
-                                             Ng,
-                                             sd->I,
-                                             sd->dI.dx,
-                                             sd->dI.dy,
-                                             randu,
-                                             randv,
-                                             eval,
-                                             omega_in,
-                                             &domega_in->dx,
-                                             &domega_in->dy,
-                                             pdf);
+      label = bsdf_principled_diffuse_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
     case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
-      label = bsdf_principled_sheen_sample(sc,
-                                           Ng,
-                                           sd->I,
-                                           sd->dI.dx,
-                                           sd->dI.dy,
-                                           randu,
-                                           randv,
-                                           eval,
-                                           omega_in,
-                                           &domega_in->dx,
-                                           &domega_in->dy,
-                                           pdf);
+      label = bsdf_principled_sheen_sample(sc, Ng, sd->I, randu, randv, eval, omega_in, pdf);
       break;
-#  endif /* __PRINCIPLED__ */
 #endif
     default:
       label = LABEL_NONE;
@@ -434,12 +207,12 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
   else {
     /* Shadow terminator offset. */
     const float frequency_multiplier =
-        kernel_tex_fetch(__objects, sd->object).shadow_terminator_shading_offset;
+        kernel_data_fetch(objects, sd->object).shadow_terminator_shading_offset;
     if (frequency_multiplier > 1.0f) {
       *eval *= shift_cos_in(dot(*omega_in, sc->N), frequency_multiplier);
     }
     if (label & LABEL_DIFFUSE) {
-      if (!isequal_float3(sc->N, sd->N)) {
+      if (!isequal(sc->N, sd->N)) {
         *eval *= bump_shadowing_term((label & LABEL_TRANSMIT) ? -sd->N : sd->N, sc->N, *omega_in);
       }
     }
@@ -458,7 +231,7 @@ ccl_device
 #else
 ccl_device_inline
 #endif
-    float3
+    Spectrum
     bsdf_eval(KernelGlobals kg,
               ccl_private ShaderData *sd,
               ccl_private const ShaderClosure *sc,
@@ -466,14 +239,14 @@ ccl_device_inline
               const bool is_transmission,
               ccl_private float *pdf)
 {
-  float3 eval = zero_float3();
+  Spectrum eval = zero_spectrum();
 
   if (!is_transmission) {
     switch (sc->type) {
       case CLOSURE_BSDF_DIFFUSE_ID:
         eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
         break;
-#ifdef __SVM__
+#if defined(__SVM__) || defined(__OSL__)
       case CLOSURE_BSDF_OREN_NAYAR_ID:
         eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
         break;
@@ -537,26 +310,24 @@ ccl_device_inline
       case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
         eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
         break;
-#  ifdef __PRINCIPLED__
       case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
         eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
         break;
       case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
         eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
         break;
-#  endif /* __PRINCIPLED__ */
 #endif
       default:
         break;
     }
     if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
-      if (!isequal_float3(sc->N, sd->N)) {
+      if (!isequal(sc->N, sd->N)) {
         eval *= bump_shadowing_term(sd->N, sc->N, omega_in);
       }
     }
     /* Shadow terminator offset. */
     const float frequency_multiplier =
-        kernel_tex_fetch(__objects, sd->object).shadow_terminator_shading_offset;
+        kernel_data_fetch(objects, sd->object).shadow_terminator_shading_offset;
     if (frequency_multiplier > 1.0f) {
       eval *= shift_cos_in(dot(omega_in, sc->N), frequency_multiplier);
     }
@@ -566,7 +337,7 @@ ccl_device_inline
       case CLOSURE_BSDF_DIFFUSE_ID:
         eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
         break;
-#ifdef __SVM__
+#if defined(__SVM__) || defined(__OSL__)
       case CLOSURE_BSDF_OREN_NAYAR_ID:
         eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
         break;
@@ -622,20 +393,18 @@ ccl_device_inline
       case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
         eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
         break;
-#  ifdef __PRINCIPLED__
       case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
         eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
         break;
       case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
         eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
         break;
-#  endif /* __PRINCIPLED__ */
 #endif
       default:
         break;
     }
     if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
-      if (!isequal_float3(sc->N, sd->N)) {
+      if (!isequal(sc->N, sd->N)) {
         eval *= bump_shadowing_term(-sd->N, sc->N, omega_in);
       }
     }
@@ -650,7 +419,7 @@ ccl_device_inline
 ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float roughness)
 {
   /* TODO: do we want to blur volume closures? */
-#ifdef __SVM__
+#if defined(__SVM__) || defined(__OSL__)
   switch (sc->type) {
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
     case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 47066542122..75995262030 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -39,7 +39,7 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
   return 2.0f / (roughness * roughness) - 2.0f;
 }
 
-ccl_device_forceinline float3
+ccl_device_forceinline Spectrum
 bsdf_ashikhmin_shirley_eval_reflect(ccl_private const ShaderClosure *sc,
                                     const float3 I,
                                     const float3 omega_in,
@@ -55,7 +55,7 @@ bsdf_ashikhmin_shirley_eval_reflect(ccl_private const ShaderClosure *sc,
 
   if (fmaxf(bsdf->alpha_x, bsdf->alpha_y) <= 1e-4f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
   if (NdotI > 0.0f && NdotO > 0.0f) {
     NdotI = fmaxf(NdotI, 1e-6f);
@@ -105,16 +105,16 @@ bsdf_ashikhmin_shirley_eval_reflect(ccl_private const ShaderClosure *sc,
     }
   }
 
-  return make_float3(out, out, out);
+  return make_spectrum(out);
 }
 
-ccl_device float3 bsdf_ashikhmin_shirley_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                       const float3 I,
-                                                       const float3 omega_in,
-                                                       ccl_private float *pdf)
+ccl_device Spectrum bsdf_ashikhmin_shirley_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                         const float3 I,
+                                                         const float3 omega_in,
+                                                         ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device_inline void bsdf_ashikhmin_shirley_sample_first_quadrant(float n_x,
@@ -133,14 +133,10 @@ ccl_device_inline void bsdf_ashikhmin_shirley_sample_first_quadrant(float n_x,
 ccl_device int bsdf_ashikhmin_shirley_sample(ccl_private const ShaderClosure *sc,
                                              float3 Ng,
                                              float3 I,
-                                             float3 dIdx,
-                                             float3 dIdy,
                                              float randu,
                                              float randv,
-                                             ccl_private float3 *eval,
+                                             ccl_private Spectrum *eval,
                                              ccl_private float3 *omega_in,
-                                             ccl_private float3 *domega_in_dx,
-                                             ccl_private float3 *domega_in_dy,
                                              ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
@@ -214,19 +210,13 @@ ccl_device int bsdf_ashikhmin_shirley_sample(ccl_private const ShaderClosure *sc
     if (fmaxf(bsdf->alpha_x, bsdf->alpha_y) <= 1e-4f) {
       /* Some high number for MIS. */
       *pdf = 1e6f;
-      *eval = make_float3(1e6f, 1e6f, 1e6f);
+      *eval = make_spectrum(1e6f);
       label = LABEL_REFLECT | LABEL_SINGULAR;
     }
     else {
       /* leave the rest to eval_reflect */
       *eval = bsdf_ashikhmin_shirley_eval_reflect(sc, I, *omega_in, pdf);
     }
-
-#ifdef __RAY_DIFFERENTIALS__
-    /* just do the reflection thing for now */
-    *domega_in_dx = (2.0f * dot(N, dIdx)) * N - dIdx;
-    *domega_in_dy = (2.0f * dot(N, dIdy)) * N - dIdy;
-#endif
   }
 
   return label;
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 3d7906eef7d..9e68ea5d5e5 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -31,10 +31,10 @@ ccl_device int bsdf_ashikhmin_velvet_setup(ccl_private VelvetBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                     const float3 I,
-                                                     const float3 omega_in,
-                                                     ccl_private float *pdf)
+ccl_device Spectrum bsdf_ashikhmin_velvet_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                       const float3 I,
+                                                       const float3 omega_in,
+                                                       ccl_private float *pdf)
 {
   ccl_private const VelvetBsdf *bsdf = (ccl_private const VelvetBsdf *)sc;
   float m_invsigma2 = bsdf->invsigma2;
@@ -50,7 +50,7 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(ccl_private const ShaderClo
 
     if (!(fabsf(cosNH) < 1.0f - 1e-5f && cosHO > 1e-5f)) {
       *pdf = 0.0f;
-      return make_float3(0.0f, 0.0f, 0.0f);
+      return zero_spectrum();
     }
     float cosNHdivHO = cosNH / cosHO;
     cosNHdivHO = fmaxf(cosNHdivHO, 1e-5f);
@@ -68,33 +68,29 @@ ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(ccl_private const ShaderClo
     float out = 0.25f * (D * G) / cosNO;
 
     *pdf = 0.5f * M_1_PI_F;
-    return make_float3(out, out, out);
+    return make_spectrum(out);
   }
 
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_ashikhmin_velvet_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                      const float3 I,
-                                                      const float3 omega_in,
-                                                      ccl_private float *pdf)
+ccl_device Spectrum bsdf_ashikhmin_velvet_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                        const float3 I,
+                                                        const float3 omega_in,
+                                                        ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_ashikhmin_velvet_sample(ccl_private const ShaderClosure *sc,
                                             float3 Ng,
                                             float3 I,
-                                            float3 dIdx,
-                                            float3 dIdy,
                                             float randu,
                                             float randv,
-                                            ccl_private float3 *eval,
+                                            ccl_private Spectrum *eval,
                                             ccl_private float3 *omega_in,
-                                            ccl_private float3 *domega_in_dx,
-                                            ccl_private float3 *domega_in_dy,
                                             ccl_private float *pdf)
 {
   ccl_private const VelvetBsdf *bsdf = (ccl_private const VelvetBsdf *)sc;
@@ -129,22 +125,16 @@ ccl_device int bsdf_ashikhmin_velvet_sample(ccl_private const ShaderClosure *sc,
 
       float power = 0.25f * (D * G) / cosNO;
 
-      *eval = make_float3(power, power, power);
-
-#ifdef __RAY_DIFFERENTIALS__
-      // TODO: find a better approximation for the retroreflective bounce
-      *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
-      *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
+      *eval = make_spectrum(power);
     }
     else {
       *pdf = 0.0f;
-      *eval = make_float3(0.0f, 0.0f, 0.0f);
+      *eval = zero_spectrum();
     }
   }
   else {
     *pdf = 0.0f;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
   return LABEL_REFLECT | LABEL_DIFFUSE;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index 759ad03f8e8..ec64c375666 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -26,39 +26,35 @@ ccl_device int bsdf_diffuse_setup(ccl_private DiffuseBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_diffuse_eval_reflect(ccl_private const ShaderClosure *sc,
-                                            const float3 I,
-                                            const float3 omega_in,
-                                            ccl_private float *pdf)
+ccl_device Spectrum bsdf_diffuse_eval_reflect(ccl_private const ShaderClosure *sc,
+                                              const float3 I,
+                                              const float3 omega_in,
+                                              ccl_private float *pdf)
 {
   ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
   float3 N = bsdf->N;
 
   float cos_pi = fmaxf(dot(N, omega_in), 0.0f) * M_1_PI_F;
   *pdf = cos_pi;
-  return make_float3(cos_pi, cos_pi, cos_pi);
+  return make_spectrum(cos_pi);
 }
 
-ccl_device float3 bsdf_diffuse_eval_transmit(ccl_private const ShaderClosure *sc,
-                                             const float3 I,
-                                             const float3 omega_in,
-                                             ccl_private float *pdf)
+ccl_device Spectrum bsdf_diffuse_eval_transmit(ccl_private const ShaderClosure *sc,
+                                               const float3 I,
+                                               const float3 omega_in,
+                                               ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_diffuse_sample(ccl_private const ShaderClosure *sc,
                                    float3 Ng,
                                    float3 I,
-                                   float3 dIdx,
-                                   float3 dIdy,
                                    float randu,
                                    float randv,
-                                   ccl_private float3 *eval,
+                                   ccl_private Spectrum *eval,
                                    ccl_private float3 *omega_in,
-                                   ccl_private float3 *domega_in_dx,
-                                   ccl_private float3 *domega_in_dy,
                                    ccl_private float *pdf)
 {
   ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
@@ -68,16 +64,11 @@ ccl_device int bsdf_diffuse_sample(ccl_private const ShaderClosure *sc,
   sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
 
   if (dot(Ng, *omega_in) > 0.0f) {
-    *eval = make_float3(*pdf, *pdf, *pdf);
-#ifdef __RAY_DIFFERENTIALS__
-    // TODO: find a better approximation for the diffuse bounce
-    *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
-    *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#endif
+    *eval = make_spectrum(*pdf);
   }
   else {
     *pdf = 0.0f;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
   return LABEL_REFLECT | LABEL_DIFFUSE;
 }
@@ -90,39 +81,35 @@ ccl_device int bsdf_translucent_setup(ccl_private DiffuseBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_translucent_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                const float3 I,
-                                                const float3 omega_in,
-                                                ccl_private float *pdf)
+ccl_device Spectrum bsdf_translucent_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                  const float3 I,
+                                                  const float3 omega_in,
+                                                  ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_translucent_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                 const float3 I,
-                                                 const float3 omega_in,
-                                                 ccl_private float *pdf)
+ccl_device Spectrum bsdf_translucent_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                   const float3 I,
+                                                   const float3 omega_in,
+                                                   ccl_private float *pdf)
 {
   ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
   float3 N = bsdf->N;
 
   float cos_pi = fmaxf(-dot(N, omega_in), 0.0f) * M_1_PI_F;
   *pdf = cos_pi;
-  return make_float3(cos_pi, cos_pi, cos_pi);
+  return make_spectrum(cos_pi);
 }
 
 ccl_device int bsdf_translucent_sample(ccl_private const ShaderClosure *sc,
                                        float3 Ng,
                                        float3 I,
-                                       float3 dIdx,
-                                       float3 dIdy,
                                        float randu,
                                        float randv,
-                                       ccl_private float3 *eval,
+                                       ccl_private Spectrum *eval,
                                        ccl_private float3 *omega_in,
-                                       ccl_private float3 *domega_in_dx,
-                                       ccl_private float3 *domega_in_dy,
                                        ccl_private float *pdf)
 {
   ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
@@ -132,16 +119,11 @@ ccl_device int bsdf_translucent_sample(ccl_private const ShaderClosure *sc,
   // distribution over the hemisphere
   sample_cos_hemisphere(-N, randu, randv, omega_in, pdf);
   if (dot(Ng, *omega_in) < 0) {
-    *eval = make_float3(*pdf, *pdf, *pdf);
-#ifdef __RAY_DIFFERENTIALS__
-    // TODO: find a better approximation for the diffuse bounce
-    *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
-    *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
-#endif
+    *eval = make_spectrum(*pdf);
   }
   else {
     *pdf = 0;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
   return LABEL_TRANSMIT | LABEL_DIFFUSE;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index aa4c091f587..d7faf5c9e9a 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "kernel/sample/mapping.h"
+#include "kernel/util/color.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,38 +47,34 @@ ccl_device void bsdf_diffuse_ramp_blur(ccl_private ShaderClosure *sc, float roug
 {
 }
 
-ccl_device float3 bsdf_diffuse_ramp_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                 const float3 I,
-                                                 const float3 omega_in,
-                                                 ccl_private float *pdf)
+ccl_device Spectrum bsdf_diffuse_ramp_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                   const float3 I,
+                                                   const float3 omega_in,
+                                                   ccl_private float *pdf)
 {
   const DiffuseRampBsdf *bsdf = (const DiffuseRampBsdf *)sc;
   float3 N = bsdf->N;
 
   float cos_pi = fmaxf(dot(N, omega_in), 0.0f);
   *pdf = cos_pi * M_1_PI_F;
-  return bsdf_diffuse_ramp_get_color(bsdf->colors, cos_pi) * M_1_PI_F;
+  return rgb_to_spectrum(bsdf_diffuse_ramp_get_color(bsdf->colors, cos_pi) * M_1_PI_F);
 }
 
-ccl_device float3 bsdf_diffuse_ramp_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                  const float3 I,
-                                                  const float3 omega_in,
-                                                  ccl_private float *pdf)
+ccl_device Spectrum bsdf_diffuse_ramp_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                    const float3 I,
+                                                    const float3 omega_in,
+                                                    ccl_private float *pdf)
 {
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_diffuse_ramp_sample(ccl_private const ShaderClosure *sc,
                                         float3 Ng,
                                         float3 I,
-                                        float3 dIdx,
-                                        float3 dIdy,
                                         float randu,
                                         float randv,
-                                        ccl_private float3 *eval,
+                                        ccl_private Spectrum *eval,
                                         ccl_private float3 *omega_in,
-                                        ccl_private float3 *domega_in_dx,
-                                        ccl_private float3 *domega_in_dy,
                                         ccl_private float *pdf)
 {
   const DiffuseRampBsdf *bsdf = (const DiffuseRampBsdf *)sc;
@@ -87,15 +84,11 @@ ccl_device int bsdf_diffuse_ramp_sample(ccl_private const ShaderClosure *sc,
   sample_cos_hemisphere(N, randu, randv, omega_in, pdf);
 
   if (dot(Ng, *omega_in) > 0.0f) {
-    *eval = bsdf_diffuse_ramp_get_color(bsdf->colors, *pdf * M_PI_F) * M_1_PI_F;
-#  ifdef __RAY_DIFFERENTIALS__
-    *domega_in_dx = (2 * dot(N, dIdx)) * N - dIdx;
-    *domega_in_dy = (2 * dot(N, dIdy)) * N - dIdy;
-#  endif
+    *eval = rgb_to_spectrum(bsdf_diffuse_ramp_get_color(bsdf->colors, *pdf * M_PI_F) * M_1_PI_F);
   }
   else {
     *pdf = 0.0f;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
   return LABEL_REFLECT | LABEL_DIFFUSE;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index a136ed05800..a29f7c444ae 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -37,10 +37,10 @@ ccl_device int bsdf_hair_transmission_setup(ccl_private HairBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_hair_reflection_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                    const float3 I,
-                                                    const float3 omega_in,
-                                                    ccl_private float *pdf)
+ccl_device Spectrum bsdf_hair_reflection_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                      const float3 I,
+                                                      const float3 omega_in,
+                                                      ccl_private float *pdf)
 {
   ccl_private const HairBsdf *bsdf = (ccl_private const HairBsdf *)sc;
   float offset = bsdf->offset;
@@ -61,7 +61,7 @@ ccl_device float3 bsdf_hair_reflection_eval_reflect(ccl_private const ShaderClos
 
   if (M_PI_2_F - fabsf(theta_i) < 0.001f || cosphi_i < 0.0f) {
     *pdf = 0.0f;
-    return make_float3(*pdf, *pdf, *pdf);
+    return zero_spectrum();
   }
 
   float roughness1_inv = 1.0f / roughness1;
@@ -81,31 +81,31 @@ ccl_device float3 bsdf_hair_reflection_eval_reflect(ccl_private const ShaderClos
                     (2 * (t * t + roughness1 * roughness1) * (a_R - b_R) * costheta_i);
   *pdf = phi_pdf * theta_pdf;
 
-  return make_float3(*pdf, *pdf, *pdf);
+  return make_spectrum(*pdf);
 }
 
-ccl_device float3 bsdf_hair_transmission_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                      const float3 I,
-                                                      const float3 omega_in,
-                                                      ccl_private float *pdf)
+ccl_device Spectrum bsdf_hair_transmission_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                        const float3 I,
+                                                        const float3 omega_in,
+                                                        ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_hair_reflection_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                     const float3 I,
-                                                     const float3 omega_in,
-                                                     ccl_private float *pdf)
+ccl_device Spectrum bsdf_hair_reflection_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                       const float3 I,
+                                                       const float3 omega_in,
+                                                       ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_hair_transmission_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                       const float3 I,
-                                                       const float3 omega_in,
-                                                       ccl_private float *pdf)
+ccl_device Spectrum bsdf_hair_transmission_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                         const float3 I,
+                                                         const float3 omega_in,
+                                                         ccl_private float *pdf)
 {
   ccl_private const HairBsdf *bsdf = (ccl_private const HairBsdf *)sc;
   float offset = bsdf->offset;
@@ -125,7 +125,7 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(ccl_private const ShaderC
 
   if (M_PI_2_F - fabsf(theta_i) < 0.001f) {
     *pdf = 0.0f;
-    return make_float3(*pdf, *pdf, *pdf);
+    return zero_spectrum();
   }
 
   float costheta_i = fast_cosf(theta_i);
@@ -145,20 +145,16 @@ ccl_device float3 bsdf_hair_transmission_eval_transmit(ccl_private const ShaderC
   float phi_pdf = roughness2 / (c_TT * (p * p + roughness2 * roughness2));
 
   *pdf = phi_pdf * theta_pdf;
-  return make_float3(*pdf, *pdf, *pdf);
+  return make_spectrum(*pdf);
 }
 
 ccl_device int bsdf_hair_reflection_sample(ccl_private const ShaderClosure *sc,
                                            float3 Ng,
                                            float3 I,
-                                           float3 dIdx,
-                                           float3 dIdy,
                                            float randu,
                                            float randv,
-                                           ccl_private float3 *eval,
+                                           ccl_private Spectrum *eval,
                                            ccl_private float3 *omega_in,
-                                           ccl_private float3 *domega_in_dx,
-                                           ccl_private float3 *domega_in_dy,
                                            ccl_private float *pdf)
 {
   ccl_private const HairBsdf *bsdf = (ccl_private const HairBsdf *)sc;
@@ -194,17 +190,11 @@ ccl_device int bsdf_hair_reflection_sample(ccl_private const ShaderClosure *sc,
   fast_sincosf(phi, &sinphi, &cosphi);
   *omega_in = (cosphi * costheta_i) * locy - (sinphi * costheta_i) * locx + (sintheta_i)*Tg;
 
-  // differentials - TODO: find a better approximation for the reflective bounce
-#ifdef __RAY_DIFFERENTIALS__
-  *domega_in_dx = 2 * dot(locy, dIdx) * locy - dIdx;
-  *domega_in_dy = 2 * dot(locy, dIdy) * locy - dIdy;
-#endif
-
   *pdf = fabsf(phi_pdf * theta_pdf);
   if (M_PI_2_F - fabsf(theta_i) < 0.001f)
     *pdf = 0.0f;
 
-  *eval = make_float3(*pdf, *pdf, *pdf);
+  *eval = make_spectrum(*pdf);
 
   return LABEL_REFLECT | LABEL_GLOSSY;
 }
@@ -212,14 +202,10 @@ ccl_device int bsdf_hair_reflection_sample(ccl_private const ShaderClosure *sc,
 ccl_device int bsdf_hair_transmission_sample(ccl_private const ShaderClosure *sc,
                                              float3 Ng,
                                              float3 I,
-                                             float3 dIdx,
-                                             float3 dIdy,
                                              float randu,
                                              float randv,
-                                             ccl_private float3 *eval,
+                                             ccl_private Spectrum *eval,
                                              ccl_private float3 *omega_in,
-                                             ccl_private float3 *domega_in_dx,
-                                             ccl_private float3 *domega_in_dy,
                                              ccl_private float *pdf)
 {
   ccl_private const HairBsdf *bsdf = (ccl_private const HairBsdf *)sc;
@@ -255,18 +241,12 @@ ccl_device int bsdf_hair_transmission_sample(ccl_private const ShaderClosure *sc
   fast_sincosf(phi, &sinphi, &cosphi);
   *omega_in = (cosphi * costheta_i) * locy - (sinphi * costheta_i) * locx + (sintheta_i)*Tg;
 
-  // differentials - TODO: find a better approximation for the transmission bounce
-#ifdef __RAY_DIFFERENTIALS__
-  *domega_in_dx = 2 * dot(locy, dIdx) * locy - dIdx;
-  *domega_in_dy = 2 * dot(locy, dIdy) * locy - dIdy;
-#endif
-
   *pdf = fabsf(phi_pdf * theta_pdf);
   if (M_PI_2_F - fabsf(theta_i) < 0.001f) {
     *pdf = 0.0f;
   }
 
-  *eval = make_float3(*pdf, *pdf, *pdf);
+  *eval = make_spectrum(*pdf);
 
   /* TODO(sergey): Should always be negative, but seems some precision issue
    * is involved here.
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index 33706213403..2236bc62050 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  include <fenv.h>
 #endif
 
@@ -20,7 +20,7 @@ typedef struct PrincipledHairBSDF {
   SHADER_CLOSURE_BASE;
 
   /* Absorption coefficient. */
-  float3 sigma;
+  Spectrum sigma;
   /* Variance of the underlying logistic distribution. */
   float v;
   /* Scale factor of the underlying logistic distribution. */
@@ -166,12 +166,6 @@ ccl_device_inline float longitudinal_scattering(
   }
 }
 
-/* Combine the three values using their luminances. */
-ccl_device_inline float4 combine_with_energy(KernelGlobals kg, float3 c)
-{
-  return make_float4(c.x, c.y, c.z, linear_rgb_to_gray(kg, c));
-}
-
 #ifdef __HAIR__
 /* Set up the hair closure. */
 ccl_device int bsdf_principled_hair_setup(ccl_private ShaderData *sd,
@@ -203,7 +197,7 @@ ccl_device int bsdf_principled_hair_setup(ccl_private ShaderData *sd,
   float h = (sd->type & PRIMITIVE_CURVE_RIBBON) ? -sd->v : dot(cross(sd->Ng, X), Z);
 
   kernel_assert(fabsf(h) < 1.0f + 1e-4f);
-  kernel_assert(isfinite3_safe(Y));
+  kernel_assert(isfinite_safe(Y));
   kernel_assert(isfinite_safe(h));
 
   bsdf->extra->geom = make_float4(Y.x, Y.y, Y.z, h);
@@ -214,34 +208,36 @@ ccl_device int bsdf_principled_hair_setup(ccl_private ShaderData *sd,
 #endif /* __HAIR__ */
 
 /* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */
-ccl_device_inline void hair_attenuation(KernelGlobals kg,
-                                        float f,
-                                        float3 T,
-                                        ccl_private float4 *Ap)
+ccl_device_inline void hair_attenuation(
+    KernelGlobals kg, float f, Spectrum T, ccl_private Spectrum *Ap, ccl_private float *Ap_energy)
 {
   /* Primary specular (R). */
-  Ap[0] = make_float4(f, f, f, f);
+  Ap[0] = make_spectrum(f); /* Uniform spectrum built from the scalar f. */
+  Ap_energy[0] = f; /* Unnormalized sampling weight for R; Ap and Ap_energy each get 4 entries. */
 
   /* Transmission (TT). */
-  float3 col = sqr(1.0f - f) * T;
-  Ap[1] = combine_with_energy(kg, col);
+  Spectrum col = sqr(1.0f - f) * T;
+  Ap[1] = col;
+  Ap_energy[1] = spectrum_to_gray(kg, col); /* Scalar weight for TT, normalized below. */
 
   /* Secondary specular (TRT). */
   col *= T * f;
-  Ap[2] = combine_with_energy(kg, col);
+  Ap[2] = col;
+  Ap_energy[2] = spectrum_to_gray(kg, col); /* Scalar weight for TRT, normalized below. */
 
   /* Residual component (TRRT+). */
-  col *= safe_divide_color(T * f, make_float3(1.0f, 1.0f, 1.0f) - T * f);
-  Ap[3] = combine_with_energy(kg, col);
+  col *= safe_divide(T * f, one_spectrum() - T * f); /* x / (1 - x) with x = T * f, i.e. x + x^2 + ... when |x| < 1. */
+  Ap[3] = col;
+  Ap_energy[3] = spectrum_to_gray(kg, col); /* Scalar weight for TRRT+, normalized below. */
 
   /* Normalize sampling weights. */
-  float totweight = Ap[0].w + Ap[1].w + Ap[2].w + Ap[3].w;
+  float totweight = Ap_energy[0] + Ap_energy[1] + Ap_energy[2] + Ap_energy[3];
   float fac = safe_divide(1.0f, totweight);
 
-  Ap[0].w *= fac;
-  Ap[1].w *= fac;
-  Ap[2].w *= fac;
-  Ap[3].w *= fac;
+  Ap_energy[0] *= fac; /* Only the scalar weights are rescaled; Ap[] stays as computed above. */
+  Ap_energy[1] *= fac;
+  Ap_energy[2] *= fac;
+  Ap_energy[3] *= fac;
 }
 
 /* Given the tilt angle, generate the rotated theta_i for the different bounces. */
@@ -266,13 +262,13 @@ ccl_device_inline void hair_alpha_angles(float sin_theta_i,
 }
 
 /* Evaluation function for our shader. */
-ccl_device float3 bsdf_principled_hair_eval(KernelGlobals kg,
-                                            ccl_private const ShaderData *sd,
-                                            ccl_private const ShaderClosure *sc,
-                                            const float3 omega_in,
-                                            ccl_private float *pdf)
+ccl_device Spectrum bsdf_principled_hair_eval(KernelGlobals kg,
+                                              ccl_private const ShaderData *sd,
+                                              ccl_private const ShaderClosure *sc,
+                                              const float3 omega_in,
+                                              ccl_private float *pdf)
 {
-  kernel_assert(isfinite3_safe(sd->P) && isfinite_safe(sd->ray_length));
+  kernel_assert(isfinite_safe(sd->P) && isfinite_safe(sd->ray_length));
 
   ccl_private const PrincipledHairBSDF *bsdf = (ccl_private const PrincipledHairBSDF *)sc;
   float3 Y = float4_to_float3(bsdf->extra->geom);
@@ -299,9 +295,11 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals kg,
   float cos_gamma_t = cos_from_sin(sin_gamma_t);
   float gamma_t = safe_asinf(sin_gamma_t);
 
-  float3 T = exp3(-bsdf->sigma * (2.0f * cos_gamma_t / cos_theta_t));
-  float4 Ap[4];
-  hair_attenuation(kg, fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta), T, Ap);
+  Spectrum T = exp(-bsdf->sigma * (2.0f * cos_gamma_t / cos_theta_t));
+  Spectrum Ap[4];
+  float Ap_energy[4];
+  hair_attenuation(
+      kg, fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta), T, Ap, Ap_energy);
 
   float sin_theta_i = wi.x;
   float cos_theta_i = cos_from_sin(sin_theta_i);
@@ -312,35 +310,40 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals kg,
   float angles[6];
   hair_alpha_angles(sin_theta_i, cos_theta_i, bsdf->alpha, angles);
 
-  float4 F;
+  Spectrum F;
+  float F_energy;
   float Mp, Np;
 
   /* Primary specular (R). */
   Mp = longitudinal_scattering(angles[0], angles[1], sin_theta_o, cos_theta_o, bsdf->m0_roughness);
   Np = azimuthal_scattering(phi, 0, bsdf->s, gamma_o, gamma_t);
   F = Ap[0] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy = Ap_energy[0] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
   /* Transmission (TT). */
   Mp = longitudinal_scattering(angles[2], angles[3], sin_theta_o, cos_theta_o, 0.25f * bsdf->v);
   Np = azimuthal_scattering(phi, 1, bsdf->s, gamma_o, gamma_t);
   F += Ap[1] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy += Ap_energy[1] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
   /* Secondary specular (TRT). */
   Mp = longitudinal_scattering(angles[4], angles[5], sin_theta_o, cos_theta_o, 4.0f * bsdf->v);
   Np = azimuthal_scattering(phi, 2, bsdf->s, gamma_o, gamma_t);
   F += Ap[2] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy += Ap_energy[2] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
   /* Residual component (TRRT+). */
   Mp = longitudinal_scattering(sin_theta_i, cos_theta_i, sin_theta_o, cos_theta_o, 4.0f * bsdf->v);
   Np = M_1_2PI_F;
   F += Ap[3] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy += Ap_energy[3] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
-  *pdf = F.w;
-  return float4_to_float3(F);
+  *pdf = F_energy;
+  return F;
 }
 
 /* Sampling function for the hair shader. */
@@ -349,10 +352,8 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals kg,
                                            ccl_private ShaderData *sd,
                                            float randu,
                                            float randv,
-                                           ccl_private float3 *eval,
+                                           ccl_private Spectrum *eval,
                                            ccl_private float3 *omega_in,
-                                           ccl_private float3 *domega_in_dx,
-                                           ccl_private float3 *domega_in_dy,
                                            ccl_private float *pdf)
 {
   ccl_private PrincipledHairBSDF *bsdf = (ccl_private PrincipledHairBSDF *)sc;
@@ -385,16 +386,18 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals kg,
   float cos_gamma_t = cos_from_sin(sin_gamma_t);
   float gamma_t = safe_asinf(sin_gamma_t);
 
-  float3 T = exp3(-bsdf->sigma * (2.0f * cos_gamma_t / cos_theta_t));
-  float4 Ap[4];
-  hair_attenuation(kg, fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta), T, Ap);
+  Spectrum T = exp(-bsdf->sigma * (2.0f * cos_gamma_t / cos_theta_t));
+  Spectrum Ap[4];
+  float Ap_energy[4];
+  hair_attenuation(
+      kg, fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta), T, Ap, Ap_energy);
 
   int p = 0;
   for (; p < 3; p++) {
-    if (u[0].x < Ap[p].w) {
+    if (u[0].x < Ap_energy[p]) {
       break;
     }
-    u[0].x -= Ap[p].w;
+    u[0].x -= Ap_energy[p];
   }
 
   float v = bsdf->v;
@@ -429,44 +432,43 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals kg,
 
   hair_alpha_angles(sin_theta_i, cos_theta_i, bsdf->alpha, angles);
 
-  float4 F;
+  Spectrum F;
+  float F_energy;
   float Mp, Np;
 
   /* Primary specular (R). */
   Mp = longitudinal_scattering(angles[0], angles[1], sin_theta_o, cos_theta_o, bsdf->m0_roughness);
   Np = azimuthal_scattering(phi, 0, bsdf->s, gamma_o, gamma_t);
   F = Ap[0] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy = Ap_energy[0] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
   /* Transmission (TT). */
   Mp = longitudinal_scattering(angles[2], angles[3], sin_theta_o, cos_theta_o, 0.25f * bsdf->v);
   Np = azimuthal_scattering(phi, 1, bsdf->s, gamma_o, gamma_t);
   F += Ap[1] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy += Ap_energy[1] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
   /* Secondary specular (TRT). */
   Mp = longitudinal_scattering(angles[4], angles[5], sin_theta_o, cos_theta_o, 4.0f * bsdf->v);
   Np = azimuthal_scattering(phi, 2, bsdf->s, gamma_o, gamma_t);
   F += Ap[2] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy += Ap_energy[2] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
   /* Residual component (TRRT+). */
   Mp = longitudinal_scattering(sin_theta_i, cos_theta_i, sin_theta_o, cos_theta_o, 4.0f * bsdf->v);
   Np = M_1_2PI_F;
   F += Ap[3] * Mp * Np;
-  kernel_assert(isfinite3_safe(float4_to_float3(F)));
+  F_energy += Ap_energy[3] * Mp * Np;
+  kernel_assert(isfinite_safe(F) && isfinite_safe(F_energy));
 
-  *eval = float4_to_float3(F);
-  *pdf = F.w;
+  *eval = F;
+  *pdf = F_energy;
 
   *omega_in = X * sin_theta_i + Y * cos_theta_i * cosf(phi_i) + Z * cos_theta_i * sinf(phi_i);
 
-#ifdef __RAY_DIFFERENTIALS__
-  float3 N = safe_normalize(sd->I + *omega_in);
-  *domega_in_dx = (2 * dot(N, sd->dI.dx)) * N - sd->dI.dx;
-  *domega_in_dy = (2 * dot(N, sd->dI.dy)) * N - sd->dI.dy;
-#endif
-
   return LABEL_GLOSSY | ((p == 0) ? LABEL_REFLECT : LABEL_TRANSMIT);
 }
 
@@ -489,25 +491,28 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
   return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
 }
 
-ccl_device float3 bsdf_principled_hair_albedo(ccl_private const ShaderClosure *sc)
+ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderClosure *sc)
 {
   ccl_private PrincipledHairBSDF *bsdf = (ccl_private PrincipledHairBSDF *)sc;
-  return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
+  return exp(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v)); /* Mirrors sigma_from_reflectance(): sigma = (log(color) / scale)^2. */
 }
 
-ccl_device_inline float3
-bsdf_principled_hair_sigma_from_reflectance(const float3 color, const float azimuthal_roughness)
+ccl_device_inline Spectrum /* Returns (log(color) / scale(azimuthal_roughness))^2. */
+bsdf_principled_hair_sigma_from_reflectance(const Spectrum color, const float azimuthal_roughness)
 {
-  const float3 sigma = log3(color) /
-                       bsdf_principled_hair_albedo_roughness_scale(azimuthal_roughness);
+  const Spectrum sigma = log(color) /
+                         bsdf_principled_hair_albedo_roughness_scale(azimuthal_roughness); /* Pre-square value despite the name. */
   return sigma * sigma;
 }
 
-ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const float eumelanin,
-                                                                       const float pheomelanin)
+ccl_device_inline Spectrum bsdf_principled_hair_sigma_from_concentration(const float eumelanin,
+                                                                         const float pheomelanin)
 {
-  return eumelanin * make_float3(0.506f, 0.841f, 1.653f) +
-         pheomelanin * make_float3(0.343f, 0.733f, 1.924f);
+  const float3 eumelanin_color = make_float3(0.506f, 0.841f, 1.653f); /* RGB triple, weighted by eumelanin below. */
+  const float3 pheomelanin_color = make_float3(0.343f, 0.733f, 1.924f); /* RGB triple, weighted by pheomelanin below. */
+
+  return eumelanin * rgb_to_spectrum(eumelanin_color) +
+         pheomelanin * rgb_to_spectrum(pheomelanin_color);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index db50712f9f0..04d5ca90bfd 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -17,8 +17,8 @@
 CCL_NAMESPACE_BEGIN
 
 typedef struct MicrofacetExtra {
-  float3 color, cspec0;
-  float3 fresnel_color;
+  Spectrum color, cspec0; /* cspec0 is saturate()-clamped in the fresnel/clearcoat setup. */
+  Spectrum fresnel_color;
   float clearcoat;
 } MicrofacetExtra;
 
@@ -233,11 +233,11 @@ ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals kg,
  *
  * Else it is simply white
  */
-ccl_device_forceinline float3 reflection_color(ccl_private const MicrofacetBsdf *bsdf,
-                                               float3 L,
-                                               float3 H)
+ccl_device_forceinline Spectrum reflection_color(ccl_private const MicrofacetBsdf *bsdf,
+                                                 float3 L,
+                                                 float3 H)
 {
-  float3 F = make_float3(1.0f, 1.0f, 1.0f);
+  Spectrum F = one_spectrum();
   bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
                       bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
   if (use_fresnel) {
@@ -310,7 +310,7 @@ ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *b
 ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
                                                  ccl_private const ShaderData *sd)
 {
-  bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
+  bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
 
   bsdf->alpha_x = saturatef(bsdf->alpha_x);
   bsdf->alpha_y = saturatef(bsdf->alpha_y);
@@ -325,7 +325,7 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
 ccl_device int bsdf_microfacet_ggx_clearcoat_setup(ccl_private MicrofacetBsdf *bsdf,
                                                    ccl_private const ShaderData *sd)
 {
-  bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
+  bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
 
   bsdf->alpha_x = saturatef(bsdf->alpha_x);
   bsdf->alpha_y = bsdf->alpha_x;
@@ -357,10 +357,10 @@ ccl_device void bsdf_microfacet_ggx_blur(ccl_private ShaderClosure *sc, float ro
   bsdf->alpha_y = fmaxf(roughness, bsdf->alpha_y);
 }
 
-ccl_device float3 bsdf_microfacet_ggx_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                   const float3 I,
-                                                   const float3 omega_in,
-                                                   ccl_private float *pdf)
+ccl_device Spectrum bsdf_microfacet_ggx_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                     const float3 I,
+                                                     const float3 omega_in,
+                                                     ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
   float alpha_x = bsdf->alpha_x;
@@ -370,7 +370,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(ccl_private const ShaderClosu
 
   if (m_refractive || alpha_x * alpha_y <= 1e-7f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   float cosNO = dot(N, I);
@@ -451,12 +451,12 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(ccl_private const ShaderClosu
     /* eq. 20 */
     float common = D * 0.25f / cosNO;
 
-    float3 F = reflection_color(bsdf, omega_in, m);
+    Spectrum F = reflection_color(bsdf, omega_in, m);
     if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
       F *= 0.25f * bsdf->extra->clearcoat;
     }
 
-    float3 out = F * G * common;
+    Spectrum out = F * G * common;
 
     /* eq. 2 in distribution of visible normals sampling
      * `pm = Dw = G1o * dot(m, I) * D / dot(N, I);` */
@@ -469,13 +469,13 @@ ccl_device float3 bsdf_microfacet_ggx_eval_reflect(ccl_private const ShaderClosu
     return out;
   }
 
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_microfacet_ggx_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                    const float3 I,
-                                                    const float3 omega_in,
-                                                    ccl_private float *pdf)
+ccl_device Spectrum bsdf_microfacet_ggx_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                      const float3 I,
+                                                      const float3 omega_in,
+                                                      ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
   float alpha_x = bsdf->alpha_x;
@@ -486,7 +486,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(ccl_private const ShaderClos
 
   if (!m_refractive || alpha_x * alpha_y <= 1e-7f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   float cosNO = dot(N, I);
@@ -494,7 +494,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(ccl_private const ShaderClos
 
   if (cosNO <= 0 || cosNI >= 0) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f); /* vectors on same side -- not possible */
+    return zero_spectrum(); /* vectors on same side -- not possible */
   }
   /* compute half-vector of the refraction (eq. 16) */
   float3 ht = -(m_eta * omega_in + I);
@@ -530,21 +530,17 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(ccl_private const ShaderClos
   float out = G * fabsf(cosHI * cosHO) * common;
   *pdf = G1o * fabsf(cosHO * cosHI) * common;
 
-  return make_float3(out, out, out);
+  return make_spectrum(out);
 }
 
 ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
                                           ccl_private const ShaderClosure *sc,
                                           float3 Ng,
                                           float3 I,
-                                          float3 dIdx,
-                                          float3 dIdy,
                                           float randu,
                                           float randv,
-                                          ccl_private float3 *eval,
+                                          ccl_private Spectrum *eval,
                                           ccl_private float3 *omega_in,
-                                          ccl_private float3 *domega_in_dx,
-                                          ccl_private float3 *domega_in_dy,
                                           ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
@@ -588,7 +584,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
           if (alpha_x * alpha_y <= 1e-7f) {
             /* some high number for MIS */
             *pdf = 1e6f;
-            *eval = make_float3(1e6f, 1e6f, 1e6f);
+            *eval = make_spectrum(1e6f);
 
             bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
                                 bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
@@ -664,7 +660,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
             float common = (G1o * D) * 0.25f / cosNO;
             *pdf = common;
 
-            float3 F = reflection_color(bsdf, *omega_in, m);
+            Spectrum F = reflection_color(bsdf, *omega_in, m);
 
             *eval = G1i * common * F;
           }
@@ -672,14 +668,9 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
           if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
             *eval *= 0.25f * bsdf->extra->clearcoat;
           }
-
-#ifdef __RAY_DIFFERENTIALS__
-          *domega_in_dx = (2 * dot(m, dIdx)) * m - dIdx;
-          *domega_in_dy = (2 * dot(m, dIdy)) * m - dIdy;
-#endif
         }
         else {
-          *eval = make_float3(0.0f, 0.0f, 0.0f);
+          *eval = zero_spectrum();
           *pdf = 0.0f;
         }
       }
@@ -690,39 +681,18 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
       /* CAUTION: the i and o variables are inverted relative to the paper
        * eq. 39 - compute actual refractive direction */
       float3 R, T;
-#ifdef __RAY_DIFFERENTIALS__
-      float3 dRdx, dRdy, dTdx, dTdy;
-#endif
       float m_eta = bsdf->ior, fresnel;
       bool inside;
 
-      fresnel = fresnel_dielectric(m_eta,
-                                   m,
-                                   I,
-                                   &R,
-                                   &T,
-#ifdef __RAY_DIFFERENTIALS__
-                                   dIdx,
-                                   dIdy,
-                                   &dRdx,
-                                   &dRdy,
-                                   &dTdx,
-                                   &dTdy,
-#endif
-                                   &inside);
+      fresnel = fresnel_dielectric(m_eta, m, I, &R, &T, &inside);
 
       if (!inside && fresnel != 1.0f) {
-
         *omega_in = T;
-#ifdef __RAY_DIFFERENTIALS__
-        *domega_in_dx = dTdx;
-        *domega_in_dy = dTdy;
-#endif
 
         if (alpha_x * alpha_y <= 1e-7f || fabsf(m_eta - 1.0f) < 1e-4f) {
           /* some high number for MIS */
           *pdf = 1e6f;
-          *eval = make_float3(1e6f, 1e6f, 1e6f);
+          *eval = make_spectrum(1e6f);
           label = LABEL_TRANSMIT | LABEL_SINGULAR;
         }
         else {
@@ -750,11 +720,11 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
           float out = G1i * fabsf(cosHI * cosHO) * common;
           *pdf = cosHO * fabsf(cosHI) * common;
 
-          *eval = make_float3(out, out, out);
+          *eval = make_spectrum(out);
         }
       }
       else {
-        *eval = make_float3(0.0f, 0.0f, 0.0f);
+        *eval = zero_spectrum();
         *pdf = 0.0f;
       }
     }
@@ -835,10 +805,10 @@ ccl_device_inline float bsdf_beckmann_aniso_G1(
   return ((2.181f * a + 3.535f) * a) / ((2.577f * a + 2.276f) * a + 1.0f);
 }
 
-ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                        const float3 I,
-                                                        const float3 omega_in,
-                                                        ccl_private float *pdf)
+ccl_device Spectrum bsdf_microfacet_beckmann_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                          const float3 I,
+                                                          const float3 omega_in,
+                                                          ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
   float alpha_x = bsdf->alpha_x;
@@ -848,7 +818,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(ccl_private const Shader
 
   if (m_refractive || alpha_x * alpha_y <= 1e-7f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   float cosNO = dot(N, I);
@@ -910,16 +880,16 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_reflect(ccl_private const Shader
      * pdf = pm * 0.25 / dot(m, I); */
     *pdf = G1o * common;
 
-    return make_float3(out, out, out);
+    return make_spectrum(out);
   }
 
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                         const float3 I,
-                                                         const float3 omega_in,
-                                                         ccl_private float *pdf)
+ccl_device Spectrum bsdf_microfacet_beckmann_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                           const float3 I,
+                                                           const float3 omega_in,
+                                                           ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
   float alpha_x = bsdf->alpha_x;
@@ -930,7 +900,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(ccl_private const Shade
 
   if (!m_refractive || alpha_x * alpha_y <= 1e-7f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   float cosNO = dot(N, I);
@@ -938,7 +908,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(ccl_private const Shade
 
   if (cosNO <= 0 || cosNI >= 0) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
   /* compute half-vector of the refraction (eq. 16) */
   float3 ht = -(m_eta * omega_in + I);
@@ -971,21 +941,17 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(ccl_private const Shade
   float out = G * fabsf(cosHI * cosHO) * common;
   *pdf = G1o * fabsf(cosHO * cosHI) * common;
 
-  return make_float3(out, out, out);
+  return make_spectrum(out);
 }
 
 ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
                                                ccl_private const ShaderClosure *sc,
                                                float3 Ng,
                                                float3 I,
-                                               float3 dIdx,
-                                               float3 dIdy,
                                                float randu,
                                                float randv,
-                                               ccl_private float3 *eval,
+                                               ccl_private Spectrum *eval,
                                                ccl_private float3 *omega_in,
-                                               ccl_private float3 *domega_in_dx,
-                                               ccl_private float3 *domega_in_dy,
                                                ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
@@ -1028,7 +994,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
           if (alpha_x * alpha_y <= 1e-7f) {
             /* some high number for MIS */
             *pdf = 1e6f;
-            *eval = make_float3(1e6f, 1e6f, 1e6f);
+            *eval = make_spectrum(1e6f);
             label = LABEL_REFLECT | LABEL_SINGULAR;
           }
           else {
@@ -1074,16 +1040,11 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
             float out = G * common;
             *pdf = G1o * common;
 
-            *eval = make_float3(out, out, out);
+            *eval = make_spectrum(out);
           }
-
-#ifdef __RAY_DIFFERENTIALS__
-          *domega_in_dx = (2 * dot(m, dIdx)) * m - dIdx;
-          *domega_in_dy = (2 * dot(m, dIdy)) * m - dIdy;
-#endif
         }
         else {
-          *eval = make_float3(0.0f, 0.0f, 0.0f);
+          *eval = zero_spectrum();
           *pdf = 0.0f;
         }
       }
@@ -1094,39 +1055,18 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
       /* CAUTION: the i and o variables are inverted relative to the paper
        * eq. 39 - compute actual refractive direction */
       float3 R, T;
-#ifdef __RAY_DIFFERENTIALS__
-      float3 dRdx, dRdy, dTdx, dTdy;
-#endif
       float m_eta = bsdf->ior, fresnel;
       bool inside;
 
-      fresnel = fresnel_dielectric(m_eta,
-                                   m,
-                                   I,
-                                   &R,
-                                   &T,
-#ifdef __RAY_DIFFERENTIALS__
-                                   dIdx,
-                                   dIdy,
-                                   &dRdx,
-                                   &dRdy,
-                                   &dTdx,
-                                   &dTdy,
-#endif
-                                   &inside);
+      fresnel = fresnel_dielectric(m_eta, m, I, &R, &T, &inside);
 
       if (!inside && fresnel != 1.0f) {
         *omega_in = T;
 
-#ifdef __RAY_DIFFERENTIALS__
-        *domega_in_dx = dTdx;
-        *domega_in_dy = dTdy;
-#endif
-
         if (alpha_x * alpha_y <= 1e-7f || fabsf(m_eta - 1.0f) < 1e-4f) {
           /* some high number for MIS */
           *pdf = 1e6f;
-          *eval = make_float3(1e6f, 1e6f, 1e6f);
+          *eval = make_spectrum(1e6f);
           label = LABEL_TRANSMIT | LABEL_SINGULAR;
         }
         else {
@@ -1155,11 +1095,11 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
           float out = G * fabsf(cosHI * cosHO) * common;
           *pdf = G1o * cosHO * fabsf(cosHI) * common;
 
-          *eval = make_float3(out, out, out);
+          *eval = make_spectrum(out);
         }
       }
       else {
-        *eval = make_float3(0.0f, 0.0f, 0.0f);
+        *eval = zero_spectrum();
         *pdf = 0.0f;
       }
     }
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 10027ae9f77..9402ce11f7a 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -95,29 +95,29 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi,
 
 /* Phase function for reflective materials. */
 ccl_device_forceinline float3 mf_sample_phase_glossy(const float3 wi,
-                                                     ccl_private float3 *weight,
+                                                     ccl_private Spectrum *weight,
                                                      const float3 wm)
 {
   return -wi + 2.0f * wm * dot(wi, wm);
 }
 
-ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w,
-                                                   const float lambda,
-                                                   const float3 wo,
-                                                   const float2 alpha)
+ccl_device_forceinline Spectrum mf_eval_phase_glossy(const float3 w,
+                                                     const float lambda,
+                                                     const float3 wo,
+                                                     const float2 alpha)
 {
   if (w.z > 0.9999f)
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
 
   const float3 wh = normalize(wo - w);
   if (wh.z < 0.0f)
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
 
   float pArea = (w.z < -0.9999f) ? 1.0f : lambda * w.z;
 
   const float dotW_WH = dot(-w, wh);
   if (dotW_WH < 0.0f)
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
 
   float phase = max(0.0f, dotW_WH) * 0.25f / max(pArea * dotW_WH, 1e-7f);
   if (alpha.x == alpha.y)
@@ -125,7 +125,7 @@ ccl_device_forceinline float3 mf_eval_phase_glossy(const float3 w,
   else
     phase *= D_ggx_aniso(wh, alpha);
 
-  return make_float3(phase, phase, phase);
+  return make_spectrum(phase);
 }
 
 /* Phase function for dielectric transmissive materials, including both reflection and refraction
@@ -148,22 +148,22 @@ ccl_device_forceinline float3 mf_sample_phase_glass(const float3 wi,
   return normalize(wm * (cosI * inv_eta + cosT) - wi * inv_eta);
 }
 
-ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w,
-                                                  const float lambda,
-                                                  const float3 wo,
-                                                  const bool wo_outside,
-                                                  const float2 alpha,
-                                                  const float eta)
+ccl_device_forceinline Spectrum mf_eval_phase_glass(const float3 w,
+                                                    const float lambda,
+                                                    const float3 wo,
+                                                    const bool wo_outside,
+                                                    const float2 alpha,
+                                                    const float eta)
 {
   if (w.z > 0.9999f)
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
 
   float pArea = (w.z < -0.9999f) ? 1.0f : lambda * w.z;
   float v;
   if (wo_outside) {
     const float3 wh = normalize(wo - w);
     if (wh.z < 0.0f)
-      return make_float3(0.0f, 0.0f, 0.0f);
+      return zero_spectrum();
 
     const float dotW_WH = dot(-w, wh);
     v = fresnel_dielectric_cos(dotW_WH, eta) * max(0.0f, dotW_WH) * D_ggx(wh, alpha.x) * 0.25f /
@@ -175,14 +175,14 @@ ccl_device_forceinline float3 mf_eval_phase_glass(const float3 w,
       wh = -wh;
     const float dotW_WH = dot(-w, wh), dotWO_WH = dot(wo, wh);
     if (dotW_WH < 0.0f)
-      return make_float3(0.0f, 0.0f, 0.0f);
+      return zero_spectrum();
 
     float temp = dotW_WH + eta * dotWO_WH;
     v = (1.0f - fresnel_dielectric_cos(dotW_WH, eta)) * max(0.0f, dotW_WH) * max(0.0f, -dotWO_WH) *
         D_ggx(wh, alpha.x) / (pArea * temp * temp);
   }
 
-  return make_float3(v, v, v);
+  return make_spectrum(v);
 }
 
 /* === Utility functions for the random walks === */
@@ -371,14 +371,14 @@ ccl_device void bsdf_microfacet_multi_ggx_blur(ccl_private ShaderClosure *sc, fl
 
 /* === Closure implementations === */
 
-/* Multiscattering GGX Glossy closure */
+/* Multi-scattering GGX Glossy closure */
 
 ccl_device int bsdf_microfacet_multi_ggx_common_setup(ccl_private MicrofacetBsdf *bsdf)
 {
   bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
   bsdf->alpha_y = clamp(bsdf->alpha_y, 1e-4f, 1.0f);
-  bsdf->extra->color = saturate3(bsdf->extra->color);
-  bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
+  bsdf->extra->color = saturate(bsdf->extra->color);
+  bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
 
   return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
 }
@@ -415,27 +415,27 @@ ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(ccl_private Microfacet
   return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
 
-ccl_device float3 bsdf_microfacet_multi_ggx_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                          const float3 I,
-                                                          const float3 omega_in,
-                                                          ccl_private float *pdf,
-                                                          ccl_private uint *lcg_state)
+ccl_device Spectrum bsdf_microfacet_multi_ggx_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                            const float3 I,
+                                                            const float3 omega_in,
+                                                            ccl_private float *pdf,
+                                                            ccl_private uint *lcg_state)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                         const float3 I,
-                                                         const float3 omega_in,
-                                                         ccl_private float *pdf,
-                                                         ccl_private uint *lcg_state)
+ccl_device Spectrum bsdf_microfacet_multi_ggx_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                           const float3 I,
+                                                           const float3 omega_in,
+                                                           ccl_private float *pdf,
+                                                           ccl_private uint *lcg_state)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
 
   if (bsdf->alpha_x * bsdf->alpha_y < 1e-7f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   float3 X, Y, Z;
@@ -444,7 +444,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(ccl_private const Shade
   /* Ensure that the both directions are on the outside w.r.t. the shading normal. */
   if (dot(Z, I) <= 0.0f || dot(Z, omega_in) <= 0.0f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID);
@@ -478,14 +478,10 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals kg,
                                                 ccl_private const ShaderClosure *sc,
                                                 float3 Ng,
                                                 float3 I,
-                                                float3 dIdx,
-                                                float3 dIdy,
                                                 float randu,
                                                 float randv,
-                                                ccl_private float3 *eval,
+                                                ccl_private Spectrum *eval,
                                                 ccl_private float3 *omega_in,
-                                                ccl_private float3 *domega_in_dx,
-                                                ccl_private float3 *domega_in_dy,
                                                 ccl_private float *pdf,
                                                 ccl_private uint *lcg_state)
 {
@@ -509,11 +505,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals kg,
       return LABEL_NONE;
     }
     *pdf = 1e6f;
-    *eval = make_float3(1e6f, 1e6f, 1e6f);
-#ifdef __RAY_DIFFERENTIALS__
-    *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
-    *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
-#endif
+    *eval = make_spectrum(1e6f);
     return LABEL_REFLECT | LABEL_SINGULAR;
   }
 
@@ -551,21 +543,17 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals kg,
     *pdf = mf_ggx_pdf(localI, localO, bsdf->alpha_x);
   *eval *= *pdf;
 
-#ifdef __RAY_DIFFERENTIALS__
-  *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
-  *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
-#endif
   return LABEL_REFLECT | LABEL_GLOSSY;
 }
 
-/* Multiscattering GGX Glass closure */
+/* Multi-scattering GGX Glass closure */
 
 ccl_device int bsdf_microfacet_multi_ggx_glass_setup(ccl_private MicrofacetBsdf *bsdf)
 {
   bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
   bsdf->alpha_y = bsdf->alpha_x;
   bsdf->ior = max(0.0f, bsdf->ior);
-  bsdf->extra->color = saturate3(bsdf->extra->color);
+  bsdf->extra->color = saturate(bsdf->extra->color);
 
   bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;
 
@@ -578,8 +566,8 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(ccl_private Microfa
   bsdf->alpha_x = clamp(bsdf->alpha_x, 1e-4f, 1.0f);
   bsdf->alpha_y = bsdf->alpha_x;
   bsdf->ior = max(0.0f, bsdf->ior);
-  bsdf->extra->color = saturate3(bsdf->extra->color);
-  bsdf->extra->cspec0 = saturate3(bsdf->extra->cspec0);
+  bsdf->extra->color = saturate(bsdf->extra->color);
+  bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
 
   bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
 
@@ -588,7 +576,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(ccl_private Microfa
   return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
 }
 
-ccl_device float3
+ccl_device Spectrum
 bsdf_microfacet_multi_ggx_glass_eval_transmit(ccl_private const ShaderClosure *sc,
                                               const float3 I,
                                               const float3 omega_in,
@@ -599,7 +587,7 @@ bsdf_microfacet_multi_ggx_glass_eval_transmit(ccl_private const ShaderClosure *s
 
   if (bsdf->alpha_x * bsdf->alpha_y < 1e-7f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   float3 X, Y, Z;
@@ -622,17 +610,18 @@ bsdf_microfacet_multi_ggx_glass_eval_transmit(ccl_private const ShaderClosure *s
                        bsdf->extra->color);
 }
 
-ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                               const float3 I,
-                                                               const float3 omega_in,
-                                                               ccl_private float *pdf,
-                                                               ccl_private uint *lcg_state)
+ccl_device Spectrum
+bsdf_microfacet_multi_ggx_glass_eval_reflect(ccl_private const ShaderClosure *sc,
+                                             const float3 I,
+                                             const float3 omega_in,
+                                             ccl_private float *pdf,
+                                             ccl_private uint *lcg_state)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
 
   if (bsdf->alpha_x * bsdf->alpha_y < 1e-7f) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID);
@@ -661,14 +650,10 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals kg,
                                                       ccl_private const ShaderClosure *sc,
                                                       float3 Ng,
                                                       float3 I,
-                                                      float3 dIdx,
-                                                      float3 dIdy,
                                                       float randu,
                                                       float randv,
-                                                      ccl_private float3 *eval,
+                                                      ccl_private Spectrum *eval,
                                                       ccl_private float3 *omega_in,
-                                                      ccl_private float3 *domega_in_dx,
-                                                      ccl_private float3 *domega_in_dy,
                                                       ccl_private float *pdf,
                                                       ccl_private uint *lcg_state)
 {
@@ -679,41 +664,17 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals kg,
 
   if (bsdf->alpha_x * bsdf->alpha_y < 1e-7f) {
     float3 R, T;
-#ifdef __RAY_DIFFERENTIALS__
-    float3 dRdx, dRdy, dTdx, dTdy;
-#endif
     bool inside;
-    float fresnel = fresnel_dielectric(bsdf->ior,
-                                       Z,
-                                       I,
-                                       &R,
-                                       &T,
-#ifdef __RAY_DIFFERENTIALS__
-                                       dIdx,
-                                       dIdy,
-                                       &dRdx,
-                                       &dRdy,
-                                       &dTdx,
-                                       &dTdy,
-#endif
-                                       &inside);
+    float fresnel = fresnel_dielectric(bsdf->ior, Z, I, &R, &T, &inside);
 
     *pdf = 1e6f;
-    *eval = make_float3(1e6f, 1e6f, 1e6f);
+    *eval = make_spectrum(1e6f);
     if (randu < fresnel) {
       *omega_in = R;
-#ifdef __RAY_DIFFERENTIALS__
-      *domega_in_dx = dRdx;
-      *domega_in_dy = dRdy;
-#endif
       return LABEL_REFLECT | LABEL_SINGULAR;
     }
     else {
       *omega_in = T;
-#ifdef __RAY_DIFFERENTIALS__
-      *domega_in_dx = dTdx;
-      *domega_in_dy = dTdy;
-#endif
       return LABEL_TRANSMIT | LABEL_SINGULAR;
     }
   }
@@ -739,22 +700,9 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals kg,
 
   *omega_in = X * localO.x + Y * localO.y + Z * localO.z;
   if (localO.z * localI.z > 0.0f) {
-#ifdef __RAY_DIFFERENTIALS__
-    *domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
-    *domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
-#endif
     return LABEL_REFLECT | LABEL_GLOSSY;
   }
   else {
-#ifdef __RAY_DIFFERENTIALS__
-    float cosI = dot(Z, I);
-    float dnp = max(sqrtf(1.0f - (bsdf->ior * bsdf->ior * (1.0f - cosI * cosI))), 1e-7f);
-    *domega_in_dx = -(bsdf->ior * dIdx) +
-                    ((bsdf->ior - bsdf->ior * bsdf->ior * cosI / dnp) * dot(dIdx, Z)) * Z;
-    *domega_in_dy = -(bsdf->ior * dIdy) +
-                    ((bsdf->ior - bsdf->ior * bsdf->ior * cosI / dnp) * dot(dIdy, Z)) * Z;
-#endif
-
     return LABEL_TRANSMIT | LABEL_GLOSSY;
   }
 }
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index e4fcf0e6ba3..91fb9158050 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -12,16 +12,16 @@
  * multi-scattered energy is used. In combination with MIS, that is enough to produce an unbiased
  * result, although the balance heuristic isn't necessarily optimal anymore.
  */
-ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
-                                                             float3 wo,
-                                                             const bool wo_outside,
-                                                             const float3 color,
-                                                             const float alpha_x,
-                                                             const float alpha_y,
-                                                             ccl_private uint *lcg_state,
-                                                             const float eta,
-                                                             bool use_fresnel,
-                                                             const float3 cspec0)
+ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
+                                                               float3 wo,
+                                                               const bool wo_outside,
+                                                               const Spectrum color,
+                                                               const float alpha_x,
+                                                               const float alpha_y,
+                                                               ccl_private uint *lcg_state,
+                                                               const float eta,
+                                                               bool use_fresnel,
+                                                               const Spectrum cspec0)
 {
   /* Evaluating for a shallower incoming direction produces less noise, and the properties of the
    * BSDF guarantee reciprocity. */
@@ -46,7 +46,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
   }
 
   if (wi.z < 1e-5f || (wo.z < 1e-5f && wo_outside) || (wo.z > -1e-5f && !wo_outside))
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
 
   const float2 alpha = make_float2(alpha_x, alpha_y);
 
@@ -54,8 +54,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
   float shadowing_lambda = mf_lambda(wo_outside ? wo : -wo, alpha);
 
   /* Analytically compute single scattering for lower noise. */
-  float3 eval;
-  float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+  Spectrum eval;
+  Spectrum throughput = one_spectrum();
   const float3 wh = normalize(wi + wo);
 #ifdef MF_MULTI_GLASS
   eval = mf_eval_phase_glass(-wi, lambda_r, wo, wo_outside, alpha, eta);
@@ -70,7 +70,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
     val *= D_ggx(wh, alpha.x);
   else
     val *= D_ggx_aniso(wh, alpha);
-  eval = make_float3(val, val, val);
+  eval = make_spectrum(val);
 #endif
 
   float F0 = fresnel_dielectric_cos(1.0f, eta);
@@ -99,7 +99,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
 #ifdef MF_MULTI_GLASS
     if (order == 0 && use_fresnel) {
       /* Evaluate amount of scattering towards wo on this microfacet. */
-      float3 phase;
+      Spectrum phase;
       if (outside)
         phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
       else
@@ -113,7 +113,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
 #endif
     if (order > 0) {
       /* Evaluate amount of scattering towards wo on this microfacet. */
-      float3 phase;
+      Spectrum phase;
 #ifdef MF_MULTI_GLASS
       if (outside)
         phase = mf_eval_phase_glass(wr, lambda_r, wo, wo_outside, alpha, eta);
@@ -172,19 +172,19 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
  * walk escaped the surface in wo. The function returns the throughput between wi and wo. Without
  * reflection losses due to coloring or fresnel absorption in conductors, the sampling is optimal.
  */
-ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
-                                                               ccl_private float3 *wo,
-                                                               const float3 color,
-                                                               const float alpha_x,
-                                                               const float alpha_y,
-                                                               ccl_private uint *lcg_state,
-                                                               const float eta,
-                                                               bool use_fresnel,
-                                                               const float3 cspec0)
+ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
+                                                                 ccl_private float3 *wo,
+                                                                 const Spectrum color,
+                                                                 const float alpha_x,
+                                                                 const float alpha_y,
+                                                                 ccl_private uint *lcg_state,
+                                                                 const float eta,
+                                                                 bool use_fresnel,
+                                                                 const Spectrum cspec0)
 {
   const float2 alpha = make_float2(alpha_x, alpha_y);
 
-  float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+  Spectrum throughput = one_spectrum();
   float3 wr = -wi;
   float lambda_r = mf_lambda(wr, alpha);
   float hr = 1.0f;
@@ -229,7 +229,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
         throughput *= color;
       }
       else {
-        float3 t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+        Spectrum t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
 
         if (order == 0)
           throughput = t_color;
@@ -239,7 +239,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
     }
 #else /* MF_MULTI_GLOSSY */
     if (use_fresnel) {
-      float3 t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+      Spectrum t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
 
       if (order == 0)
         throughput = t_color;
@@ -254,7 +254,7 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
     G1_r = mf_G1(wr, C1_r, lambda_r);
   }
   *wo = make_float3(0.0f, 0.0f, 1.0f);
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 #undef MF_MULTI_GLASS
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 56c7ec869c7..b85390f0676 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -15,10 +15,10 @@ typedef struct OrenNayarBsdf {
 
 static_assert(sizeof(ShaderClosure) >= sizeof(OrenNayarBsdf), "OrenNayarBsdf is too large!");
 
-ccl_device float3 bsdf_oren_nayar_get_intensity(ccl_private const ShaderClosure *sc,
-                                                float3 n,
-                                                float3 v,
-                                                float3 l)
+ccl_device Spectrum bsdf_oren_nayar_get_intensity(ccl_private const ShaderClosure *sc,
+                                                  float3 n,
+                                                  float3 v,
+                                                  float3 l)
 {
   ccl_private const OrenNayarBsdf *bsdf = (ccl_private const OrenNayarBsdf *)sc;
   float nl = max(dot(n, l), 0.0f);
@@ -28,7 +28,7 @@ ccl_device float3 bsdf_oren_nayar_get_intensity(ccl_private const ShaderClosure
   if (t > 0.0f)
     t /= max(nl, nv) + FLT_MIN;
   float is = nl * (bsdf->a + bsdf->b * t);
-  return make_float3(is, is, is);
+  return make_spectrum(is);
 }
 
 ccl_device int bsdf_oren_nayar_setup(ccl_private OrenNayarBsdf *bsdf)
@@ -47,10 +47,10 @@ ccl_device int bsdf_oren_nayar_setup(ccl_private OrenNayarBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_oren_nayar_eval_reflect(ccl_private const ShaderClosure *sc,
-                                               const float3 I,
-                                               const float3 omega_in,
-                                               ccl_private float *pdf)
+ccl_device Spectrum bsdf_oren_nayar_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                 const float3 I,
+                                                 const float3 omega_in,
+                                                 ccl_private float *pdf)
 {
   ccl_private const OrenNayarBsdf *bsdf = (ccl_private const OrenNayarBsdf *)sc;
   if (dot(bsdf->N, omega_in) > 0.0f) {
@@ -59,30 +59,26 @@ ccl_device float3 bsdf_oren_nayar_eval_reflect(ccl_private const ShaderClosure *
   }
   else {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 }
 
-ccl_device float3 bsdf_oren_nayar_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                const float3 I,
-                                                const float3 omega_in,
-                                                ccl_private float *pdf)
+ccl_device Spectrum bsdf_oren_nayar_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                  const float3 I,
+                                                  const float3 omega_in,
+                                                  ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_oren_nayar_sample(ccl_private const ShaderClosure *sc,
                                       float3 Ng,
                                       float3 I,
-                                      float3 dIdx,
-                                      float3 dIdy,
                                       float randu,
                                       float randv,
-                                      ccl_private float3 *eval,
+                                      ccl_private Spectrum *eval,
                                       ccl_private float3 *omega_in,
-                                      ccl_private float3 *domega_in_dx,
-                                      ccl_private float3 *domega_in_dy,
                                       ccl_private float *pdf)
 {
   ccl_private const OrenNayarBsdf *bsdf = (ccl_private const OrenNayarBsdf *)sc;
@@ -90,16 +86,10 @@ ccl_device int bsdf_oren_nayar_sample(ccl_private const ShaderClosure *sc,
 
   if (dot(Ng, *omega_in) > 0.0f) {
     *eval = bsdf_oren_nayar_get_intensity(sc, bsdf->N, I, *omega_in);
-
-#ifdef __RAY_DIFFERENTIALS__
-    // TODO: find a better approximation for the bounce
-    *domega_in_dx = (2.0f * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
-    *domega_in_dy = (2.0f * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
-#endif
   }
   else {
     *pdf = 0.0f;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
 
   return LABEL_REFLECT | LABEL_DIFFUSE;
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index 74a1f7ae090..4236e77ae6c 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -8,6 +8,8 @@
 
 #pragma once
 
+#include "kernel/util/color.h"
+
 CCL_NAMESPACE_BEGIN
 
 #ifdef __OSL__
@@ -42,10 +44,10 @@ ccl_device int bsdf_phong_ramp_setup(ccl_private PhongRampBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_phong_ramp_eval_reflect(ccl_private const ShaderClosure *sc,
-                                               const float3 I,
-                                               const float3 omega_in,
-                                               ccl_private float *pdf)
+ccl_device Spectrum bsdf_phong_ramp_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                 const float3 I,
+                                                 const float3 omega_in,
+                                                 ccl_private float *pdf)
 {
   ccl_private const PhongRampBsdf *bsdf = (ccl_private const PhongRampBsdf *)sc;
   float m_exponent = bsdf->exponent;
@@ -61,11 +63,11 @@ ccl_device float3 bsdf_phong_ramp_eval_reflect(ccl_private const ShaderClosure *
       float common = 0.5f * M_1_PI_F * cosp;
       float out = cosNI * (m_exponent + 2) * common;
       *pdf = (m_exponent + 1) * common;
-      return bsdf_phong_ramp_get_color(bsdf->colors, cosp) * out;
+      return rgb_to_spectrum(bsdf_phong_ramp_get_color(bsdf->colors, cosp) * out);
     }
   }
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device float3 bsdf_phong_ramp_eval_transmit(ccl_private const ShaderClosure *sc,
@@ -80,14 +82,10 @@ ccl_device float3 bsdf_phong_ramp_eval_transmit(ccl_private const ShaderClosure
 ccl_device int bsdf_phong_ramp_sample(ccl_private const ShaderClosure *sc,
                                       float3 Ng,
                                       float3 I,
-                                      float3 dIdx,
-                                      float3 dIdy,
                                       float randu,
                                       float randv,
-                                      ccl_private float3 *eval,
+                                      ccl_private Spectrum *eval,
                                       ccl_private float3 *omega_in,
-                                      ccl_private float3 *domega_in_dx,
-                                      ccl_private float3 *domega_in_dy,
                                       ccl_private float *pdf)
 {
   ccl_private const PhongRampBsdf *bsdf = (ccl_private const PhongRampBsdf *)sc;
@@ -97,12 +95,6 @@ ccl_device int bsdf_phong_ramp_sample(ccl_private const ShaderClosure *sc,
   if (cosNO > 0) {
     // reflect the view vector
     float3 R = (2 * cosNO) * bsdf->N - I;
-
-#  ifdef __RAY_DIFFERENTIALS__
-    *domega_in_dx = (2 * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
-    *domega_in_dy = (2 * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
-#  endif
-
     float3 T, B;
     make_orthonormals(R, &T, &B);
     float phi = M_2PI_F * randu;
@@ -119,12 +111,12 @@ ccl_device int bsdf_phong_ramp_sample(ccl_private const ShaderClosure *sc,
         float common = 0.5f * M_1_PI_F * cosp;
         *pdf = (m_exponent + 1) * common;
         float out = cosNI * (m_exponent + 2) * common;
-        *eval = bsdf_phong_ramp_get_color(bsdf->colors, cosp) * out;
+        *eval = rgb_to_spectrum(bsdf_phong_ramp_get_color(bsdf->colors, cosp) * out);
       }
     }
   }
   else {
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
     *pdf = 0.0f;
   }
   return LABEL_REFLECT | LABEL_GLOSSY;
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index 5a7020e82d2..39cca1bd970 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -42,7 +42,7 @@ ccl_device int bsdf_principled_diffuse_setup(ccl_private PrincipledDiffuseBsdf *
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3
+ccl_device Spectrum
 bsdf_principled_diffuse_compute_brdf(ccl_private const PrincipledDiffuseBsdf *bsdf,
                                      float3 N,
                                      float3 V,
@@ -52,7 +52,7 @@ bsdf_principled_diffuse_compute_brdf(ccl_private const PrincipledDiffuseBsdf *bs
   const float NdotL = dot(N, L);
 
   if (NdotL <= 0) {
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   const float NdotV = dot(N, V);
@@ -82,7 +82,7 @@ bsdf_principled_diffuse_compute_brdf(ccl_private const PrincipledDiffuseBsdf *bs
 
   float value = M_1_PI_F * NdotL * f;
 
-  return make_float3(value, value, value);
+  return make_spectrum(value);
 }
 
 /* Compute Fresnel at entry point, to be combined with #PRINCIPLED_DIFFUSE_LAMBERT_EXIT
@@ -109,10 +109,10 @@ ccl_device int bsdf_principled_diffuse_setup(ccl_private PrincipledDiffuseBsdf *
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_principled_diffuse_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                       const float3 I,
-                                                       const float3 omega_in,
-                                                       ccl_private float *pdf)
+ccl_device Spectrum bsdf_principled_diffuse_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                         const float3 I,
+                                                         const float3 omega_in,
+                                                         ccl_private float *pdf)
 {
   ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)sc;
 
@@ -126,30 +126,26 @@ ccl_device float3 bsdf_principled_diffuse_eval_reflect(ccl_private const ShaderC
   }
   else {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 }
 
-ccl_device float3 bsdf_principled_diffuse_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                        const float3 I,
-                                                        const float3 omega_in,
-                                                        ccl_private float *pdf)
+ccl_device Spectrum bsdf_principled_diffuse_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                          const float3 I,
+                                                          const float3 omega_in,
+                                                          ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_principled_diffuse_sample(ccl_private const ShaderClosure *sc,
                                               float3 Ng,
                                               float3 I,
-                                              float3 dIdx,
-                                              float3 dIdy,
                                               float randu,
                                               float randv,
-                                              ccl_private float3 *eval,
+                                              ccl_private Spectrum *eval,
                                               ccl_private float3 *omega_in,
-                                              ccl_private float3 *domega_in_dx,
-                                              ccl_private float3 *domega_in_dy,
                                               ccl_private float *pdf)
 {
   ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)sc;
@@ -160,16 +156,10 @@ ccl_device int bsdf_principled_diffuse_sample(ccl_private const ShaderClosure *s
 
   if (dot(Ng, *omega_in) > 0) {
     *eval = bsdf_principled_diffuse_compute_brdf(bsdf, N, I, *omega_in, pdf);
-
-#ifdef __RAY_DIFFERENTIALS__
-    // TODO: find a better approximation for the diffuse bounce
-    *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
-    *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
-#endif
   }
   else {
     *pdf = 0.0f;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
   return LABEL_REFLECT | LABEL_DIFFUSE;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 3a96a93db73..fa46f47eb21 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -32,7 +32,7 @@ ccl_device_inline float calculate_avg_principled_sheen_brdf(float3 N, float3 I)
   return schlick_fresnel(NdotI) * NdotI;
 }
 
-ccl_device float3
+ccl_device Spectrum
 calculate_principled_sheen_brdf(float3 N, float3 V, float3 L, float3 H, ccl_private float *pdf)
 {
   float NdotL = dot(N, L);
@@ -40,14 +40,14 @@ calculate_principled_sheen_brdf(float3 N, float3 V, float3 L, float3 H, ccl_priv
 
   if (NdotL < 0 || NdotV < 0) {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 
   float LdotH = dot(L, H);
 
   float value = schlick_fresnel(LdotH) * NdotL;
 
-  return make_float3(value, value, value);
+  return make_spectrum(value);
 }
 
 ccl_device int bsdf_principled_sheen_setup(ccl_private const ShaderData *sd,
@@ -59,10 +59,10 @@ ccl_device int bsdf_principled_sheen_setup(ccl_private const ShaderData *sd,
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_principled_sheen_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                     const float3 I,
-                                                     const float3 omega_in,
-                                                     ccl_private float *pdf)
+ccl_device Spectrum bsdf_principled_sheen_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                       const float3 I,
+                                                       const float3 omega_in,
+                                                       ccl_private float *pdf)
 {
   ccl_private const PrincipledSheenBsdf *bsdf = (ccl_private const PrincipledSheenBsdf *)sc;
 
@@ -77,30 +77,26 @@ ccl_device float3 bsdf_principled_sheen_eval_reflect(ccl_private const ShaderClo
   }
   else {
     *pdf = 0.0f;
-    return make_float3(0.0f, 0.0f, 0.0f);
+    return zero_spectrum();
   }
 }
 
-ccl_device float3 bsdf_principled_sheen_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                      const float3 I,
-                                                      const float3 omega_in,
-                                                      ccl_private float *pdf)
+ccl_device Spectrum bsdf_principled_sheen_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                        const float3 I,
+                                                        const float3 omega_in,
+                                                        ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_principled_sheen_sample(ccl_private const ShaderClosure *sc,
                                             float3 Ng,
                                             float3 I,
-                                            float3 dIdx,
-                                            float3 dIdy,
                                             float randu,
                                             float randv,
-                                            ccl_private float3 *eval,
+                                            ccl_private Spectrum *eval,
                                             ccl_private float3 *omega_in,
-                                            ccl_private float3 *domega_in_dx,
-                                            ccl_private float3 *domega_in_dy,
                                             ccl_private float *pdf)
 {
   ccl_private const PrincipledSheenBsdf *bsdf = (ccl_private const PrincipledSheenBsdf *)sc;
@@ -113,15 +109,9 @@ ccl_device int bsdf_principled_sheen_sample(ccl_private const ShaderClosure *sc,
     float3 H = normalize(I + *omega_in);
 
     *eval = calculate_principled_sheen_brdf(N, I, *omega_in, H, pdf);
-
-#ifdef __RAY_DIFFERENTIALS__
-    // TODO: find a better approximation for the diffuse bounce
-    *domega_in_dx = -((2 * dot(N, dIdx)) * N - dIdx);
-    *domega_in_dy = -((2 * dot(N, dIdy)) * N - dIdy);
-#endif
   }
   else {
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
     *pdf = 0.0f;
   }
   return LABEL_REFLECT | LABEL_DIFFUSE;
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index c8db2b7cf13..5e6c6cdcde6 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -18,35 +18,31 @@ ccl_device int bsdf_reflection_setup(ccl_private MicrofacetBsdf *bsdf)
   return SD_BSDF;
 }
 
-ccl_device float3 bsdf_reflection_eval_reflect(ccl_private const ShaderClosure *sc,
-                                               const float3 I,
-                                               const float3 omega_in,
-                                               ccl_private float *pdf)
+ccl_device Spectrum bsdf_reflection_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                 const float3 I,
+                                                 const float3 omega_in,
+                                                 ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_reflection_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                const float3 I,
-                                                const float3 omega_in,
-                                                ccl_private float *pdf)
+ccl_device Spectrum bsdf_reflection_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                  const float3 I,
+                                                  const float3 omega_in,
+                                                  ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_reflection_sample(ccl_private const ShaderClosure *sc,
                                       float3 Ng,
                                       float3 I,
-                                      float3 dIdx,
-                                      float3 dIdy,
                                       float randu,
                                       float randv,
-                                      ccl_private float3 *eval,
+                                      ccl_private Spectrum *eval,
                                       ccl_private float3 *omega_in,
-                                      ccl_private float3 *domega_in_dx,
-                                      ccl_private float3 *domega_in_dy,
                                       ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
@@ -57,18 +53,14 @@ ccl_device int bsdf_reflection_sample(ccl_private const ShaderClosure *sc,
   if (cosNO > 0) {
     *omega_in = (2 * cosNO) * N - I;
     if (dot(Ng, *omega_in) > 0) {
-#ifdef __RAY_DIFFERENTIALS__
-      *domega_in_dx = 2 * dot(N, dIdx) * N - dIdx;
-      *domega_in_dy = 2 * dot(N, dIdy) * N - dIdy;
-#endif
       /* Some high number for MIS. */
       *pdf = 1e6f;
-      *eval = make_float3(1e6f, 1e6f, 1e6f);
+      *eval = make_spectrum(1e6f);
     }
   }
   else {
     *pdf = 0.0f;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
   return LABEL_REFLECT | LABEL_SINGULAR;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index 862e774da87..e680a9617db 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -18,35 +18,31 @@ ccl_device int bsdf_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
   return SD_BSDF;
 }
 
-ccl_device float3 bsdf_refraction_eval_reflect(ccl_private const ShaderClosure *sc,
-                                               const float3 I,
-                                               const float3 omega_in,
-                                               ccl_private float *pdf)
+ccl_device Spectrum bsdf_refraction_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                 const float3 I,
+                                                 const float3 omega_in,
+                                                 ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_refraction_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                const float3 I,
-                                                const float3 omega_in,
-                                                ccl_private float *pdf)
+ccl_device Spectrum bsdf_refraction_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                  const float3 I,
+                                                  const float3 omega_in,
+                                                  ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_refraction_sample(ccl_private const ShaderClosure *sc,
                                       float3 Ng,
                                       float3 I,
-                                      float3 dIdx,
-                                      float3 dIdy,
                                       float randu,
                                       float randv,
-                                      ccl_private float3 *eval,
+                                      ccl_private Spectrum *eval,
                                       ccl_private float3 *omega_in,
-                                      ccl_private float3 *domega_in_dx,
-                                      ccl_private float3 *domega_in_dy,
                                       ccl_private float *pdf)
 {
   ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
@@ -54,39 +50,19 @@ ccl_device int bsdf_refraction_sample(ccl_private const ShaderClosure *sc,
   float3 N = bsdf->N;
 
   float3 R, T;
-#ifdef __RAY_DIFFERENTIALS__
-  float3 dRdx, dRdy, dTdx, dTdy;
-#endif
   bool inside;
   float fresnel;
-  fresnel = fresnel_dielectric(m_eta,
-                               N,
-                               I,
-                               &R,
-                               &T,
-#ifdef __RAY_DIFFERENTIALS__
-                               dIdx,
-                               dIdy,
-                               &dRdx,
-                               &dRdy,
-                               &dTdx,
-                               &dTdy,
-#endif
-                               &inside);
+  fresnel = fresnel_dielectric(m_eta, N, I, &R, &T, &inside);
 
   if (!inside && fresnel != 1.0f) {
     /* Some high number for MIS. */
     *pdf = 1e6f;
-    *eval = make_float3(1e6f, 1e6f, 1e6f);
+    *eval = make_spectrum(1e6f);
     *omega_in = T;
-#ifdef __RAY_DIFFERENTIALS__
-    *domega_in_dx = dTdx;
-    *domega_in_dy = dTdy;
-#endif
   }
   else {
     *pdf = 0.0f;
-    *eval = make_float3(0.0f, 0.0f, 0.0f);
+    *eval = zero_spectrum();
   }
   return LABEL_TRANSMIT | LABEL_SINGULAR;
 }
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index 0400fc61860..c9086823de9 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -30,7 +30,7 @@ ccl_device int bsdf_diffuse_toon_setup(ccl_private ToonBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
+ccl_device float bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
 {
   float is;
 
@@ -41,7 +41,7 @@ ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float a
   else
     is = 0.0f;
 
-  return make_float3(is, is, is);
+  return is;
 }
 
 ccl_device float bsdf_toon_get_sample_angle(float max_angle, float smooth)
@@ -49,48 +49,44 @@ ccl_device float bsdf_toon_get_sample_angle(float max_angle, float smooth)
   return fminf(max_angle + smooth, M_PI_2_F);
 }
 
-ccl_device float3 bsdf_diffuse_toon_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                 const float3 I,
-                                                 const float3 omega_in,
-                                                 ccl_private float *pdf)
+ccl_device Spectrum bsdf_diffuse_toon_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                   const float3 I,
+                                                   const float3 omega_in,
+                                                   ccl_private float *pdf)
 {
   ccl_private const ToonBsdf *bsdf = (ccl_private const ToonBsdf *)sc;
   float max_angle = bsdf->size * M_PI_2_F;
   float smooth = bsdf->smooth * M_PI_2_F;
   float angle = safe_acosf(fmaxf(dot(bsdf->N, omega_in), 0.0f));
 
-  float3 eval = bsdf_toon_get_intensity(max_angle, smooth, angle);
+  float eval = bsdf_toon_get_intensity(max_angle, smooth, angle);
 
-  if (eval.x > 0.0f) {
+  if (eval > 0.0f) {
     float sample_angle = bsdf_toon_get_sample_angle(max_angle, smooth);
 
     *pdf = 0.5f * M_1_PI_F / (1.0f - cosf(sample_angle));
-    return *pdf * eval;
+    return make_spectrum(*pdf * eval);
   }
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_diffuse_toon_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                  const float3 I,
-                                                  const float3 omega_in,
-                                                  ccl_private float *pdf)
+ccl_device Spectrum bsdf_diffuse_toon_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                    const float3 I,
+                                                    const float3 omega_in,
+                                                    ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_diffuse_toon_sample(ccl_private const ShaderClosure *sc,
                                         float3 Ng,
                                         float3 I,
-                                        float3 dIdx,
-                                        float3 dIdy,
                                         float randu,
                                         float randv,
-                                        ccl_private float3 *eval,
+                                        ccl_private Spectrum *eval,
                                         ccl_private float3 *omega_in,
-                                        ccl_private float3 *domega_in_dx,
-                                        ccl_private float3 *domega_in_dy,
                                         ccl_private float *pdf)
 {
   ccl_private const ToonBsdf *bsdf = (ccl_private const ToonBsdf *)sc;
@@ -103,21 +99,15 @@ ccl_device int bsdf_diffuse_toon_sample(ccl_private const ShaderClosure *sc,
     sample_uniform_cone(bsdf->N, sample_angle, randu, randv, omega_in, pdf);
 
     if (dot(Ng, *omega_in) > 0.0f) {
-      *eval = *pdf * bsdf_toon_get_intensity(max_angle, smooth, angle);
-
-#ifdef __RAY_DIFFERENTIALS__
-      // TODO: find a better approximation for the bounce
-      *domega_in_dx = (2.0f * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
-      *domega_in_dy = (2.0f * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
-#endif
+      *eval = make_spectrum(*pdf * bsdf_toon_get_intensity(max_angle, smooth, angle));
     }
     else {
-      *eval = make_float3(0.f, 0.f, 0.f);
+      *eval = zero_spectrum();
       *pdf = 0.0f;
     }
   }
   else {
-    *eval = make_float3(0.f, 0.f, 0.f);
+    *eval = zero_spectrum();
     *pdf = 0.0f;
   }
 
@@ -135,10 +125,10 @@ ccl_device int bsdf_glossy_toon_setup(ccl_private ToonBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-ccl_device float3 bsdf_glossy_toon_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                const float3 I,
-                                                const float3 omega_in,
-                                                ccl_private float *pdf)
+ccl_device Spectrum bsdf_glossy_toon_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                  const float3 I,
+                                                  const float3 omega_in,
+                                                  ccl_private float *pdf)
 {
   ccl_private const ToonBsdf *bsdf = (ccl_private const ToonBsdf *)sc;
   float max_angle = bsdf->size * M_PI_2_F;
@@ -153,36 +143,32 @@ ccl_device float3 bsdf_glossy_toon_eval_reflect(ccl_private const ShaderClosure
 
     float angle = safe_acosf(fmaxf(cosRI, 0.0f));
 
-    float3 eval = bsdf_toon_get_intensity(max_angle, smooth, angle);
+    float eval = bsdf_toon_get_intensity(max_angle, smooth, angle);
     float sample_angle = bsdf_toon_get_sample_angle(max_angle, smooth);
 
     *pdf = 0.5f * M_1_PI_F / (1.0f - cosf(sample_angle));
-    return *pdf * eval;
+    return make_spectrum(*pdf * eval);
   }
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_glossy_toon_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                 const float3 I,
-                                                 const float3 omega_in,
-                                                 ccl_private float *pdf)
+ccl_device Spectrum bsdf_glossy_toon_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                   const float3 I,
+                                                   const float3 omega_in,
+                                                   ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_glossy_toon_sample(ccl_private const ShaderClosure *sc,
                                        float3 Ng,
                                        float3 I,
-                                       float3 dIdx,
-                                       float3 dIdy,
                                        float randu,
                                        float randv,
-                                       ccl_private float3 *eval,
+                                       ccl_private Spectrum *eval,
                                        ccl_private float3 *omega_in,
-                                       ccl_private float3 *domega_in_dx,
-                                       ccl_private float3 *domega_in_dy,
                                        ccl_private float *pdf)
 {
   ccl_private const ToonBsdf *bsdf = (ccl_private const ToonBsdf *)sc;
@@ -204,21 +190,16 @@ ccl_device int bsdf_glossy_toon_sample(ccl_private const ShaderClosure *sc,
 
       /* make sure the direction we chose is still in the right hemisphere */
       if (cosNI > 0) {
-        *eval = *pdf * bsdf_toon_get_intensity(max_angle, smooth, angle);
-
-#ifdef __RAY_DIFFERENTIALS__
-        *domega_in_dx = (2 * dot(bsdf->N, dIdx)) * bsdf->N - dIdx;
-        *domega_in_dy = (2 * dot(bsdf->N, dIdy)) * bsdf->N - dIdy;
-#endif
+        *eval = make_spectrum(*pdf * bsdf_toon_get_intensity(max_angle, smooth, angle));
       }
       else {
         *pdf = 0.0f;
-        *eval = make_float3(0.0f, 0.0f, 0.0f);
+        *eval = zero_spectrum();
       }
     }
     else {
       *pdf = 0.0f;
-      *eval = make_float3(0.0f, 0.0f, 0.0f);
+      *eval = zero_spectrum();
     }
   }
 
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 636d9d664f2..c2aee1e1633 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -11,7 +11,7 @@
 CCL_NAMESPACE_BEGIN
 
 ccl_device void bsdf_transparent_setup(ccl_private ShaderData *sd,
-                                       const float3 weight,
+                                       const Spectrum weight,
                                        uint32_t path_flag)
 {
   /* Check cutoff weight. */
@@ -59,45 +59,37 @@ ccl_device void bsdf_transparent_setup(ccl_private ShaderData *sd,
   }
 }
 
-ccl_device float3 bsdf_transparent_eval_reflect(ccl_private const ShaderClosure *sc,
-                                                const float3 I,
-                                                const float3 omega_in,
-                                                ccl_private float *pdf)
+ccl_device Spectrum bsdf_transparent_eval_reflect(ccl_private const ShaderClosure *sc,
+                                                  const float3 I,
+                                                  const float3 omega_in,
+                                                  ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
-ccl_device float3 bsdf_transparent_eval_transmit(ccl_private const ShaderClosure *sc,
-                                                 const float3 I,
-                                                 const float3 omega_in,
-                                                 ccl_private float *pdf)
+ccl_device Spectrum bsdf_transparent_eval_transmit(ccl_private const ShaderClosure *sc,
+                                                   const float3 I,
+                                                   const float3 omega_in,
+                                                   ccl_private float *pdf)
 {
   *pdf = 0.0f;
-  return make_float3(0.0f, 0.0f, 0.0f);
+  return zero_spectrum();
 }
 
 ccl_device int bsdf_transparent_sample(ccl_private const ShaderClosure *sc,
                                        float3 Ng,
                                        float3 I,
-                                       float3 dIdx,
-                                       float3 dIdy,
                                        float randu,
                                        float randv,
-                                       ccl_private float3 *eval,
+                                       ccl_private Spectrum *eval,
                                        ccl_private float3 *omega_in,
-                                       ccl_private float3 *domega_in_dx,
-                                       ccl_private float3 *domega_in_dy,
                                        ccl_private float *pdf)
 {
   // only one direction is possible
   *omega_in = -I;
-#ifdef __RAY_DIFFERENTIALS__
-  *domega_in_dx = -dIdx;
-  *domega_in_dy = -dIdy;
-#endif
   *pdf = 1;
-  *eval = make_float3(1, 1, 1);
+  *eval = one_spectrum();
   return LABEL_TRANSMIT | LABEL_TRANSPARENT;
 }
 
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index e3b24d487f1..3c48b98fed9 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -15,14 +15,6 @@ ccl_device float fresnel_dielectric(float eta,
                                     const float3 I,
                                     ccl_private float3 *R,
                                     ccl_private float3 *T,
-#ifdef __RAY_DIFFERENTIALS__
-                                    const float3 dIdx,
-                                    const float3 dIdy,
-                                    ccl_private float3 *dRdx,
-                                    ccl_private float3 *dRdy,
-                                    ccl_private float3 *dTdx,
-                                    ccl_private float3 *dTdy,
-#endif
                                     ccl_private bool *is_inside)
 {
   float cos = dot(N, I), neta;
@@ -45,28 +37,16 @@ ccl_device float fresnel_dielectric(float eta,
 
   // compute reflection
   *R = (2 * cos) * Nn - I;
-#ifdef __RAY_DIFFERENTIALS__
-  *dRdx = (2 * dot(Nn, dIdx)) * Nn - dIdx;
-  *dRdy = (2 * dot(Nn, dIdy)) * Nn - dIdy;
-#endif
 
   float arg = 1 - (neta * neta * (1 - (cos * cos)));
   if (arg < 0) {
     *T = make_float3(0.0f, 0.0f, 0.0f);
-#ifdef __RAY_DIFFERENTIALS__
-    *dTdx = make_float3(0.0f, 0.0f, 0.0f);
-    *dTdy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
     return 1;  // total internal reflection
   }
   else {
     float dnp = max(sqrtf(arg), 1e-7f);
     float nK = (neta * cos) - dnp;
     *T = -(neta * I) + (nK * Nn);
-#ifdef __RAY_DIFFERENTIALS__
-    *dTdx = -(neta * dIdx) + ((neta - neta * neta * cos / dnp) * dot(dIdx, Nn)) * Nn;
-    *dTdy = -(neta * dIdy) + ((neta - neta * neta * cos / dnp) * dot(dIdy, Nn)) * Nn;
-#endif
     // compute Fresnel terms
     float cosTheta1 = cos;  // N.R
     float cosTheta2 = -dot(Nn, *T);
@@ -110,8 +90,8 @@ ccl_device float schlick_fresnel(float u)
 }
 
 /* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
-ccl_device_forceinline float3
-interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0)
+ccl_device_forceinline Spectrum
+interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, Spectrum cspec0)
 {
   /* Calculate the fresnel interpolation factor
    * The value from fresnel_dielectric_cos(...) has to be normalized because
@@ -121,7 +101,7 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0
   float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
 
   /* Blend between white and a specular color with respect to the fresnel */
-  return cspec0 * (1.0f - FH) + make_float3(1.0f, 1.0f, 1.0f) * FH;
+  return cspec0 * (1.0f - FH) + make_spectrum(FH);
 }
 
 ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index b87790f5f8a..7131d9d8f38 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -8,8 +8,8 @@ CCL_NAMESPACE_BEGIN
 typedef struct Bssrdf {
   SHADER_CLOSURE_BASE;
 
-  float3 radius;
-  float3 albedo;
+  Spectrum radius;
+  Spectrum albedo;
   float roughness;
   float anisotropy;
 } Bssrdf;
@@ -69,12 +69,13 @@ ccl_device void bssrdf_setup_radius(ccl_private Bssrdf *bssrdf,
     const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) /
                              (1.0f - F_dr); /* From Jensen's `Fdr` ratio formula. */
 
-    const float3 alpha_prime = make_float3(
-        bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.x, fourthirdA),
-        bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.y, fourthirdA),
-        bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.z, fourthirdA));
+    Spectrum alpha_prime;
+    FOREACH_SPECTRUM_CHANNEL (i) {
+      GET_SPECTRUM_CHANNEL(alpha_prime, i) = bssrdf_dipole_compute_alpha_prime(
+          GET_SPECTRUM_CHANNEL(bssrdf->albedo, i), fourthirdA);
+    }
 
-    bssrdf->radius *= sqrt(3.0f * (one_float3() - alpha_prime));
+    bssrdf->radius *= sqrt(3.0f * (one_spectrum() - alpha_prime));
   }
 }
 
@@ -98,7 +99,7 @@ ccl_device_inline float bssrdf_burley_fitting(float A)
 
 /* Scale mean free path length so it gives similar looking result
  * to Cubic and Gaussian models. */
-ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r)
+ccl_device_inline Spectrum bssrdf_burley_compatible_mfp(Spectrum r)
 {
   return 0.25f * M_1_PI_F * r;
 }
@@ -106,11 +107,13 @@ ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r)
 ccl_device void bssrdf_burley_setup(ccl_private Bssrdf *bssrdf)
 {
   /* Mean free path length. */
-  const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius);
+  const Spectrum l = bssrdf_burley_compatible_mfp(bssrdf->radius);
   /* Surface albedo. */
-  const float3 A = bssrdf->albedo;
-  const float3 s = make_float3(
-      bssrdf_burley_fitting(A.x), bssrdf_burley_fitting(A.y), bssrdf_burley_fitting(A.z));
+  const Spectrum A = bssrdf->albedo;
+  Spectrum s;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    GET_SPECTRUM_CHANNEL(s, i) = bssrdf_burley_fitting(GET_SPECTRUM_CHANNEL(A, i));
+  }
 
   bssrdf->radius = l / s;
 }
@@ -198,22 +201,18 @@ ccl_device void bssrdf_burley_sample(const float d,
   *h = safe_sqrtf(Rm * Rm - r_ * r_);
 }
 
-ccl_device float bssrdf_num_channels(const float3 radius)
+ccl_device float bssrdf_num_channels(const Spectrum radius)
 {
   float channels = 0;
-  if (radius.x > 0.0f) {
-    channels += 1.0f;
-  }
-  if (radius.y > 0.0f) {
-    channels += 1.0f;
-  }
-  if (radius.z > 0.0f) {
-    channels += 1.0f;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    if (GET_SPECTRUM_CHANNEL(radius, i) > 0.0f) {
+      channels += 1.0f;
+    }
   }
   return channels;
 }
 
-ccl_device void bssrdf_sample(const float3 radius,
+ccl_device void bssrdf_sample(const Spectrum radius,
                               float xi,
                               ccl_private float *r,
                               ccl_private float *h)
@@ -224,39 +223,44 @@ ccl_device void bssrdf_sample(const float3 radius,
   /* Sample color channel and reuse random number. Only a subset of channels
    * may be used if their radius was too small to handle as BSSRDF. */
   xi *= num_channels;
-
-  if (xi < 1.0f) {
-    sampled_radius = (radius.x > 0.0f) ? radius.x : (radius.y > 0.0f) ? radius.y : radius.z;
-  }
-  else if (xi < 2.0f) {
-    xi -= 1.0f;
-    sampled_radius = (radius.x > 0.0f && radius.y > 0.0f) ? radius.y : radius.z;
-  }
-  else {
-    xi -= 2.0f;
-    sampled_radius = radius.z;
+  sampled_radius = 0.0f;
+
+  float sum = 0.0f;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    const float channel_radius = GET_SPECTRUM_CHANNEL(radius, i);
+    if (channel_radius > 0.0f) {
+      const float next_sum = sum + 1.0f;
+      if (xi < next_sum) {
+        xi -= sum;
+        sampled_radius = channel_radius;
+        break;
+      }
+      sum = next_sum;
+    }
   }
 
   /* Sample BSSRDF. */
   bssrdf_burley_sample(sampled_radius, xi, r, h);
 }
 
-ccl_device_forceinline float3 bssrdf_eval(const float3 radius, float r)
+ccl_device_forceinline Spectrum bssrdf_eval(const Spectrum radius, float r)
 {
-  return make_float3(bssrdf_burley_pdf(radius.x, r),
-                     bssrdf_burley_pdf(radius.y, r),
-                     bssrdf_burley_pdf(radius.z, r));
+  Spectrum result;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    GET_SPECTRUM_CHANNEL(result, i) = bssrdf_burley_pdf(GET_SPECTRUM_CHANNEL(radius, i), r);
+  }
+  return result;
 }
 
-ccl_device_forceinline float bssrdf_pdf(const float3 radius, float r)
+ccl_device_forceinline float bssrdf_pdf(const Spectrum radius, float r)
 {
-  float3 pdf = bssrdf_eval(radius, r);
-  return (pdf.x + pdf.y + pdf.z) / bssrdf_num_channels(radius);
+  Spectrum pdf = bssrdf_eval(radius, r);
+  return reduce_add(pdf) / bssrdf_num_channels(radius);
 }
 
 /* Setup */
 
-ccl_device_inline ccl_private Bssrdf *bssrdf_alloc(ccl_private ShaderData *sd, float3 weight)
+ccl_device_inline ccl_private Bssrdf *bssrdf_alloc(ccl_private ShaderData *sd, Spectrum weight)
 {
   ccl_private Bssrdf *bssrdf = (ccl_private Bssrdf *)closure_alloc(
       sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
@@ -294,31 +298,20 @@ ccl_device int bssrdf_setup(ccl_private ShaderData *sd,
   }
 
   /* Verify if the radii are large enough to sample without precision issues. */
-  int bssrdf_channels = 3;
-  float3 diffuse_weight = make_float3(0.0f, 0.0f, 0.0f);
-
-  if (bssrdf->radius.x < BSSRDF_MIN_RADIUS) {
-    diffuse_weight.x = bssrdf->weight.x;
-    bssrdf->weight.x = 0.0f;
-    bssrdf->radius.x = 0.0f;
-    bssrdf_channels--;
-  }
-  if (bssrdf->radius.y < BSSRDF_MIN_RADIUS) {
-    diffuse_weight.y = bssrdf->weight.y;
-    bssrdf->weight.y = 0.0f;
-    bssrdf->radius.y = 0.0f;
-    bssrdf_channels--;
-  }
-  if (bssrdf->radius.z < BSSRDF_MIN_RADIUS) {
-    diffuse_weight.z = bssrdf->weight.z;
-    bssrdf->weight.z = 0.0f;
-    bssrdf->radius.z = 0.0f;
-    bssrdf_channels--;
+  int bssrdf_channels = SPECTRUM_CHANNELS;
+  Spectrum diffuse_weight = zero_spectrum();
+
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    if (GET_SPECTRUM_CHANNEL(bssrdf->radius, i) < BSSRDF_MIN_RADIUS) {
+      GET_SPECTRUM_CHANNEL(diffuse_weight, i) = GET_SPECTRUM_CHANNEL(bssrdf->weight, i);
+      GET_SPECTRUM_CHANNEL(bssrdf->weight, i) = 0.0f;
+      GET_SPECTRUM_CHANNEL(bssrdf->radius, i) = 0.0f;
+      bssrdf_channels--;
+    }
   }
 
-  if (bssrdf_channels < 3) {
+  if (bssrdf_channels < SPECTRUM_CHANNELS) {
     /* Add diffuse BSDF if any radius too small. */
-#ifdef __PRINCIPLED__
     if (bssrdf->roughness != FLT_MAX) {
       ccl_private PrincipledDiffuseBsdf *bsdf = (ccl_private PrincipledDiffuseBsdf *)bsdf_alloc(
           sd, sizeof(PrincipledDiffuseBsdf), diffuse_weight);
@@ -329,9 +322,7 @@ ccl_device int bssrdf_setup(ccl_private ShaderData *sd,
         flag |= bsdf_principled_diffuse_setup(bsdf, PRINCIPLED_DIFFUSE_LAMBERT);
       }
     }
-    else
-#endif /* __PRINCIPLED__ */
-    {
+    else {
       ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
           sd, sizeof(DiffuseBsdf), diffuse_weight);
 
diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h
index 03e19cbde21..d896721f77b 100644
--- a/intern/cycles/kernel/closure/emissive.h
+++ b/intern/cycles/kernel/closure/emissive.h
@@ -12,7 +12,7 @@ CCL_NAMESPACE_BEGIN
 
 /* BACKGROUND CLOSURE */
 
-ccl_device void background_setup(ccl_private ShaderData *sd, const float3 weight)
+ccl_device void background_setup(ccl_private ShaderData *sd, const Spectrum weight)
 {
   if (sd->flag & SD_EMISSION) {
     sd->closure_emission_background += weight;
@@ -25,7 +25,7 @@ ccl_device void background_setup(ccl_private ShaderData *sd, const float3 weight
 
 /* EMISSION CLOSURE */
 
-ccl_device void emission_setup(ccl_private ShaderData *sd, const float3 weight)
+ccl_device void emission_setup(ccl_private ShaderData *sd, const Spectrum weight)
 {
   if (sd->flag & SD_EMISSION) {
     sd->closure_emission_background += weight;
@@ -54,11 +54,11 @@ ccl_device void emissive_sample(const float3 Ng,
   /* todo: not implemented and used yet */
 }
 
-ccl_device float3 emissive_simple_eval(const float3 Ng, const float3 I)
+ccl_device Spectrum emissive_simple_eval(const float3 Ng, const float3 I)
 {
   float res = emissive_pdf(Ng, I);
 
-  return make_float3(res, res, res);
+  return make_spectrum(res);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 6e24b60af39..9dbb5154457 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -7,7 +7,7 @@ CCL_NAMESPACE_BEGIN
 
 /* VOLUME EXTINCTION */
 
-ccl_device void volume_extinction_setup(ccl_private ShaderData *sd, float3 weight)
+ccl_device void volume_extinction_setup(ccl_private ShaderData *sd, Spectrum weight)
 {
   if (sd->flag & SD_EXTINCTION) {
     sd->closure_transparent_extinction += weight;
@@ -48,10 +48,10 @@ ccl_device int volume_henyey_greenstein_setup(ccl_private HenyeyGreensteinVolume
   return SD_SCATTER;
 }
 
-ccl_device float3 volume_henyey_greenstein_eval_phase(ccl_private const ShaderVolumeClosure *svc,
-                                                      const float3 I,
-                                                      float3 omega_in,
-                                                      ccl_private float *pdf)
+ccl_device Spectrum volume_henyey_greenstein_eval_phase(ccl_private const ShaderVolumeClosure *svc,
+                                                        const float3 I,
+                                                        float3 omega_in,
+                                                        ccl_private float *pdf)
 {
   float g = svc->g;
 
@@ -64,7 +64,7 @@ ccl_device float3 volume_henyey_greenstein_eval_phase(ccl_private const ShaderVo
     *pdf = single_peaked_henyey_greenstein(cos_theta, g);
   }
 
-  return make_float3(*pdf, *pdf, *pdf);
+  return make_spectrum(*pdf);
 }
 
 ccl_device float3
@@ -101,37 +101,27 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, ccl_priva
 
 ccl_device int volume_henyey_greenstein_sample(ccl_private const ShaderVolumeClosure *svc,
                                                float3 I,
-                                               float3 dIdx,
-                                               float3 dIdy,
                                                float randu,
                                                float randv,
-                                               ccl_private float3 *eval,
+                                               ccl_private Spectrum *eval,
                                                ccl_private float3 *omega_in,
-                                               ccl_private float3 *domega_in_dx,
-                                               ccl_private float3 *domega_in_dy,
                                                ccl_private float *pdf)
 {
   float g = svc->g;
 
   /* note that I points towards the viewer and so is used negated */
   *omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf);
-  *eval = make_float3(*pdf, *pdf, *pdf); /* perfect importance sampling */
-
-#ifdef __RAY_DIFFERENTIALS__
-  /* todo: implement ray differential estimation */
-  *domega_in_dx = make_float3(0.0f, 0.0f, 0.0f);
-  *domega_in_dy = make_float3(0.0f, 0.0f, 0.0f);
-#endif
+  *eval = make_spectrum(*pdf); /* perfect importance sampling */
 
   return LABEL_VOLUME_SCATTER;
 }
 
 /* VOLUME CLOSURE */
 
-ccl_device float3 volume_phase_eval(ccl_private const ShaderData *sd,
-                                    ccl_private const ShaderVolumeClosure *svc,
-                                    float3 omega_in,
-                                    ccl_private float *pdf)
+ccl_device Spectrum volume_phase_eval(ccl_private const ShaderData *sd,
+                                      ccl_private const ShaderVolumeClosure *svc,
+                                      float3 omega_in,
+                                      ccl_private float *pdf)
 {
   return volume_henyey_greenstein_eval_phase(svc, sd->I, omega_in, pdf);
 }
@@ -140,22 +130,11 @@ ccl_device int volume_phase_sample(ccl_private const ShaderData *sd,
                                    ccl_private const ShaderVolumeClosure *svc,
                                    float randu,
                                    float randv,
-                                   ccl_private float3 *eval,
+                                   ccl_private Spectrum *eval,
                                    ccl_private float3 *omega_in,
-                                   ccl_private differential3 *domega_in,
                                    ccl_private float *pdf)
 {
-  return volume_henyey_greenstein_sample(svc,
-                                         sd->I,
-                                         sd->dI.dx,
-                                         sd->dI.dy,
-                                         randu,
-                                         randv,
-                                         eval,
-                                         omega_in,
-                                         &domega_in->dx,
-                                         &domega_in->dy,
-                                         pdf);
+  return volume_henyey_greenstein_sample(svc, sd->I, randu, randv, eval, omega_in, pdf);
 }
 
 /* Volume sampling utilities. */
@@ -164,45 +143,44 @@ ccl_device int volume_phase_sample(ccl_private const ShaderData *sd,
  * unnecessary work in volumes and subsurface scattering. */
 #define VOLUME_THROUGHPUT_EPSILON 1e-6f
 
-ccl_device float3 volume_color_transmittance(float3 sigma, float t)
+ccl_device Spectrum volume_color_transmittance(Spectrum sigma, float t)
 {
-  return exp3(-sigma * t);
+  return exp(-sigma * t);
 }
 
-ccl_device float volume_channel_get(float3 value, int channel)
+ccl_device float volume_channel_get(Spectrum value, int channel)
 {
-  return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
+  return GET_SPECTRUM_CHANNEL(value, channel);
 }
 
-ccl_device int volume_sample_channel(float3 albedo,
-                                     float3 throughput,
+ccl_device int volume_sample_channel(Spectrum albedo,
+                                     Spectrum throughput,
                                      float rand,
-                                     ccl_private float3 *pdf)
+                                     ccl_private Spectrum *pdf)
 {
   /* Sample color channel proportional to throughput and single scattering
    * albedo, to significantly reduce noise with many bounce, following:
    *
    * "Practical and Controllable Subsurface Scattering for Production Path
    *  Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
-  float3 weights = fabs(throughput * albedo);
-  float sum_weights = weights.x + weights.y + weights.z;
+  Spectrum weights = fabs(throughput * albedo);
+  float sum_weights = reduce_add(weights);
 
   if (sum_weights > 0.0f) {
     *pdf = weights / sum_weights;
   }
   else {
-    *pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
+    *pdf = make_spectrum(1.0f / SPECTRUM_CHANNELS);
   }
 
-  if (rand < pdf->x) {
-    return 0;
-  }
-  else if (rand < pdf->x + pdf->y) {
-    return 1;
-  }
-  else {
-    return 2;
+  float pdf_sum = 0.0f;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    pdf_sum += GET_SPECTRUM_CHANNEL(*pdf, i);
+    if (rand < pdf_sum) {
+      return i;
+    }
   }
+  return SPECTRUM_CHANNELS - 1;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/data_arrays.h b/intern/cycles/kernel/data_arrays.h
new file mode 100644
index 00000000000..f2877e6c37f
--- /dev/null
+++ b/intern/cycles/kernel/data_arrays.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef KERNEL_DATA_ARRAY
+#  define KERNEL_DATA_ARRAY(type, name)
+#endif
+
+/* BVH2, not used for OptiX or Embree. */
+KERNEL_DATA_ARRAY(float4, bvh_nodes)
+KERNEL_DATA_ARRAY(float4, bvh_leaf_nodes)
+KERNEL_DATA_ARRAY(uint, prim_type)
+KERNEL_DATA_ARRAY(uint, prim_visibility)
+KERNEL_DATA_ARRAY(uint, prim_index)
+KERNEL_DATA_ARRAY(uint, prim_object)
+KERNEL_DATA_ARRAY(uint, object_node)
+KERNEL_DATA_ARRAY(float2, prim_time)
+
+/* objects */
+KERNEL_DATA_ARRAY(KernelObject, objects)
+KERNEL_DATA_ARRAY(Transform, object_motion_pass)
+KERNEL_DATA_ARRAY(DecomposedTransform, object_motion)
+KERNEL_DATA_ARRAY(uint, object_flag)
+KERNEL_DATA_ARRAY(float, object_volume_step)
+KERNEL_DATA_ARRAY(uint, object_prim_offset)
+
+/* cameras */
+KERNEL_DATA_ARRAY(DecomposedTransform, camera_motion)
+
+/* triangles */
+KERNEL_DATA_ARRAY(uint, tri_shader)
+KERNEL_DATA_ARRAY(packed_float3, tri_vnormal)
+KERNEL_DATA_ARRAY(uint4, tri_vindex)
+KERNEL_DATA_ARRAY(uint, tri_patch)
+KERNEL_DATA_ARRAY(float2, tri_patch_uv)
+KERNEL_DATA_ARRAY(packed_float3, tri_verts)
+
+/* curves */
+KERNEL_DATA_ARRAY(KernelCurve, curves)
+KERNEL_DATA_ARRAY(float4, curve_keys)
+KERNEL_DATA_ARRAY(KernelCurveSegment, curve_segments)
+
+/* patches */
+KERNEL_DATA_ARRAY(uint, patches)
+
+/* pointclouds */
+KERNEL_DATA_ARRAY(float4, points)
+KERNEL_DATA_ARRAY(uint, points_shader)
+
+/* attributes */
+KERNEL_DATA_ARRAY(AttributeMap, attributes_map)
+KERNEL_DATA_ARRAY(float, attributes_float)
+KERNEL_DATA_ARRAY(float2, attributes_float2)
+KERNEL_DATA_ARRAY(packed_float3, attributes_float3)
+KERNEL_DATA_ARRAY(float4, attributes_float4)
+KERNEL_DATA_ARRAY(uchar4, attributes_uchar4)
+
+/* lights */
+KERNEL_DATA_ARRAY(KernelLightDistribution, light_distribution)
+KERNEL_DATA_ARRAY(KernelLight, lights)
+KERNEL_DATA_ARRAY(float2, light_background_marginal_cdf)
+KERNEL_DATA_ARRAY(float2, light_background_conditional_cdf)
+
+/* particles */
+KERNEL_DATA_ARRAY(KernelParticle, particles)
+
+/* shaders */
+KERNEL_DATA_ARRAY(uint4, svm_nodes)
+KERNEL_DATA_ARRAY(KernelShader, shaders)
+
+/* lookup tables */
+KERNEL_DATA_ARRAY(float, lookup_table)
+
+/* PMJ sample pattern */
+KERNEL_DATA_ARRAY(float, sample_pattern_lut)
+
+/* image textures */
+KERNEL_DATA_ARRAY(TextureInfo, texture_info)
+
+/* ies lights */
+KERNEL_DATA_ARRAY(float, ies)
+
+#undef KERNEL_DATA_ARRAY
diff --git a/intern/cycles/kernel/data_template.h b/intern/cycles/kernel/data_template.h
new file mode 100644
index 00000000000..807d0650fc3
--- /dev/null
+++ b/intern/cycles/kernel/data_template.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef KERNEL_STRUCT_BEGIN
+#  define KERNEL_STRUCT_BEGIN(name, parent)
+#endif
+#ifndef KERNEL_STRUCT_END
+#  define KERNEL_STRUCT_END(name)
+#endif
+#ifndef KERNEL_STRUCT_MEMBER
+#  define KERNEL_STRUCT_MEMBER(parent, type, name)
+#endif
+
+/* Background. */
+
+KERNEL_STRUCT_BEGIN(KernelBackground, background)
+/* xyz store direction, w the angle. float4 instead of float3 is used
+ * to ensure consistent padding/alignment across devices. */
+KERNEL_STRUCT_MEMBER(background, float4, sun)
+/* Only shader index. */
+KERNEL_STRUCT_MEMBER(background, int, surface_shader)
+KERNEL_STRUCT_MEMBER(background, int, volume_shader)
+KERNEL_STRUCT_MEMBER(background, float, volume_step_size)
+KERNEL_STRUCT_MEMBER(background, int, transparent)
+KERNEL_STRUCT_MEMBER(background, float, transparent_roughness_squared_threshold)
+/* Portal sampling. */
+KERNEL_STRUCT_MEMBER(background, float, portal_weight)
+KERNEL_STRUCT_MEMBER(background, int, num_portals)
+KERNEL_STRUCT_MEMBER(background, int, portal_offset)
+/* Sun sampling. */
+KERNEL_STRUCT_MEMBER(background, float, sun_weight)
+/* Importance map sampling. */
+KERNEL_STRUCT_MEMBER(background, float, map_weight)
+KERNEL_STRUCT_MEMBER(background, int, map_res_x)
+KERNEL_STRUCT_MEMBER(background, int, map_res_y)
+/* Multiple importance sampling. */
+KERNEL_STRUCT_MEMBER(background, int, use_mis)
+/* Lightgroup. */
+KERNEL_STRUCT_MEMBER(background, int, lightgroup)
+/* Padding. */
+KERNEL_STRUCT_MEMBER(background, int, pad1)
+KERNEL_STRUCT_MEMBER(background, int, pad2)
+KERNEL_STRUCT_MEMBER(background, int, pad3)
+KERNEL_STRUCT_END(KernelBackground)
+
+/* BVH: own BVH2 if no native device acceleration struct used. */
+
+KERNEL_STRUCT_BEGIN(KernelBVH, bvh)
+KERNEL_STRUCT_MEMBER(bvh, int, root)
+KERNEL_STRUCT_MEMBER(bvh, int, have_motion)
+KERNEL_STRUCT_MEMBER(bvh, int, have_curves)
+KERNEL_STRUCT_MEMBER(bvh, int, bvh_layout)
+KERNEL_STRUCT_MEMBER(bvh, int, use_bvh_steps)
+KERNEL_STRUCT_MEMBER(bvh, int, curve_subdivisions)
+KERNEL_STRUCT_MEMBER(bvh, int, pad1)
+KERNEL_STRUCT_MEMBER(bvh, int, pad2)
+KERNEL_STRUCT_END(KernelBVH)
+
+/* Film. */
+
+KERNEL_STRUCT_BEGIN(KernelFilm, film)
+/* XYZ to rendering color space transform. float4 instead of float3 to
+ * ensure consistent padding/alignment across devices. */
+KERNEL_STRUCT_MEMBER(film, float4, xyz_to_r)
+KERNEL_STRUCT_MEMBER(film, float4, xyz_to_g)
+KERNEL_STRUCT_MEMBER(film, float4, xyz_to_b)
+KERNEL_STRUCT_MEMBER(film, float4, rgb_to_y)
+/* Rec709 to rendering color space. */
+KERNEL_STRUCT_MEMBER(film, float4, rec709_to_r)
+KERNEL_STRUCT_MEMBER(film, float4, rec709_to_g)
+KERNEL_STRUCT_MEMBER(film, float4, rec709_to_b)
+KERNEL_STRUCT_MEMBER(film, int, is_rec709)
+/* Exposure. */
+KERNEL_STRUCT_MEMBER(film, float, exposure)
+/* Passes used. */
+KERNEL_STRUCT_MEMBER(film, int, pass_flag)
+KERNEL_STRUCT_MEMBER(film, int, light_pass_flag)
+/* Pass offsets. */
+KERNEL_STRUCT_MEMBER(film, int, pass_stride)
+KERNEL_STRUCT_MEMBER(film, int, pass_combined)
+KERNEL_STRUCT_MEMBER(film, int, pass_depth)
+KERNEL_STRUCT_MEMBER(film, int, pass_position)
+KERNEL_STRUCT_MEMBER(film, int, pass_normal)
+KERNEL_STRUCT_MEMBER(film, int, pass_roughness)
+KERNEL_STRUCT_MEMBER(film, int, pass_motion)
+KERNEL_STRUCT_MEMBER(film, int, pass_motion_weight)
+KERNEL_STRUCT_MEMBER(film, int, pass_uv)
+KERNEL_STRUCT_MEMBER(film, int, pass_object_id)
+KERNEL_STRUCT_MEMBER(film, int, pass_material_id)
+KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_glossy_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_transmission_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_glossy_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_transmission_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_glossy_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_transmission_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_emission)
+KERNEL_STRUCT_MEMBER(film, int, pass_background)
+KERNEL_STRUCT_MEMBER(film, int, pass_ao)
+KERNEL_STRUCT_MEMBER(film, float, pass_alpha_threshold)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow)
+KERNEL_STRUCT_MEMBER(film, float, pass_shadow_scale)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_sample_count)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_matte)
+/* Cryptomatte. */
+KERNEL_STRUCT_MEMBER(film, int, cryptomatte_passes)
+KERNEL_STRUCT_MEMBER(film, int, cryptomatte_depth)
+KERNEL_STRUCT_MEMBER(film, int, pass_cryptomatte)
+/* Adaptive sampling. */
+KERNEL_STRUCT_MEMBER(film, int, pass_adaptive_aux_buffer)
+KERNEL_STRUCT_MEMBER(film, int, pass_sample_count)
+/* Mist. */
+KERNEL_STRUCT_MEMBER(film, int, pass_mist)
+KERNEL_STRUCT_MEMBER(film, float, mist_start)
+KERNEL_STRUCT_MEMBER(film, float, mist_inv_depth)
+KERNEL_STRUCT_MEMBER(film, float, mist_falloff)
+/* Denoising. */
+KERNEL_STRUCT_MEMBER(film, int, pass_denoising_normal)
+KERNEL_STRUCT_MEMBER(film, int, pass_denoising_albedo)
+KERNEL_STRUCT_MEMBER(film, int, pass_denoising_depth)
+/* AOVs. */
+KERNEL_STRUCT_MEMBER(film, int, pass_aov_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_aov_value)
+/* Light groups. */
+KERNEL_STRUCT_MEMBER(film, int, pass_lightgroup)
+/* Baking. */
+KERNEL_STRUCT_MEMBER(film, int, pass_bake_primitive)
+KERNEL_STRUCT_MEMBER(film, int, pass_bake_differential)
+/* Shadow catcher. */
+KERNEL_STRUCT_MEMBER(film, int, use_approximate_shadow_catcher)
+/* Padding. */
+KERNEL_STRUCT_MEMBER(film, int, pad1)
+KERNEL_STRUCT_MEMBER(film, int, pad2)
+KERNEL_STRUCT_END(KernelFilm)
+
+/* Integrator. */
+
+KERNEL_STRUCT_BEGIN(KernelIntegrator, integrator)
+/* Emission. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_direct_light)
+KERNEL_STRUCT_MEMBER(integrator, int, num_distribution)
+KERNEL_STRUCT_MEMBER(integrator, int, num_all_lights)
+KERNEL_STRUCT_MEMBER(integrator, float, pdf_triangles)
+KERNEL_STRUCT_MEMBER(integrator, float, pdf_lights)
+KERNEL_STRUCT_MEMBER(integrator, float, light_inv_rr_threshold)
+/* Bounces. */
+KERNEL_STRUCT_MEMBER(integrator, int, min_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_diffuse_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_glossy_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_transmission_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_volume_bounce)
+/* AO bounces. */
+KERNEL_STRUCT_MEMBER(integrator, int, ao_bounces)
+KERNEL_STRUCT_MEMBER(integrator, float, ao_bounces_distance)
+KERNEL_STRUCT_MEMBER(integrator, float, ao_bounces_factor)
+KERNEL_STRUCT_MEMBER(integrator, float, ao_additive_factor)
+/* Transparency. */
+KERNEL_STRUCT_MEMBER(integrator, int, transparent_min_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, transparent_max_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, transparent_shadows)
+/* Caustics. */
+KERNEL_STRUCT_MEMBER(integrator, int, caustics_reflective)
+KERNEL_STRUCT_MEMBER(integrator, int, caustics_refractive)
+KERNEL_STRUCT_MEMBER(integrator, float, filter_glossy)
+/* Seed. */
+KERNEL_STRUCT_MEMBER(integrator, int, seed)
+/* Clamp. */
+KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_direct)
+KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_indirect)
+/* MIS. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_lamp_mis)
+/* Caustics. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_caustics)
+/* Sampling pattern. */
+KERNEL_STRUCT_MEMBER(integrator, int, sampling_pattern)
+KERNEL_STRUCT_MEMBER(integrator, float, scrambling_distance)
+/* Volume render. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_volumes)
+KERNEL_STRUCT_MEMBER(integrator, int, volume_max_steps)
+KERNEL_STRUCT_MEMBER(integrator, float, volume_step_rate)
+/* Shadow catcher. */
+KERNEL_STRUCT_MEMBER(integrator, int, has_shadow_catcher)
+/* Closure filter. */
+KERNEL_STRUCT_MEMBER(integrator, int, filter_closures)
+/* MIS debugging. */
+KERNEL_STRUCT_MEMBER(integrator, int, direct_light_sampling_type)
+/* Padding */
+KERNEL_STRUCT_MEMBER(integrator, int, pad1)
+KERNEL_STRUCT_END(KernelIntegrator)
+
+/* SVM. For shader specialization. */
+
+KERNEL_STRUCT_BEGIN(KernelSVMUsage, svm_usage)
+#define SHADER_NODE_TYPE(type) KERNEL_STRUCT_MEMBER(svm_usage, int, type)
+#include "kernel/svm/node_types_template.h"
+KERNEL_STRUCT_END(KernelSVMUsage)
+
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_END
diff --git a/intern/cycles/kernel/device/cpu/bvh.h b/intern/cycles/kernel/device/cpu/bvh.h
new file mode 100644
index 00000000000..d9267e1cd6d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -0,0 +1,582 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* CPU Embree implementation of ray-scene intersection. */
+
+#pragma once
+
+#include <embree3/rtcore_ray.h>
+#include <embree3/rtcore_scene.h>
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+#include "kernel/geom/object.h"
+#include "kernel/integrator/state.h"
+#include "kernel/sample/lcg.h"
+
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define EMBREE_IS_HAIR(x) ((x) & 1)
+
+/* Intersection context. */
+
+struct CCLIntersectContext {
+  typedef enum {
+    RAY_REGULAR = 0,
+    RAY_SHADOW_ALL = 1,
+    RAY_LOCAL = 2,
+    RAY_SSS = 3,
+    RAY_VOLUME_ALL = 4,
+  } RayType;
+
+  KernelGlobals kg;
+  RayType type;
+
+  /* For avoiding self intersections. */
+  const Ray *ray;
+
+  /* For shadow rays. */
+  Intersection *isect_s;
+  uint max_hits;
+  uint num_hits;
+  uint num_recorded_hits;
+  float throughput;
+  float max_t;
+  bool opaque_hit;
+
+  /* For SSS rays. */
+  LocalIntersection *local_isect;
+  int local_object_id;
+  uint *lcg_state;
+
+  CCLIntersectContext(KernelGlobals kg_, RayType type_)
+  {
+    kg = kg_;
+    type = type_;
+    ray = NULL;
+    max_hits = 1;
+    num_hits = 0;
+    num_recorded_hits = 0;
+    throughput = 1.0f;
+    max_t = FLT_MAX;
+    opaque_hit = false;
+    isect_s = NULL;
+    local_isect = NULL;
+    local_object_id = -1;
+    lcg_state = NULL;
+  }
+};
+
+class IntersectContext {
+ public:
+  IntersectContext(CCLIntersectContext *ctx)
+  {
+    rtcInitIntersectContext(&context);
+    userRayExt = ctx;
+  }
+  RTCIntersectContext context;
+  CCLIntersectContext *userRayExt;
+};
+
+/* Utilities. */
+
+ccl_device_inline void kernel_embree_setup_ray(const Ray &ray,
+                                               RTCRay &rtc_ray,
+                                               const uint visibility)
+{
+  rtc_ray.org_x = ray.P.x;
+  rtc_ray.org_y = ray.P.y;
+  rtc_ray.org_z = ray.P.z;
+  rtc_ray.dir_x = ray.D.x;
+  rtc_ray.dir_y = ray.D.y;
+  rtc_ray.dir_z = ray.D.z;
+  rtc_ray.tnear = ray.tmin;
+  rtc_ray.tfar = ray.tmax;
+  rtc_ray.time = ray.time;
+  rtc_ray.mask = visibility;
+}
+
+ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
+                                                  RTCRayHit &rayhit,
+                                                  const uint visibility)
+{
+  kernel_embree_setup_ray(ray, rayhit.ray, visibility);
+  rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+  rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+}
+
+ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg,
+                                                          const RTCHit *hit,
+                                                          const Ray *ray)
+{
+  int object, prim;
+
+  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
+    object = hit->instID[0] / 2;
+    if ((ray->self.object == object) || (ray->self.light_object == object)) {
+      RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+          rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
+      prim = hit->primID +
+             (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
+    }
+    else {
+      return false;
+    }
+  }
+  else {
+    object = hit->geomID / 2;
+    if ((ray->self.object == object) || (ray->self.light_object == object)) {
+      prim = hit->primID +
+             (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
+    }
+    else {
+      return false;
+    }
+  }
+
+  const bool is_hair = hit->geomID & 1;
+  if (is_hair) {
+    prim = kernel_data_fetch(curve_segments, prim).prim;
+  }
+
+  return intersection_skip_self_shadow(ray->self, object, prim);
+}
+
+/* Fill a Cycles Intersection from an Embree ray/hit pair: distance, u/v coordinates,
+ * object index, and primitive index/type (looked up in curve_segments for hair). */
+ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg,
+                                                 const RTCRay *ray,
+                                                 const RTCHit *hit,
+                                                 Intersection *isect)
+{
+  isect->t = ray->tfar;
+  isect->u = hit->u;
+  isect->v = hit->v;
+  if (hit->instID[0] == RTC_INVALID_GEOMETRY_ID) {
+    isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(
+                                    rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
+    isect->object = hit->geomID / 2;
+  }
+  else {
+    RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+        rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
+    isect->prim = hit->primID +
+                  (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
+    isect->object = hit->instID[0] / 2;
+  }
+
+  const bool is_hair = hit->geomID & 1;
+  if (is_hair) {
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, isect->prim);
+    isect->type = segment.type;
+    isect->prim = segment.prim;
+  }
+  else {
+    isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
+  }
+}
+
+ccl_device_inline void kernel_embree_convert_sss_hit(
+    KernelGlobals kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int object)
+{
+  RTCGeometry inst_geom = rtcGetGeometry(kernel_data.device_bvh, object * 2);
+  RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(inst_geom);
+  RTCGeometry hit_geom = rtcGetGeometry(inst_scene, hit->geomID);
+  isect->t = ray->tfar;
+  isect->u = hit->u;
+  isect->v = hit->v;
+  isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(hit_geom);
+  isect->object = object; /* Taken from the caller, not derived from the hit. */
+  isect->type = kernel_data_fetch(objects, object).primitive_type;
+}
+
+/* Ray filter functions. */
+
+/* Intersection filter callback for Embree.
+ * Unlike the occlusion filter, nothing is recorded here: a hit is only rejected
+ * (marked invalid, so Embree continues tracing) when
+ * kernel_embree_is_self_intersection() reports it as a self-intersection. */
+ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args)
+{
+  /* Current implementation in Cycles assumes only single-ray intersection queries. */
+  assert(args->N == 1);
+
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+    *args->valid = 0;
+  }
+}
+
+/* Embree occlusion filter callback, dispatching on the CCLIntersectContext type:
+ * - RAY_SHADOW_ALL: count/record transparent shadow hits, flag opaque hits.
+ * - RAY_LOCAL / RAY_SSS: record hits on the local object in ctx->local_isect.
+ * - RAY_VOLUME_ALL: append hits on objects flagged SD_OBJECT_HAS_VOLUME.
+ * - Otherwise: only reject self-intersections. */
+ccl_device void kernel_embree_filter_occluded_func(const RTCFilterFunctionNArguments *args)
+{
+  /* Current implementation in Cycles assumes only single-ray intersection queries. */
+  assert(args->N == 1);
+
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  switch (ctx->type) {
+    case CCLIntersectContext::RAY_SHADOW_ALL: {
+      Intersection current_isect;
+      kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+      if (intersection_skip_self_shadow(cray->self, current_isect.object, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
+      /* If no transparent shadows or max number of hits exceeded, all light is blocked. */
+      const int flags = intersection_get_shader_flags(kg, current_isect.prim, current_isect.type);
+      if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->num_hits >= ctx->max_hits) {
+        ctx->opaque_hit = true;
+        return;
+      }
+
+      ++ctx->num_hits;
+
+      /* Always use baked shadow transparency for curves. */
+      if (current_isect.type & PRIMITIVE_CURVE) {
+        ctx->throughput *= intersection_curve_shadow_transparency(
+            kg, current_isect.object, current_isect.prim, current_isect.u);
+
+        if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+          ctx->opaque_hit = true;
+          return;
+        }
+        else {
+          *args->valid = 0;
+          return;
+        }
+      }
+
+      /* Test if we need to record this transparent intersection. */
+      const uint max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+      if (ctx->num_recorded_hits < max_record_hits || ray->tfar < ctx->max_t) {
+        /* If maximum number of hits was reached, replace the intersection with the
+         * highest distance. We want to find the N closest intersections. */
+        const uint num_recorded_hits = min(ctx->num_recorded_hits, max_record_hits);
+        uint isect_index = num_recorded_hits;
+        if (num_recorded_hits + 1 >= max_record_hits) {
+          float max_t = ctx->isect_s[0].t;
+          uint max_recorded_hit = 0;
+
+          for (uint i = 1; i < num_recorded_hits; ++i) {
+            if (ctx->isect_s[i].t > max_t) {
+              max_recorded_hit = i;
+              max_t = ctx->isect_s[i].t;
+            }
+          }
+
+          if (num_recorded_hits >= max_record_hits) {
+            isect_index = max_recorded_hit;
+          }
+
+          /* Limit the ray distance and stop counting hits beyond this.
+           * TODO: is there some way we can tell Embree to stop intersecting beyond
+           * this distance when max number of hits is reached?. Or maybe it will
+           * become irrelevant if we make max_hits a very high number on the CPU. */
+          ctx->max_t = max(current_isect.t, max_t);
+        }
+
+        ctx->isect_s[isect_index] = current_isect;
+      }
+
+      /* Always increase the number of recorded hits, even beyond the maximum,
+       * so that we can detect this and trace another ray if needed. */
+      ++ctx->num_recorded_hits;
+
+      /* This tells Embree to continue tracing. */
+      *args->valid = 0;
+      break;
+    }
+    case CCLIntersectContext::RAY_LOCAL:
+    case CCLIntersectContext::RAY_SSS: {
+      /* Check if it's hitting the correct object. */
+      Intersection current_isect;
+      if (ctx->type == CCLIntersectContext::RAY_SSS) {
+        kernel_embree_convert_sss_hit(kg, ray, hit, &current_isect, ctx->local_object_id);
+      }
+      else {
+        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+        if (ctx->local_object_id != current_isect.object) {
+          /* This tells Embree to continue tracing. */
+          *args->valid = 0;
+          break;
+        }
+      }
+      if (intersection_skip_self_local(cray->self, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
+
+      /* No intersection information requested, just return a hit. */
+      if (ctx->max_hits == 0) {
+        break;
+      }
+
+      /* Ignore curves. */
+      if (EMBREE_IS_HAIR(hit->geomID)) {
+        /* This tells Embree to continue tracing. */
+        *args->valid = 0;
+        break;
+      }
+
+      LocalIntersection *local_isect = ctx->local_isect;
+      int hit_idx = 0;
+
+      if (ctx->lcg_state) {
+        /* See triangle_intersect_subsurface() for the native equivalent. */
+        for (int i = min((int)ctx->max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+          if (local_isect->hits[i].t == ray->tfar) {
+            /* This tells Embree to continue tracing. */
+            *args->valid = 0;
+            return;
+          }
+        }
+
+        local_isect->num_hits++;
+
+        if (local_isect->num_hits <= ctx->max_hits) {
+          hit_idx = local_isect->num_hits - 1;
+        }
+        else {
+          /* reservoir sampling: if we are at the maximum number of
+           * hits, randomly replace element or skip it */
+          hit_idx = lcg_step_uint(ctx->lcg_state) % local_isect->num_hits;
+
+          if (hit_idx >= ctx->max_hits) {
+            /* This tells Embree to continue tracing. */
+            *args->valid = 0;
+            return;
+          }
+        }
+      }
+      else {
+        /* Record closest intersection only. */
+        if (local_isect->num_hits && current_isect.t > local_isect->hits[0].t) {
+          *args->valid = 0;
+          return;
+        }
+
+        local_isect->num_hits = 1;
+      }
+
+      /* record intersection */
+      local_isect->hits[hit_idx] = current_isect;
+      local_isect->Ng[hit_idx] = normalize(make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z));
+      /* This tells Embree to continue tracing. */
+      *args->valid = 0;
+      break;
+    }
+    case CCLIntersectContext::RAY_VOLUME_ALL: {
+      /* Append the intersection to the end of the array. */
+      if (ctx->num_hits < ctx->max_hits) {
+        Intersection current_isect;
+        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+        if (intersection_skip_self(cray->self, current_isect.object, current_isect.prim)) {
+          *args->valid = 0;
+          return;
+        }
+
+        Intersection *isect = &ctx->isect_s[ctx->num_hits];
+        ++ctx->num_hits;
+        *isect = current_isect;
+        /* Only primitives from volume object. */
+        uint tri_object = isect->object;
+        int object_flag = kernel_data_fetch(object_flag, tri_object);
+        if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+          --ctx->num_hits;
+        }
+        /* This tells Embree to continue tracing. */
+        *args->valid = 0;
+      }
+      break;
+    }
+    case CCLIntersectContext::RAY_REGULAR:
+    default:
+      if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+        *args->valid = 0;
+        return;
+      }
+      break;
+  }
+}
+
+/* Intersection filter with back-face culling: a hit is rejected when its geometric
+ * normal faces along the ray direction, or when it is a self-intersection. */
+ccl_device void kernel_embree_filter_func_backface_cull(const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+  const float3 ray_dir = make_float3(ray->dir_x, ray->dir_y, ray->dir_z);
+  const float3 Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+
+  /* Always ignore back-facing intersections. */
+  if (dot(ray_dir, Ng) > 0.0f) {
+    *args->valid = 0;
+    return;
+  }
+
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  if (kernel_embree_is_self_intersection(ctx->kg, hit, ctx->ray)) {
+    *args->valid = 0;
+  }
+}
+
+/* Back-face culling variant of kernel_embree_filter_occluded_func(). */
+ccl_device void kernel_embree_filter_occluded_func_backface_cull(
+    const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+  const float3 ray_dir = make_float3(ray->dir_x, ray->dir_y, ray->dir_z);
+  const float3 Ng = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+  /* Always ignore back-facing intersections. */
+  if (dot(ray_dir, Ng) > 0.0f) {
+    *args->valid = 0;
+    return;
+  }
+  kernel_embree_filter_occluded_func(args);
+}
+
+/* Scene intersection. */
+
+ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
+                                                  ccl_private const Ray *ray,
+                                                  const uint visibility,
+                                                  ccl_private Intersection *isect)
+{
+  isect->t = ray->tmax;
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
+  IntersectContext rtc_ctx(&ctx);
+  RTCRayHit ray_hit;
+  ctx.ray = ray;
+  kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
+  rtcIntersect1(kernel_data.device_bvh, &rtc_ctx.context, &ray_hit);
+  /* A hit requires both a valid geometry ID and a valid primitive ID. */
+  const RTCHit &hit = ray_hit.hit;
+  if (hit.geomID != RTC_INVALID_GEOMETRY_ID && hit.primID != RTC_INVALID_GEOMETRY_ID) {
+    kernel_embree_convert_hit(kg, &ray_hit.ray, &hit, isect);
+    return true;
+  }
+  return false;
+}
+
+#ifdef __BVH_LOCAL__
+ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
+                                                        ccl_private const Ray *ray,
+                                                        ccl_private LocalIntersection *local_isect,
+                                                        int local_object,
+                                                        ccl_private uint *lcg_state,
+                                                        int max_hits)
+{
+  const bool has_bvh = !(kernel_data_fetch(object_flag, local_object) &
+                         SD_OBJECT_TRANSFORM_APPLIED);
+  CCLIntersectContext ctx(kg,
+                          has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
+  ctx.lcg_state = lcg_state;
+  ctx.max_hits = max_hits;
+  ctx.ray = ray;
+  ctx.local_isect = local_isect;
+  if (local_isect) {
+    local_isect->num_hits = 0;
+  }
+  ctx.local_object_id = local_object;
+  IntersectContext rtc_ctx(&ctx);
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
+
+  /* If this object has its own BVH, use it (the scene stored as geometry user data). */
+  if (has_bvh) {
+    RTCGeometry geom = rtcGetGeometry(kernel_data.device_bvh, local_object * 2);
+    if (geom) {
+      float3 P = ray->P;
+      float3 dir = ray->D;
+      float3 idir = ray->D;
+      bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir);
+
+      rtc_ray.org_x = P.x;
+      rtc_ray.org_y = P.y;
+      rtc_ray.org_z = P.z;
+      rtc_ray.dir_x = dir.x;
+      rtc_ray.dir_y = dir.y;
+      rtc_ray.dir_z = dir.z;
+      rtc_ray.tnear = ray->tmin;
+      rtc_ray.tfar = ray->tmax;
+      RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
+      kernel_assert(scene);
+      if (scene) {
+        rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
+      }
+    }
+  }
+  else {
+    rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+  }
+
+  /* Hit if any was recorded, or if rtcOccluded1 reported one by setting tfar to -inf. */
+  return (local_isect && local_isect->num_hits > 0) || (rtc_ray.tfar < 0);
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+/* Shadow query: hits get recorded in state->shadow_isect through ctx.isect_s. */
+ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
+                                                             IntegratorShadowStateCPU *state,
+                                                             ccl_private const Ray *ray,
+                                                             uint visibility,
+                                                             uint max_hits,
+                                                             ccl_private uint *num_recorded_hits,
+                                                             ccl_private float *throughput)
+{
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
+  ctx.isect_s = (Intersection *)state->shadow_isect;
+  ctx.max_hits = max_hits;
+  ctx.ray = ray;
+  IntersectContext rtc_ctx(&ctx);
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+  rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+
+  *num_recorded_hits = ctx.num_recorded_hits;
+  *throughput = ctx.throughput;
+  return ctx.opaque_hit;
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
+                                                         ccl_private const Ray *ray,
+                                                         ccl_private Intersection *isect,
+                                                         const uint max_hits,
+                                                         const uint visibility)
+{
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
+  ctx.isect_s = isect; /* Output array that volume hits are appended to. */
+  ctx.max_hits = max_hits;
+  ctx.num_hits = 0;
+  ctx.ray = ray;
+  IntersectContext rtc_ctx(&ctx);
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+  rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+  return ctx.num_hits; /* Hit count accumulated in the context during the query. */
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h
index e1c20169582..1e3e790ca1f 100644
--- a/intern/cycles/kernel/device/cpu/compat.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -3,8 +3,6 @@
 
 #pragma once
 
-#define __KERNEL_CPU__
-
 /* Release kernel has too much false-positive maybe-uninitialized warnings,
  * which makes it possible to miss actual warnings.
  */
@@ -35,52 +33,4 @@ CCL_NAMESPACE_BEGIN
 
 #define kernel_assert(cond) assert(cond)
 
-/* Texture types to be compatible with CUDA textures. These are really just
- * simple arrays and after inlining fetch hopefully revert to being a simple
- * pointer lookup. */
-template<typename T> struct texture {
-  ccl_always_inline const T &fetch(int index) const
-  {
-    kernel_assert(index >= 0 && index < width);
-    return data[index];
-  }
-
-  T *data;
-  int width;
-};
-
-/* Macros to handle different memory storage on different devices */
-
-#ifdef __KERNEL_SSE2__
-typedef vector3<sseb> sse3b;
-typedef vector3<ssef> sse3f;
-typedef vector3<ssei> sse3i;
-
-ccl_device_inline void print_sse3b(const char *label, sse3b &a)
-{
-  print_sseb(label, a.x);
-  print_sseb(label, a.y);
-  print_sseb(label, a.z);
-}
-
-ccl_device_inline void print_sse3f(const char *label, sse3f &a)
-{
-  print_ssef(label, a.x);
-  print_ssef(label, a.y);
-  print_ssef(label, a.z);
-}
-
-ccl_device_inline void print_sse3i(const char *label, sse3i &a)
-{
-  print_ssei(label, a.x);
-  print_ssei(label, a.y);
-  print_ssei(label, a.z);
-}
-
-#  if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-typedef vector3<avxf> avx3f;
-#  endif
-
-#endif
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
index 7e080d428ea..309afae412e 100644
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -12,7 +12,7 @@
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
- * the kernel, to access constant data. These are all stored as "textures", but
+ * the kernel, to access constant data. These are all stored as flat arrays;
  * these are really just standard arrays. We can't use actually globals because
  * multiple renders may be running inside the same process. */
 
@@ -22,11 +22,23 @@ struct OSLThreadData;
 struct OSLShadingSystem;
 #endif
 
+/* Flat kernel data array; the width is kept so fetch() can assert on out-of-range access. */
+template<typename T> struct kernel_array {
+  ccl_always_inline const T &fetch(int index) const
+  {
+    kernel_assert(index >= 0 && index < width);
+    return data[index];
+  }
+
+  T *data;
+  int width;
+};
+
 typedef struct KernelGlobalsCPU {
-#define KERNEL_TEX(type, name) texture<type> name;
-#include "kernel/textures.h"
+#define KERNEL_DATA_ARRAY(type, name) kernel_array<type> name;
+#include "kernel/data_arrays.h"
 
-  KernelData __data;
+  KernelData data;
 
 #ifdef __OSL__
   /* On the CPU, we also have the OSL globals here. Most data structures are shared
@@ -44,8 +56,8 @@ typedef struct KernelGlobalsCPU {
 typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals;
 
 /* Abstraction macros */
-#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
-#define kernel_tex_array(tex) (kg->tex.data)
-#define kernel_data (kg->__data)
+#define kernel_data_fetch(name, index) (kg->name.fetch(index))
+#define kernel_data_array(name) (kg->name.data)
+#define kernel_data (kg->data)
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h
index 7809ec5f4a7..320e6309128 100644
--- a/intern/cycles/kernel/device/cpu/image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -733,7 +733,7 @@ template<typename TexT, typename OutT = float4> struct NanoVDBInterpolator {
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   if (UNLIKELY(!info.data)) {
     return zero_float4();
@@ -776,7 +776,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                              float3 P,
                                              InterpolationType interp)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   if (UNLIKELY(!info.data)) {
     return zero_float4();
diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index b12e3089378..01087c96dd6 100644
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -53,8 +53,8 @@ CCL_NAMESPACE_BEGIN
 
 void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t)
 {
-  if (strcmp(name, "__data") == 0) {
-    kg->__data = *(KernelData *)host;
+  if (strcmp(name, "data") == 0) {
+    kg->data = *(KernelData *)host;
   }
   else {
     assert(0);
@@ -66,13 +66,13 @@ void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem
   if (0) {
   }
 
-#define KERNEL_TEX(type, tname) \
+#define KERNEL_DATA_ARRAY(type, tname) \
   else if (strcmp(name, #tname) == 0) \
   { \
     kg->tname.data = (type *)mem; \
     kg->tname.width = size; \
   }
-#include "kernel/textures.h"
+#include "kernel/data_arrays.h"
   else {
     assert(0);
   }
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
index 0e5f7b4a2fd..0d7c06f4fc6 100644
--- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -34,7 +34,7 @@
 #    include "kernel/integrator/megakernel.h"
 
 #    include "kernel/film/adaptive_sampling.h"
-#    include "kernel/film/id_passes.h"
+#    include "kernel/film/cryptomatte_passes.h"
 #    include "kernel/film/read.h"
 
 #    include "kernel/bake/bake.h"
@@ -169,7 +169,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
   STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
   return false;
 #else
-  return kernel_adaptive_sampling_convergence_check(
+  return film_adaptive_sampling_convergence_check(
       kg, render_buffer, x, y, threshold, reset, offset, stride);
 #endif
 }
@@ -185,7 +185,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCP
 #ifdef KERNEL_STUB
   STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
 #else
-  kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+  film_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
 #endif
 }
 
@@ -200,7 +200,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCP
 #ifdef KERNEL_STUB
   STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
 #else
-  kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+  film_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
 #endif
 }
 
@@ -215,7 +215,7 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *
 #ifdef KERNEL_STUB
   STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
 #else
-  kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+  film_cryptomatte_post(kg, render_buffer, pixel_index);
 #endif
 }
 
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
index e77fcd2b424..f5f7bcf58ee 100644
--- a/intern/cycles/kernel/device/cuda/globals.h
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -20,18 +20,24 @@ struct KernelGlobalsGPU {
 };
 typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
 
-/* Global scene data and textures */
-__constant__ KernelData __data;
-#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
-#include "kernel/textures.h"
+struct KernelParamsCUDA {
+  /* Global scene data and data arrays */
+  KernelData data;
+#define KERNEL_DATA_ARRAY(type, name) const type *name;
+#include "kernel/data_arrays.h"
+
+  /* Integrator state */
+  IntegratorStateGPU integrator_state;
+};
 
-/* Integrator state */
-__constant__ IntegratorStateGPU __integrator_state;
+#ifdef __KERNEL_GPU__
+__constant__ KernelParamsCUDA kernel_params;
+#endif
 
 /* Abstraction macros */
-#define kernel_data __data
-#define kernel_tex_fetch(t, index) t[(index)]
-#define kernel_tex_array(t) (t)
-#define kernel_integrator_state __integrator_state
+#define kernel_data kernel_params.data
+#define kernel_data_fetch(name, index) kernel_params.name[(index)]
+#define kernel_data_array(name) (kernel_params.name)
+#define kernel_integrator_state kernel_params.integrator_state
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 29d851ae478..a8c72645569 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -181,7 +181,7 @@ ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_in
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   /* float4, byte4, ushort4 and half4 */
   const int texture_type = info.data_type;
@@ -216,7 +216,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                              float3 P,
                                              InterpolationType interp)
 {
-  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   if (info.use_transform_3d) {
     P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index d657571a5fa..d7d2000775f 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -14,6 +14,8 @@
 
 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_begin.h"
+#elif defined(__KERNEL_ONEAPI__)
+#  include "kernel/device/oneapi/context_begin.h"
 #endif
 
 #include "kernel/device/gpu/work_stealing.h"
@@ -40,6 +42,8 @@
 
 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_end.h"
+#elif defined(__KERNEL_ONEAPI__)
+#  include "kernel/device/oneapi/context_end.h"
 #endif
 
 #include "kernel/film/read.h"
@@ -242,7 +246,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 ccl_gpu_kernel_postfix
 
 #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
-constant int __dummy_constant [[function_constant(0)]];
+constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
 #endif
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -522,7 +526,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   bool converged = true;
 
   if (x < sw && y < sh) {
-    converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+    converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
         nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
   }
 
@@ -549,7 +553,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (y < sh) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
+        film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
   }
 }
 ccl_gpu_kernel_postfix
@@ -568,7 +572,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (x < sw) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
+        film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
   }
 }
 ccl_gpu_kernel_postfix
@@ -585,7 +589,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int pixel_index = ccl_gpu_global_id_x();
 
   if (pixel_index < num_pixels) {
-    ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
+    ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
   }
 }
 ccl_gpu_kernel_postfix
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 7d7266d5edf..c1df49c4f49 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
-#ifndef __KERNEL_METAL__
+/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
+ * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
+ * and keep device specific code in compat.h */
+
+#ifdef __KERNEL_ONEAPI__
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+template<typename IsActiveOp>
+void cpu_serial_active_index_array_impl(const uint num_states,
+                                        ccl_global int *ccl_restrict indices,
+                                        ccl_global int *ccl_restrict num_indices,
+                                        IsActiveOp is_active_op)
+{
+  int num_active = 0;
+  for (int state = 0; state < num_states; state++) {
+    if (is_active_op(state)) {
+      indices[num_active++] = state;
+    }
+  }
+  *num_indices = num_active;
+}
+#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
+
+template<typename IsActiveOp>
+void gpu_parallel_active_index_array_impl(const uint num_states,
+                                          ccl_global int *ccl_restrict indices,
+                                          ccl_global int *ccl_restrict num_indices,
+                                          IsActiveOp is_active_op)
+{
+  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
+  const uint blocksize = item_id.get_local_range(0);
+
+  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
+                  sycl::access::address_space::local_space>
+      ptr = sycl::ext::oneapi::group_local_memory<
+          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
+  int *warp_offset = *ptr;
+
+  /* NOTE(@nsirgien): Here we compute the same values as in the code below, but
+   * faster for DPC++: CUDA seems to already convert the "%", "/", "*" based
+   * calculations below into something faster, while DPC++ does not, so it is
+   * better to query the needed parameters directly. Switching from this
+   * computation to the one below causes a 2.5x performance slowdown. */
+  const uint thread_index = item_id.get_local_id(0);
+  const uint thread_warp = item_id.get_sub_group().get_local_id();
+
+  const uint warp_index = item_id.get_sub_group().get_group_id();
+  const uint num_warps = item_id.get_sub_group().get_group_range()[0];
+
+  const uint state_index = item_id.get_global_id(0);
+
+  /* Test if state corresponding to this thread is active. */
+  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#else /* !__KERNEL_ONEAPI__ */
+#  ifndef __KERNEL_METAL__
 template<uint blocksize, typename IsActiveOp>
 __device__
-#endif
+#  endif
     void
     gpu_parallel_active_index_array_impl(const uint num_states,
                                          ccl_global int *indices,
                                          ccl_global int *num_indices,
-#ifdef __KERNEL_METAL__
+#  ifdef __KERNEL_METAL__
                                          const uint is_active,
                                          const uint blocksize,
                                          const int thread_index,
@@ -37,7 +90,7 @@ __device__
                                          const int num_warps,
                                          threadgroup int *warp_offset)
 {
-#else
+#  else
                                           IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
@@ -52,18 +105,33 @@ __device__
 
   /* Test if state corresponding to this thread is active. */
   const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
-#endif
-
+#  endif
+#endif /* !__KERNEL_ONEAPI__ */
   /* For each thread within a warp compute how many other active states precede it. */
+#ifdef __KERNEL_ONEAPI__
+  const uint thread_offset = sycl::exclusive_scan_over_group(
+      item_id.get_sub_group(), is_active, std::plus<>());
+#else
   const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                       ccl_gpu_thread_mask(thread_warp));
+#endif
 
   /* Last thread in warp stores number of active states for each warp. */
+#ifdef __KERNEL_ONEAPI__
+  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
+#else
   if (thread_warp == ccl_gpu_warp_size - 1) {
+#endif
     warp_offset[warp_index] = thread_offset + is_active;
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important,
+   * so faster local barriers can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Last thread in block converts per-warp sizes to offsets, increments global size of
    * index array and gets offset to write to. */
@@ -80,7 +148,13 @@ __device__
     warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important,
+   * so faster local barriers can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Write to index array. */
   if (is_active) {
@@ -107,7 +181,19 @@ __device__
                                          simd_group_index, \
                                          num_simd_groups, \
                                          simdgroup_offset)
-
+#elif defined(__KERNEL_ONEAPI__)
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      if (ccl_gpu_global_size_x() == 1) \
+        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
+      else \
+        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
+#  else
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+#  endif
 #else
 
 #  define gpu_parallel_active_index_array( \
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 667352ed12e..648988c31b6 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -62,7 +62,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
-#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << (thread_warp)) - 1) /* Bits below thread_warp. */
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h
index 50f117038a2..3a334b21a9e 100644
--- a/intern/cycles/kernel/device/hip/globals.h
+++ b/intern/cycles/kernel/device/hip/globals.h
@@ -20,18 +20,24 @@ struct KernelGlobalsGPU {
 };
 typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
 
-/* Global scene data and textures */
-__constant__ KernelData __data;
-#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
-#include "kernel/textures.h"
+struct KernelParamsHIP {
+  /* Global scene data and data arrays (each array is declared as a const pointer member). */
+  KernelData data;
+#define KERNEL_DATA_ARRAY(type, name) const type *name;
+#include "kernel/data_arrays.h"
+
+  /* Integrator state, reached through the kernel_integrator_state macro. */
+  IntegratorStateGPU integrator_state;
+};
 
-/* Integrator state */
-__constant__ IntegratorStateGPU __integrator_state;
+#ifdef __KERNEL_GPU__
+__constant__ KernelParamsHIP kernel_params;
+#endif
 
 /* Abstraction macros */
-#define kernel_data __data
-#define kernel_tex_fetch(t, index) t[(index)]
-#define kernel_tex_array(t) (t)
-#define kernel_integrator_state __integrator_state
+#define kernel_data kernel_params.data
+#define kernel_data_fetch(name, index) kernel_params.name[(index)]
+#define kernel_data_array(name) (kernel_params.name)
+#define kernel_integrator_state kernel_params.integrator_state
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/metal/bvh.h b/intern/cycles/kernel/device/metal/bvh.h
new file mode 100644
index 00000000000..03faa3f020f
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/bvh.h
@@ -0,0 +1,360 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* MetalRT implementation of ray-scene intersection. */
+
+#pragma once
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Payload types. */
+
+struct MetalRTIntersectionPayload {
+  RaySelfPrimitives self; /* Copied from ray->self before tracing. */
+  uint visibility;        /* Visibility flags passed in by the caller. */
+  float u, v;             /* Read back as isect->u / isect->v for non-triangle hits. */
+  int prim;               /* Read back as isect->prim on a hit. */
+  int type;               /* Read back as isect->type on a hit. */
+#if defined(__METALRT_MOTION__)
+  float time; /* Copied from ray->time. */
+#endif
+};
+
+struct MetalRTIntersectionLocalPayload {
+  RaySelfPrimitives self;        /* Copied from ray->self before tracing. */
+  uint local_object;             /* Object id passed to scene_intersect_local. */
+  uint lcg_state;                /* Copied in from *lcg_state and written back after tracing. */
+  short max_hits;                /* Caller's int max_hits, narrowed to short. */
+  bool has_lcg_state;            /* Set to true when the caller provides an lcg_state pointer. */
+  bool result;                   /* Starts as false; returned by scene_intersect_local. */
+  LocalIntersection local_isect; /* num_hits starts at 0; copied to the caller after tracing. */
+};
+
+struct MetalRTIntersectionShadowPayload {
+  RaySelfPrimitives self; /* Copied from ray->self before tracing. */
+  uint visibility;        /* Visibility flags passed in by the caller. */
+#if defined(__METALRT_MOTION__)
+  float time; /* Copied from ray->time. */
+#endif
+  int state;               /* The IntegratorShadowState argument, stored as int. */
+  float throughput;        /* Starts at 1.0f; returned through *throughput. */
+  short max_hits;          /* Caller's uint max_hits, narrowed to short. */
+  short num_hits;          /* Starts at 0. */
+  short num_recorded_hits; /* Starts at 0; returned through *num_recorded_hits. */
+  bool result;             /* Starts as false; returned by scene_intersect_shadow_all. */
+};
+
+/* Scene intersection. */
+
+ccl_device_intersect bool scene_intersect(KernelGlobals kg,
+                                          ccl_private const Ray *ray,
+                                          const uint visibility,
+                                          ccl_private Intersection *isect)
+{
+  if (!intersection_ray_valid(ray)) { /* Rejected rays are reported as a miss at ray->tmax. */
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+    return false;
+  }
+
+#if defined(__KERNEL_DEBUG__) /* Debug only: assert and report a miss on null MetalRT resources. */
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+    kernel_assert(!"Invalid ift_default");
+    return false;
+  }
+#endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  if (!kernel_data.bvh.have_curves) { /* No curves in the BVH: assume triangle geometry. */
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
+  payload.u = 0.0f;
+  payload.v = 0.0f;
+  payload.visibility = visibility;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+  uint ray_mask = visibility & 0xFF; /* Low 8 bits of the visibility flags. */
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) { /* Only higher bits set: use the full mask. */
+    ray_mask = 0xFF;
+    /* No further intersector setup required: Default MetalRT behavior is any-hit. */
+  }
+  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    /* No further intersector setup required: Shadow ray early termination is controlled by the
+     * intersection handler */
+  }
+
+#if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_default,
+                                             payload);
+#else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
+#endif
+
+  if (intersection.type == intersection_type::none) { /* Miss: same result as a rejected ray. */
+    isect->t = ray->tmax;
+    isect->type = PRIMITIVE_NONE;
+
+    return false;
+  }
+
+  isect->t = intersection.distance;
+
+  isect->prim = payload.prim;
+  isect->type = payload.type;
+  isect->object = intersection.user_instance_id;
+
+  isect->t = intersection.distance; /* NOTE(review): repeats the assignment made above. */
+  if (intersection.type == intersection_type::triangle) {
+    isect->u = intersection.triangle_barycentric_coord.x;
+    isect->v = intersection.triangle_barycentric_coord.y;
+  }
+  else { /* Non-triangle hit: u, v are taken from the payload. */
+    isect->u = payload.u;
+    isect->v = payload.v;
+  }
+
+  return isect->type != PRIMITIVE_NONE; /* A payload type of PRIMITIVE_NONE is reported as a miss. */
+}
+
+#ifdef __BVH_LOCAL__
+ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
+                                                ccl_private const Ray *ray,
+                                                ccl_private LocalIntersection *local_isect,
+                                                int local_object,
+                                                ccl_private uint *lcg_state,
+                                                int max_hits)
+{
+  if (!intersection_ray_valid(ray)) { /* Rejected rays record no hits. */
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__) /* Debug only: assert and record no hits on null resources. */
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_local)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid ift_local");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) { /* No curves in the BVH: assume triangle geometry. */
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionLocalPayload payload;
+  payload.self = ray->self;
+  payload.local_object = local_object;
+  payload.max_hits = max_hits; /* int narrowed to the payload's short field. */
+  payload.local_isect.num_hits = 0;
+  payload.has_lcg_state = lcg_state ? true : false; /* Defined even when lcg_state is null. */
+  if (lcg_state) {
+    payload.lcg_state = *lcg_state;
+  }
+  payload.result = false;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+#  if defined(__METALRT_MOTION__) /* The full 0xFF ray mask is used in both variants. */
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
+#  endif
+
+  if (lcg_state) { /* Hand the LCG state stored in the payload back to the caller. */
+    *lcg_state = payload.lcg_state;
+  }
+  *local_isect = payload.local_isect; /* NOTE(review): unguarded, unlike the null checks above. */
+
+  return payload.result;
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
+                                                     ccl_private const Ray *ray,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     ccl_private uint *num_recorded_hits,
+                                                     ccl_private float *throughput)
+{
+  if (!intersection_ray_valid(ray)) { /* Rejected rays return false; outputs are not written. */
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__) /* Debug only: assert and return false on null resources. */
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_shadow)) {
+    kernel_assert(!"Invalid ift_shadow");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) { /* No curves in the BVH: assume triangle geometry. */
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionShadowPayload payload;
+  payload.self = ray->self;
+  payload.visibility = visibility;
+  payload.max_hits = max_hits; /* uint narrowed to the payload's short field. */
+  payload.num_hits = 0;
+  payload.num_recorded_hits = 0;
+  payload.throughput = 1.0f;
+  payload.result = false;
+  payload.state = state;
+
+  uint ray_mask = visibility & 0xFF; /* Low 8 bits of the visibility flags. */
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) { /* Only higher bits set: use the full mask. */
+    ray_mask = 0xFF;
+  }
+
+  typename metalrt_intersector_type::result_type intersection;
+
+#  if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_shadow,
+                                             payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_shadow, payload);
+#  endif
+
+  *num_recorded_hits = payload.num_recorded_hits; /* Results are taken from the payload only. */
+  *throughput = payload.throughput;
+
+  return payload.result;
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint visibility)
+{
+  if (!intersection_ray_valid(ray)) { /* Rejected rays return false; isect is left untouched. */
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__) /* Debug only: assert and return false on null resources. */
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
+    kernel_assert(!"Invalid ift_default");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) { /* No curves in the BVH: assume triangle geometry. */
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
+  payload.visibility = visibility;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+  uint ray_mask = visibility & 0xFF; /* Low 8 bits of the visibility flags. */
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) { /* Only higher bits set: use the full mask. */
+    ray_mask = 0xFF;
+  }
+
+#  if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_default,
+                                             payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
+#  endif
+
+  if (intersection.type == intersection_type::none) { /* Miss: isect is left untouched. */
+    return false;
+  }
+
+  isect->prim = payload.prim;
+  isect->type = payload.type;
+  isect->object = intersection.user_instance_id;
+
+  isect->t = intersection.distance;
+  if (intersection.type == intersection_type::triangle) {
+    isect->u = intersection.triangle_barycentric_coord.x;
+    isect->v = intersection.triangle_barycentric_coord.y;
+  }
+  else { /* NOTE(review): payload.u/v are not zero-initialized in this function - confirm. */
+    isect->u = payload.u;
+    isect->v = payload.v;
+  }
+
+  return isect->type != PRIMITIVE_NONE; /* A payload type of PRIMITIVE_NONE is reported as a miss. */
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 0ed52074a90..130a9ebafae 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -29,11 +29,12 @@ using namespace metal::raytracing;
 
 /* Qualifiers */
 
-#if defined(__KERNEL_METAL_APPLE__)
+/* Inline everything for Apple GPUs. This gives ~1.1x speedup and 10% spill
+ * reduction for integrator_shade_surface. However it comes at the cost of
+ * longer compile times (~4.5 minutes on M1 Max) and is disabled for that
+ * reason, until there is a user option to manually enable it. */
 
-/* Inline everything for Apple GPUs.
- * This gives ~1.1x speedup and 10% spill reduction for integator_shade_surface
- * at the cost of longer compile times (~4.5 minutes on M1 Max). */
+#if 0  // defined(__KERNEL_METAL_APPLE__)
 
 #  define ccl_device __attribute__((always_inline))
 #  define ccl_device_inline __attribute__((always_inline))
@@ -45,8 +46,11 @@ using namespace metal::raytracing;
 #  define ccl_device
 #  define ccl_device_inline ccl_device
 #  define ccl_device_forceinline ccl_device
-#  define ccl_device_noinline ccl_device __attribute__((noinline))
-
+#  if defined(__KERNEL_METAL_APPLE__)
+#    define ccl_device_noinline ccl_device
+#  else
+#    define ccl_device_noinline ccl_device __attribute__((noinline))
+#  endif
 #endif
 
 #define ccl_device_noinline_cpu ccl_device
@@ -189,35 +193,46 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
   } volume_write_lambda_pass{kg, this, state};
 
 /* make_type definitions with Metal style element initializers */
-#ifdef make_float2
-#  undef make_float2
-#endif
-#ifdef make_float3
-#  undef make_float3
-#endif
-#ifdef make_float4
-#  undef make_float4
-#endif
-#ifdef make_int2
-#  undef make_int2
-#endif
-#ifdef make_int3
-#  undef make_int3
-#endif
-#ifdef make_int4
-#  undef make_int4
-#endif
-#ifdef make_uchar4
-#  undef make_uchar4
-#endif
-
-#define make_float2(x, y) float2(x, y)
-#define make_float3(x, y, z) float3(x, y, z)
-#define make_float4(x, y, z, w) float4(x, y, z, w)
-#define make_int2(x, y) int2(x, y)
-#define make_int3(x, y, z) int3(x, y, z)
-#define make_int4(x, y, z, w) int4(x, y, z, w)
-#define make_uchar4(x, y, z, w) uchar4(x, y, z, w)
+ccl_device_forceinline float2 make_float2(const float x, const float y)
+{
+  return float2(x, y); /* Metal style element initializer; the same pattern is used below. */
+}
+
+ccl_device_forceinline float3 make_float3(const float x, const float y, const float z)
+{
+  return float3(x, y, z);
+}
+
+ccl_device_forceinline float4 make_float4(const float x,
+                                          const float y,
+                                          const float z,
+                                          const float w)
+{
+  return float4(x, y, z, w);
+}
+
+ccl_device_forceinline int2 make_int2(const int x, const int y)
+{
+  return int2(x, y);
+}
+
+ccl_device_forceinline int3 make_int3(const int x, const int y, const int z)
+{
+  return int3(x, y, z);
+}
+
+ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
+{
+  return int4(x, y, z, w);
+}
+
+ccl_device_forceinline uchar4 make_uchar4(const uchar x,
+                                          const uchar y,
+                                          const uchar z,
+                                          const uchar w)
+{
+  return uchar4(x, y, z, w);
+}
 
 /* Math functions */
 
@@ -260,8 +275,6 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
 
 #ifdef __METALRT__
 
-#  define __KERNEL_GPU_RAYTRACING__
-
 #  if defined(__METALRT_MOTION__)
 #    define METALRT_TAGS instancing, instance_motion, primitive_motion
 #  else
diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h
index b4c8661c401..44ac0478266 100644
--- a/intern/cycles/kernel/device/metal/context_end.h
+++ b/intern/cycles/kernel/device/metal/context_end.h
@@ -7,4 +7,4 @@
 /* NOTE: These macros will need maintaining as entry-points change. */
 
 #undef kernel_integrator_state
-#define kernel_integrator_state context.launch_params_metal.__integrator_state
+#define kernel_integrator_state context.launch_params_metal.integrator_state
diff --git a/intern/cycles/kernel/device/metal/function_constants.h b/intern/cycles/kernel/device/metal/function_constants.h
new file mode 100644
index 00000000000..3adf390c7f6
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/function_constants.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+enum { /* Indices used by the function_constant() attributes declared for Metal below. */
+  Kernel_DummyConstant, /* First enumerator (value 0); generated entries follow it. */
+#define KERNEL_STRUCT_MEMBER(parent, type, name) KernelData_##parent##_##name,
+#include "kernel/data_template.h" /* NOTE(review): presumably undefines the macro - confirm. */
+};
+
+#ifdef __KERNEL_METAL__ /* Declare a kernel_data_<parent>_<name> function constant per member. */
+#  define KERNEL_STRUCT_MEMBER(parent, type, name) \
+    constant type kernel_data_##parent##_##name \
+        [[function_constant(KernelData_##parent##_##name)]];
+#  include "kernel/data_template.h"
+#endif
diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h
index 1c3e775dbae..a336c096440 100644
--- a/intern/cycles/kernel/device/metal/globals.h
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -12,11 +12,11 @@ CCL_NAMESPACE_BEGIN
 
 typedef struct KernelParamsMetal {
 
-#define KERNEL_TEX(type, name) ccl_global const type *name;
-#include "kernel/textures.h"
-#undef KERNEL_TEX
+#define KERNEL_DATA_ARRAY(type, name) ccl_global const type *name;
+#include "kernel/data_arrays.h"
+#undef KERNEL_DATA_ARRAY
 
-  const IntegratorStateGPU __integrator_state;
+  const IntegratorStateGPU integrator_state;
   const KernelData data;
 
 } KernelParamsMetal;
@@ -27,12 +27,10 @@ typedef struct KernelGlobalsGPU {
 
 typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
 
+/* Abstraction macros */
 #define kernel_data launch_params_metal.data
-#define kernel_integrator_state launch_params_metal.__integrator_state
-
-/* data lookup defines */
-
-#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
-#define kernel_tex_array(tex) launch_params_metal.tex
+#define kernel_data_fetch(name, index) launch_params_metal.name[index]
+#define kernel_data_array(name) launch_params_metal.name
+#define kernel_integrator_state launch_params_metal.integrator_state
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
index a7252570e64..5646c7446db 100644
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -1,40 +1,44 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2021-2022 Blender Foundation */
 
-/* Metal kernel entry points */
+/* Metal kernel entry points. */
 
 #include "kernel/device/metal/compat.h"
 #include "kernel/device/metal/globals.h"
+#include "kernel/device/metal/function_constants.h"
 #include "kernel/device/gpu/kernel.h"
 
-/* MetalRT intersection handlers */
+/* MetalRT intersection handlers. */
+
 #ifdef __METALRT__
 
-/* Return type for a bounding box intersection function. */
-struct BoundingBoxIntersectionResult
-{
+/* Intersection return types. */
+
+/* For a bounding box intersection function. */
+struct BoundingBoxIntersectionResult {
   bool accept [[accept_intersection]];
   bool continue_search [[continue_search]];
   float distance [[distance]];
 };
 
-/* Return type for a triangle intersection function. */
-struct TriangleIntersectionResult
-{
+/* For a triangle intersection function. */
+struct TriangleIntersectionResult {
   bool accept [[accept_intersection]];
-  bool continue_search  [[continue_search]];
+  bool continue_search [[continue_search]];
 };
 
 enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX };
 
-ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self,
+/* Utilities. */
+
+ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives &self,
                                               const int object,
                                               const int prim)
 {
   return (self.prim == prim) && (self.object == object);
 }
 
-ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self,
+ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives &self,
                                                      const int object,
                                                      const int prim)
 {
@@ -42,12 +46,14 @@ ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimi
          ((self.light_prim == prim) && (self.light_object == object));
 }
 
-ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self,
+ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives &self,
                                                     const int prim)
 {
   return (self.prim == prim);
 }
 
+/* Hit functions. */
+
 template<typename TReturn, uint intersection_type>
 TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
                           ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload,
@@ -57,9 +63,9 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
                           const float ray_tmax)
 {
   TReturn result;
-  
+
 #ifdef __BVH_LOCAL__
-  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
 
   if ((object != payload.local_object) || intersection_skip_self_local(payload.self, prim)) {
     /* Only intersect with matching object and skip self-intersecton. */
@@ -100,7 +106,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
   }
   else {
     if (payload.local_isect.num_hits && ray_tmax > payload.local_isect.hits[0].t) {
-      /* Record closest intersection only. Do not terminate ray here, since there is no guarantee about distance ordering in any-hit */
+      /* Record closest intersection only. Do not terminate ray here, since there is no guarantee
+       * about distance ordering in any-hit */
       result.accept = false;
       result.continue_search = true;
       return result;
@@ -113,16 +120,16 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
   isect->t = ray_tmax;
   isect->prim = prim;
   isect->object = object;
-  isect->type = kernel_tex_fetch(__objects, object).primitive_type;
+  isect->type = kernel_data_fetch(objects, object).primitive_type;
 
-  isect->u = 1.0f - barycentrics.y - barycentrics.x;
-  isect->v = barycentrics.x;
+  isect->u = barycentrics.x;
+  isect->v = barycentrics.y;
 
   /* Record geometric normal */
-  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect->prim).w;
-  const float3 tri_a = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
-  const float3 tri_b = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
-  const float3 tri_c = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, isect->prim).w;
+  const float3 tri_a = float3(kernel_data_fetch(tri_verts, tri_vindex + 0));
+  const float3 tri_b = float3(kernel_data_fetch(tri_verts, tri_vindex + 1));
+  const float3 tri_c = float3(kernel_data_fetch(tri_verts, tri_vindex + 2));
   payload.local_isect.Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
 
   /* Continue tracing (without this the trace call would return after the first hit) */
@@ -132,21 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 #endif
 }
 
-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                       ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
-                                       uint instance_id [[user_instance_id]],
-                                       uint primitive_id [[primitive_id]],
-                                       float2 barycentrics [[barycentric_coord]],
-                                       float ray_tmax [[distance]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_local_hit_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
+    uint instance_id [[user_instance_id]],
+    uint primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
 {
   return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax);
+      launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax);
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
 {
   /* unused function */
@@ -168,30 +174,21 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
 #ifdef __SHADOW_RECORD_ALL__
 #  ifdef __VISIBILITY_FLAG__
   const uint visibility = payload.visibility;
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
     /* continue search */
     return true;
   }
 #  endif
 
-  if (intersection_skip_self_shadow(payload.self, object, prim)) {
-    /* continue search */
-    return true;
-  }
-
-  float u = 0.0f, v = 0.0f;
+  const float u = barycentrics.x;
+  const float v = barycentrics.y;
   int type = 0;
   if (intersection_type == METALRT_HIT_TRIANGLE) {
-    u = 1.0f - barycentrics.y - barycentrics.x;
-    v = barycentrics.x;
-    type = kernel_tex_fetch(__objects, object).primitive_type;
+    type = kernel_data_fetch(objects, object).primitive_type;
   }
 #  ifdef __HAIR__
   else {
-    u = barycentrics.x;
-    v = barycentrics.y;
-    
-    const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
     type = segment.type;
     prim = segment.prim;
 
@@ -203,6 +200,11 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   }
 #  endif
 
+  if (intersection_skip_self_shadow(payload.self, object, prim)) {
+    /* continue search */
+    return true;
+  }
+
 #  ifndef __TRANSPARENT_SHADOWS__
   /* No transparent shadows support compiled in, make opaque. */
   payload.result = true;
@@ -214,7 +216,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   short num_recorded_hits = payload.num_recorded_hits;
 
   MetalKernelContext context(launch_params_metal);
-  
+
   /* If no transparent shadows, all light is blocked and we can stop immediately. */
   if (num_hits >= max_hits ||
       !(context.intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
@@ -222,7 +224,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
     /* terminate ray */
     return false;
   }
-  
+
   /* Always use baked shadow transparency for curves. */
   if (type & PRIMITIVE_CURVE) {
     float throughput = payload.throughput;
@@ -239,10 +241,10 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
       return true;
     }
   }
-  
+
   payload.num_hits += 1;
   payload.num_recorded_hits += 1;
-  
+
   uint record_index = num_recorded_hits;
 
   const IntegratorShadowState state = payload.state;
@@ -277,7 +279,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
   INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
-  
+
   /* Continue tracing. */
 #  endif /* __TRANSPARENT_SHADOWS__ */
 #endif   /* __SHADOW_RECORD_ALL__ */
@@ -285,26 +287,25 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
   return true;
 }
 
-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                            ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                            unsigned int object [[user_instance_id]],
-                                            unsigned int primitive_id [[primitive_id]],
-                                            float2 barycentrics [[barycentric_coord]],
-                                            float ray_tmax [[distance]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_shadow_all_hit_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    unsigned int object [[user_instance_id]],
+    unsigned int primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
 {
-  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
 
   TriangleIntersectionResult result;
   result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, object, prim, barycentrics, ray_tmax);
+      launch_params_metal, payload, object, prim, barycentrics, ray_tmax);
   result.accept = !result.continue_search;
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
 {
   /* unused function */
@@ -316,15 +317,16 @@ __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]
 }
 
 template<typename TReturnType, uint intersection_type>
-inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_params_metal,
-                                           ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                           const uint object,
-                                           const uint prim,
-                                           const float u)
+inline TReturnType metalrt_visibility_test(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    uint prim,
+    const float u)
 {
   TReturnType result;
-    
-#  ifdef __HAIR__
+
+#ifdef __HAIR__
   if (intersection_type == METALRT_HIT_BOUNDING_BOX) {
     /* Filter out curve endcaps. */
     if (u == 0.0f || u == 1.0f) {
@@ -333,15 +335,23 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
       return result;
     }
   }
-#  endif
+#endif
 
   uint visibility = payload.visibility;
-#  ifdef __VISIBILITY_FLAG__
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+#ifdef __VISIBILITY_FLAG__
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
     result.accept = false;
     result.continue_search = true;
     return result;
   }
+#endif
+
+  if (intersection_type == METALRT_HIT_TRIANGLE) {
+  }
+#  ifdef __HAIR__
+  else {
+    prim = kernel_data_fetch(curve_segments, prim).prim;
+  }
 #  endif
 
   /* Shadow ray early termination. */
@@ -370,25 +380,25 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
   return result;
 }
 
-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
-                                             unsigned int object [[user_instance_id]],
-                                             unsigned int primitive_id [[primitive_id]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_visibility_test_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+    unsigned int object [[user_instance_id]],
+    unsigned int primitive_id [[primitive_id]])
 {
-  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
-  TriangleIntersectionResult result = metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, object, prim, 0.0f);
+  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  TriangleIntersectionResult result =
+      metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+          launch_params_metal, payload, object, prim, 0.0f);
   if (result.accept) {
     payload.prim = prim;
-    payload.type = kernel_tex_fetch(__objects, object).primitive_type;
+    payload.type = kernel_data_fetch(objects, object).primitive_type;
   }
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
 {
   /* Unused function */
@@ -399,45 +409,39 @@ __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance
   return result;
 }
 
+/* Primitive intersection functions. */
+
 #ifdef __HAIR__
-ccl_device_inline
-void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
-                                ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                const uint object,
-                                const uint prim,
-                                const uint type,
-                                const float3 ray_origin,
-                                const float3 ray_direction,
-                                float time,
-                                const float ray_tmax,
-                                thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_curve(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
 #  ifdef __VISIBILITY_FLAG__
   const uint visibility = payload.visibility;
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
     return;
   }
 #  endif
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
-                  launch_params_metal, payload, object, prim, isect.u);
+        launch_params_metal, payload, object, prim, isect.u);
     if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
       payload.u = isect.u;
       payload.v = isect.v;
       payload.prim = prim;
@@ -446,57 +450,46 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
   }
 }
 
-ccl_device_inline
-void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params_metal,
-                                       ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
-                                       const uint object,
-                                       const uint prim,
-                                       const uint type,
-                                       const float3 ray_origin,
-                                       const float3 ray_direction,
-                                       float time,
-                                       const float ray_tmax,
-                                       thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_curve_shadow(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
   const uint visibility = payload.visibility;
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
-                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+        launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
     result.accept = !result.continue_search;
-
-    if (result.accept) {
-      result.distance = isect.t / len;
-    }
   }
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload
+                             [[payload]],
                              const uint object [[user_instance_id]],
                              const uint primitive_id [[primitive_id]],
-                             const float3 ray_origin [[origin]],
-                             const float3 ray_direction [[direction]],
+                             const float3 ray_P [[origin]],
+                             const float3 ray_D [[direction]],
+                             const float ray_tmin [[min_distance]],
                              const float ray_tmax [[max_distance]])
 {
-  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
-  const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
 
   BoundingBoxIntersectionResult result;
   result.accept = false;
@@ -504,30 +497,39 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b
   result.distance = ray_tmax;
 
   if (segment.type & PRIMITIVE_CURVE_RIBBON) {
-    metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+    metalrt_intersection_curve(launch_params_metal,
+                               payload,
+                               object,
+                               segment.prim,
+                               segment.type,
+                               ray_P,
+                               ray_D,
 #  if defined(__METALRT_MOTION__)
                                payload.time,
 #  else
                                0.0f,
 #  endif
-                               ray_tmax, result);
+                               ray_tmin,
+                               ray_tmax,
+                               result);
   }
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
-__intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                    const uint object [[user_instance_id]],
-                                    const uint primitive_id [[primitive_id]],
-                                    const float3 ray_origin [[origin]],
-                                    const float3 ray_direction [[direction]],
-                                    const float ray_tmax [[max_distance]])
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
+__intersection__curve_ribbon_shadow(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    const uint object [[user_instance_id]],
+    const uint primitive_id [[primitive_id]],
+    const float3 ray_P [[origin]],
+    const float3 ray_D [[direction]],
+    const float ray_tmin [[min_distance]],
+    const float ray_tmax [[max_distance]])
 {
-  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
-  const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
 
   BoundingBoxIntersectionResult result;
   result.accept = false;
@@ -535,115 +537,133 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me
   result.distance = ray_tmax;
 
   if (segment.type & PRIMITIVE_CURVE_RIBBON) {
-    metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+    metalrt_intersection_curve_shadow(launch_params_metal,
+                                      payload,
+                                      object,
+                                      segment.prim,
+                                      segment.type,
+                                      ray_P,
+                                      ray_D,
 #  if defined(__METALRT_MOTION__)
-                               payload.time,
+                                      payload.time,
 #  else
-                               0.0f,
+                                      0.0f,
 #  endif
-                               ray_tmax, result);
+                                      ray_tmin,
+                                      ray_tmax,
+                                      result);
   }
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                          ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                          ray_data MetalKernelContext::MetalRTIntersectionPayload &payload
+                          [[payload]],
                           const uint object [[user_instance_id]],
                           const uint primitive_id [[primitive_id]],
-                          const float3 ray_origin [[origin]],
-                          const float3 ray_direction [[direction]],
+                          const float3 ray_P [[origin]],
+                          const float3 ray_D [[direction]],
+                          const float ray_tmin [[min_distance]],
                           const float ray_tmax [[max_distance]])
 {
-  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
-  const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
-    
+  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
+
   BoundingBoxIntersectionResult result;
   result.accept = false;
   result.continue_search = true;
   result.distance = ray_tmax;
-  metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+  metalrt_intersection_curve(launch_params_metal,
+                             payload,
+                             object,
+                             segment.prim,
+                             segment.type,
+                             ray_P,
+                             ray_D,
 #  if defined(__METALRT_MOTION__)
                              payload.time,
 #  else
                              0.0f,
 #  endif
-                             ray_tmax, result);
+                             ray_tmin,
+                             ray_tmax,
+                             result);
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
-__intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                 ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                 const uint object [[user_instance_id]],
-                                 const uint primitive_id [[primitive_id]],
-                                 const float3 ray_origin [[origin]],
-                                 const float3 ray_direction [[direction]],
-                                 const float ray_tmax [[max_distance]])
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
+__intersection__curve_all_shadow(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    const uint object [[user_instance_id]],
+    const uint primitive_id [[primitive_id]],
+    const float3 ray_P [[origin]],
+    const float3 ray_D [[direction]],
+    const float ray_tmin [[min_distance]],
+    const float ray_tmax [[max_distance]])
 {
-  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
-  const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
+  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
 
   BoundingBoxIntersectionResult result;
   result.accept = false;
   result.continue_search = true;
   result.distance = ray_tmax;
 
-  metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+  metalrt_intersection_curve_shadow(launch_params_metal,
+                                    payload,
+                                    object,
+                                    segment.prim,
+                                    segment.type,
+                                    ray_P,
+                                    ray_D,
 #  if defined(__METALRT_MOTION__)
-                             payload.time,
+                                    payload.time,
 #  else
-                             0.0f,
+                                    0.0f,
 #  endif
-                             ray_tmax, result);
+                                    ray_tmin,
+                                    ray_tmax,
+                                    result);
 
   return result;
 }
 #endif /* __HAIR__ */
 
 #ifdef __POINTCLOUD__
-ccl_device_inline
-void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
-                                ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                const uint object,
-                                const uint prim,
-                                const uint type,
-                                const float3 ray_origin,
-                                const float3 ray_direction,
-                                float time,
-                                const float ray_tmax,
-                                thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_point(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
 #  ifdef __VISIBILITY_FLAG__
   const uint visibility = payload.visibility;
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
     return;
   }
 #  endif
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the point intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
-                  launch_params_metal, payload, object, prim, isect.u);
+        launch_params_metal, payload, object, prim, isect.u);
     if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
       payload.u = isect.u;
       payload.v = isect.v;
       payload.prim = prim;
@@ -652,99 +672,108 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
   }
 }
 
-ccl_device_inline
-void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal,
-                                       ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
-                                       const uint object,
-                                       const uint prim,
-                                       const uint type,
-                                       const float3 ray_origin,
-                                       const float3 ray_direction,
-                                       float time,
-                                       const float ray_tmax,
-                                       thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_point_shadow(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
   const uint visibility = payload.visibility;
 
-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the point intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
   Intersection isect;
   isect.t = ray_tmax;
-  /* Transform maximum distance into object space */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
 
   MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
     result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
-                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+        launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
     result.accept = !result.continue_search;
 
     if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
     }
   }
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
-                             const uint object [[user_instance_id]],
-                             const uint primitive_id [[primitive_id]],
-                             const float3 ray_origin [[origin]],
-                             const float3 ray_direction [[direction]],
-                             const float ray_tmax [[max_distance]])
+                      ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                      const uint object [[user_instance_id]],
+                      const uint primitive_id [[primitive_id]],
+                      const float3 ray_origin [[origin]],
+                      const float3 ray_direction [[direction]],
+                      const float ray_tmin [[min_distance]],
+                      const float ray_tmax [[max_distance]])
 {
-  const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
-  const int type = kernel_tex_fetch(__objects, object).primitive_type;
+  const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  const int type = kernel_data_fetch(objects, object).primitive_type;
 
   BoundingBoxIntersectionResult result;
   result.accept = false;
   result.continue_search = true;
   result.distance = ray_tmax;
 
-  metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+  metalrt_intersection_point(launch_params_metal,
+                             payload,
+                             object,
+                             prim,
+                             type,
+                             ray_origin,
+                             ray_direction,
 #  if defined(__METALRT_MOTION__)
                              payload.time,
 #  else
                              0.0f,
 #  endif
-                             ray_tmax, result);
+                             ray_tmin,
+                             ray_tmax,
+                             result);
 
   return result;
 }
 
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                    const uint object [[user_instance_id]],
-                                    const uint primitive_id [[primitive_id]],
-                                    const float3 ray_origin [[origin]],
-                                    const float3 ray_direction [[direction]],
-                                    const float ray_tmax [[max_distance]])
+                             ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload
+                             [[payload]],
+                             const uint object [[user_instance_id]],
+                             const uint primitive_id [[primitive_id]],
+                             const float3 ray_origin [[origin]],
+                             const float3 ray_direction [[direction]],
+                             const float ray_tmin [[min_distance]],
+                             const float ray_tmax [[max_distance]])
 {
-  const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
-  const int type = kernel_tex_fetch(__objects, object).primitive_type;
+  const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  const int type = kernel_data_fetch(objects, object).primitive_type;
 
   BoundingBoxIntersectionResult result;
   result.accept = false;
   result.continue_search = true;
   result.distance = ray_tmax;
 
-  metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+  metalrt_intersection_point_shadow(launch_params_metal,
+                                    payload,
+                                    object,
+                                    prim,
+                                    type,
+                                    ray_origin,
+                                    ray_direction,
 #  if defined(__METALRT_MOTION__)
-                             payload.time,
+                                    payload.time,
 #  else
-                             0.0f,
+                                    0.0f,
 #  endif
-                             ray_tmax, result);
+                                    ray_tmin,
+                                    ray_tmax,
+                                    result);
 
   return result;
 }
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
new file mode 100644
index 00000000000..5c49674f247
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -0,0 +1,194 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_ONEAPI__
+
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#include <cstdint>
+
+#ifndef __NODES_MAX_GROUP__
+#  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+#  define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
+/* This one does not have an abstraction.
+ * It's used by other devices directly.
+ */
+
+#define __device__
+
+/* Qualifier wrappers for different names on different devices */
+
+#define ccl_device
+#define ccl_global
+#define ccl_always_inline __attribute__((always_inline))
+#define ccl_device_inline inline
+#define ccl_noinline __attribute__((noinline))
+#define ccl_inline_constant const constexpr
+#define ccl_static_constant const
+#define ccl_device_forceinline __attribute__((always_inline))
+#define ccl_device_noinline ccl_device ccl_noinline
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_optional_struct_init
+#define ccl_private
+#define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#define ccl_constant const
+#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+#define ccl_align(n) __attribute__((aligned(n)))
+#define kernel_assert(cond)
+#define ccl_may_alias
+
+/* clang-format off */
+
+/* kernel.h adapters */
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
+#define ccl_gpu_kernel_threads(block_num_threads)
+
+#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+#  define KG_ND_ITEMS \
+  kg->nd_item_local_id_0 = item.get_local_id(0); \
+  kg->nd_item_local_range_0 = item.get_local_range(0); \
+  kg->nd_item_group_0 = item.get_group(0); \
+  kg->nd_item_group_range_0 = item.get_group_range(0); \
+  kg->nd_item_global_id_0 = item.get_global_id(0); \
+  kg->nd_item_global_range_0 = item.get_global_range(0);
+#else
+# define KG_ND_ITEMS
+#endif
+
+#define ccl_gpu_kernel_signature(name, ...) \
+void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
+                          size_t kernel_global_size, \
+                          size_t kernel_local_size, \
+                          sycl::handler &cgh, \
+                          __VA_ARGS__) { \
+      (kg); \
+      cgh.parallel_for<class kernel_##name>( \
+          sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
+          [=](sycl::nd_item<1> item) { \
+            KG_ND_ITEMS
+
+#define ccl_gpu_kernel_postfix \
+          }); \
+    }
+
+#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
+
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda \
+  { \
+    KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \
+    ccl_private const ONEAPIKernelContext *kg; \
+    __VA_ARGS__; \
+    int operator()(const int state) const { return (func); } \
+  } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
+
+/* GPU thread, block, grid size and index. Device builds query the current nd_item; SYCL host builds read the copies that KG_ND_ITEMS stores in kg. */
+#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
+#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp))) /* Argument parenthesized so expression arguments expand safely. */
+
+#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
+#else
+#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
+#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
+#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
+#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
+#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp))) /* Argument parenthesized so expression arguments expand safely. */
+
+#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
+#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
+#endif
+
+
+/* GPU warp synchronization. ccl_gpu_ballot evaluates to `.count()` of the sub-group ballot on device and to 0/1 for the calling work-item otherwise. */
+
+#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
+#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
+#ifdef __SYCL_DEVICE_ONLY__
+#  define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), (predicate)).count())
+#else
+#  define ccl_gpu_ballot(predicate) ((predicate) ? 1 : 0) /* Parenthesized so low-precedence expressions keep their meaning. */
+#endif
+
+/* Debug defines */
+#if defined(__SYCL_DEVICE_ONLY__)
+#  define CONSTANT __attribute__((opencl_constant))
+#else
+#  define CONSTANT
+#endif
+
+#define sycl_printf(format, ...) {               \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ );    \
+  }
+
+#define sycl_printf_(format) {               \
+    static const CONSTANT char fmt[] = format;               \
+    sycl::ext::oneapi::experimental::printf(fmt);                  \
+  }
+
+/* GPU texture objects */
+
+/* clang-format on */
+
+/* Types */
+
+/* The SYCL vector types (sycl::float3, sycl::int3, etc.) cannot be used here
+ * because their interfaces differ from the Blender versions of these types. */
+
+using uchar = unsigned char;
+using sycl::half;
+
+/* math functions */
+#define fabsf(x) sycl::fabs((x))
+#define copysignf(x, y) sycl::copysign((x), (y))
+#define asinf(x) sycl::asin((x))
+#define acosf(x) sycl::acos((x))
+#define atanf(x) sycl::atan((x))
+#define floorf(x) sycl::floor((x))
+#define ceilf(x) sycl::ceil((x))
+#define sinhf(x) sycl::sinh((x))
+#define coshf(x) sycl::cosh((x))
+#define tanhf(x) sycl::tanh((x))
+#define hypotf(x, y) sycl::hypot((x), (y))
+#define atan2f(x, y) sycl::atan2((x), (y))
+#define fmaxf(x, y) sycl::fmax((x), (y))
+#define fminf(x, y) sycl::fmin((x), (y))
+#define fmodf(x, y) sycl::fmod((x), (y))
+#define lgammaf(x) sycl::lgamma((x))
+
+#define __forceinline __attribute__((always_inline))
+
+/* Types */
+#include "util/half.h"
+#include "util/types.h"
+
+/* NOTE(@nsirgien): These macros must be defined after the types headers above. Those headers
+ * include oneAPI headers, which transitively include math.h; math.h declares functions with
+ * these same names, so defining the macros before it is included would cause redefinitions,
+ * and having them defined before the math.h include is actually UB. */
+/* Use fast math functions: take them from the sycl::native namespace, which provides the
+ * native math function implementations. */
+#define cosf(x) sycl::native::cos(((float)(x)))
+#define sinf(x) sycl::native::sin(((float)(x)))
+#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
+#define tanf(x) sycl::native::tan(((float)(x)))
+#define logf(x) sycl::native::log(((float)(x)))
+#define expf(x) sycl::native::exp(((float)(x)))
diff --git a/intern/cycles/kernel/device/oneapi/context_begin.h b/intern/cycles/kernel/device/oneapi/context_begin.h
new file mode 100644
index 00000000000..6d6f8cec4ca
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/context_begin.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_NANOVDB
+#  include <nanovdb/NanoVDB.h>
+#  include <nanovdb/util/SampleFromVoxels.h>
+#endif
+
+/* clang-format off */
+struct ONEAPIKernelContext : public KernelGlobalsGPU {
+  public:
+#    include "kernel/device/oneapi/image.h"
+  /* clang-format on */
diff --git a/intern/cycles/kernel/device/oneapi/context_end.h b/intern/cycles/kernel/device/oneapi/context_end.h
new file mode 100644
index 00000000000..ddf0d1f1712
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/context_end.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+}
+; /* end of ONEAPIKernelContext class definition */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state (*(kg->integrator_state))
diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
new file mode 100644
index 00000000000..5dd0d4203a4
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Intel Corporation */
+
+/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
+DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
+DLL_INTERFACE_CALL(oneapi_free, void, void *)
+DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
+
+DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
+DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
+
+DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
+DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
+DLL_INTERFACE_CALL(
+    oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
+DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
+DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
+
+DLL_INTERFACE_CALL(
+    oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
+DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_usm_memset,
+                   bool,
+                   SyclQueue *queue,
+                   void *usm_ptr,
+                   unsigned char value,
+                   size_t num_bytes)
+
+DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
+
+/* Operations on the kernel globals structure: a map of global/constant allocations that is filled
+ * before render/kernel execution. Cycles does not know the `sizeof` of this structure, so it
+ * manages it only as an opaque pointer. */
+DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
+DLL_INTERFACE_CALL(oneapi_set_global_memory,
+                   void,
+                   SyclQueue *queue,
+                   void *kernel_globals,
+                   const char *memory_name,
+                   void *memory_device_pointer)
+
+DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
+                   size_t,
+                   SyclQueue *queue,
+                   const DeviceKernel kernel,
+                   const size_t kernel_global_size)
+DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
+                   bool,
+                   KernelContext *context,
+                   int kernel,
+                   size_t global_size,
+                   void **args)
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
new file mode 100644
index 00000000000..d60f4f135ba
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#include "kernel/integrator/state.h"
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* NOTE(@nsirgien): With SYCL we can't declare a __constant__ global variable that is accessible
+ * from device code, as is done for the Cycles CUDA backend. So the backend allocates these
+ * "constant" memory regions and stores pointers to them in the oneAPI context class. */
+
+struct IntegratorStateGPU;
+struct IntegratorQueueCounter;
+
+typedef struct KernelGlobalsGPU {
+  /* One `const type *__name` pointer, defaulting to nullptr, per kernel/data_arrays.h entry. */
+#define KERNEL_DATA_ARRAY(type, name) const type *__##name = nullptr;
+#include "kernel/data_arrays.h"
+#undef KERNEL_DATA_ARRAY
+  IntegratorStateGPU *integrator_state; /* Accessed through kernel_integrator_state. */
+  const KernelData *__data;             /* Accessed through kernel_data. */
+#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED /* Per-work-item indices, filled in by KG_ND_ITEMS. */
+  size_t nd_item_local_id_0;
+  size_t nd_item_local_range_0;
+  size_t nd_item_group_0;
+  size_t nd_item_group_range_0;
+
+  size_t nd_item_global_id_0;
+  size_t nd_item_global_range_0;
+#endif
+} KernelGlobalsGPU;
+
+typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data (*(__data))
+#define kernel_integrator_state (*(integrator_state))
+
+/* data lookup defines */
+
+#define kernel_data_fetch(name, index) __##name[index]
+#define kernel_data_array(name) __##name
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h
new file mode 100644
index 00000000000..2417b8eac3b
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/image.h
@@ -0,0 +1,383 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+CCL_NAMESPACE_BEGIN
+
+/* For oneAPI implementation we do manual lookup and interpolation. */
+/* TODO: share implementation with ../cpu/image.h. */
+
+template<typename T> ccl_device_forceinline T tex_fetch(const TextureInfo &info, int index)
+{
+  return reinterpret_cast<ccl_global T *>(info.data)[index]; /* View info.data as a T array. */
+}
+
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+  /* Map x into [0, width): `%` keeps the sign of the dividend, so a negative
+   * remainder is shifted up by one period. */
+  const int remainder = x % width;
+  return (remainder < 0) ? remainder + width : remainder;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+  return clamp(x, 0, width - 1); /* Presumably pins x to [0, width - 1]; clamp() is external. */
+}
+
+ccl_device_inline float4 svm_image_texture_read(const TextureInfo &info, int x, int y, int z)
+{
+  const int data_offset = x + info.width * y + info.width * info.height * z; /* x-major index. */
+  const int texture_type = info.data_type;
+  /* NOTE(review): the index is stored in an int; confirm very large volumes cannot overflow. */
+  /* Float4: returned exactly as stored. */
+  if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
+    return tex_fetch<float4>(info, data_offset);
+  }
+  /* Byte4: each 8-bit channel is scaled by 1/255. */
+  else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
+    uchar4 r = tex_fetch<uchar4>(info, data_offset);
+    float f = 1.0f / 255.0f;
+    return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+  }
+  /* Ushort4: each 16-bit channel is scaled by 1/65535. */
+  else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
+    ushort4 r = tex_fetch<ushort4>(info, data_offset);
+    float f = 1.0f / 65535.f;
+    return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+  }
+  /* Float: single channel replicated to RGB, alpha set to 1. */
+  else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
+    float f = tex_fetch<float>(info, data_offset);
+    return make_float4(f, f, f, 1.0f);
+  }
+  /* UShort: single 16-bit channel scaled by 1/65535 and replicated to RGB, alpha set to 1. */
+  else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
+    ushort r = tex_fetch<ushort>(info, data_offset);
+    float f = r * (1.0f / 65535.0f);
+    return make_float4(f, f, f, 1.0f);
+  }
+  else if (texture_type == IMAGE_DATA_TYPE_HALF) { /* Half: one channel, replicated to RGB. */
+    float f = tex_fetch<half>(info, data_offset);
+    return make_float4(f, f, f, 1.0f);
+  }
+  else if (texture_type == IMAGE_DATA_TYPE_HALF4) { /* Half4: four channels widened to float. */
+    half4 r = tex_fetch<half4>(info, data_offset);
+    return make_float4(r.x, r.y, r.z, r.w);
+  }
+  /* Byte: fallback for every remaining type; one 8-bit channel scaled by 1/255. */
+  else {
+    uchar r = tex_fetch<uchar>(info, data_offset);
+    float f = r * (1.0f / 255.0f);
+    return make_float4(f, f, f, 1.0f);
+  }
+}
+
+ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y)
+{
+  const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+  /* Wrap: bring (x, y) inside the image according to the texture's extension mode. */
+  if (info.extension == EXTENSION_REPEAT) {
+    x = svm_image_texture_wrap_periodic(x, info.width);
+    y = svm_image_texture_wrap_periodic(y, info.height);
+  }
+  else if (info.extension == EXTENSION_EXTEND) {
+    x = svm_image_texture_wrap_clamp(x, info.width);
+    y = svm_image_texture_wrap_clamp(y, info.height);
+  }
+  else { /* Any other mode: texels outside the image read as all zeros. */
+    if (x < 0 || x >= info.width || y < 0 || y >= info.height) {
+      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    }
+  }
+
+  return svm_image_texture_read(info, x, y, 0); /* 2D images are read from slice z = 0. */
+}
+
+ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z)
+{
+  const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+  /* Wrap: bring (x, y, z) inside the volume according to the texture's extension mode. */
+  if (info.extension == EXTENSION_REPEAT) {
+    x = svm_image_texture_wrap_periodic(x, info.width);
+    y = svm_image_texture_wrap_periodic(y, info.height);
+    z = svm_image_texture_wrap_periodic(z, info.depth);
+  }
+  else if (info.extension == EXTENSION_EXTEND) {
+    x = svm_image_texture_wrap_clamp(x, info.width);
+    y = svm_image_texture_wrap_clamp(y, info.height);
+    z = svm_image_texture_wrap_clamp(z, info.depth);
+  }
+  else { /* Any other mode: texels outside the volume read as all zeros. */
+    if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) {
+      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    }
+  }
+
+  return svm_image_texture_read(info, x, y, z);
+}
+
+static float svm_image_texture_frac(float x, int *ix)
+{
+  int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); /* One below the converted value if x < 0. */
+  *ix = i;             /* Integer part is returned through the out-parameter. */
+  return x - (float)i; /* Remainder relative to that integer part. */
+}
+
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+  { \
+    u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
+    u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
+    u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
+    u[3] = (1.0f / 6.0f) * t * t * t; \
+  } \
+  (void)0
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float y)
+{
+  const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+  if (info.interpolation == INTERPOLATION_CLOSEST) {
+    /* Closest interpolation: read the single texel the scaled coordinate falls in. */
+    int ix, iy;
+    svm_image_texture_frac(x * info.width, &ix);
+    svm_image_texture_frac(y * info.height, &iy);
+
+    return svm_image_texture_read_2d(id, ix, iy);
+  }
+  else if (info.interpolation == INTERPOLATION_LINEAR) {
+    /* Bilinear interpolation: blend the 2x2 neighborhood with the fractional offsets tx, ty. */
+    int ix, iy;
+    float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
+    float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
+
+    float4 r;
+    r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy);
+    r += (1.0f - ty) * tx * svm_image_texture_read_2d(id, ix + 1, iy);
+    r += ty * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy + 1);
+    r += ty * tx * svm_image_texture_read_2d(id, ix + 1, iy + 1);
+    return r;
+  }
+  else {
+    /* Bicubic interpolation: weighted sum over the 4x4 neighborhood around (ix, iy). */
+    int ix, iy;
+    float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
+    float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
+
+    float u[4], v[4];
+    SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+    SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+    float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    /* Loop counters are named dx/dy so they no longer shadow the x/y parameters. */
+    for (int dy = 0; dy < 4; dy++) {
+      for (int dx = 0; dx < 4; dx++) {
+        float weight = u[dx] * v[dy];
+        r += weight * svm_image_texture_read_2d(id, ix + dx - 1, iy + dy - 1);
+      }
+    }
+    return r;
+  }
+}
+
+#ifdef WITH_NANOVDB
+template<typename T> struct NanoVDBInterpolator {
+  /* Samples a NanoVDB grid of value type T with closest, linear or cubic filtering. */
+  typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+
+  static ccl_always_inline float4 read(float r)
+  {
+    return make_float4(r, r, r, 1.0f); /* Scalar value replicated to RGB, alpha set to 1. */
+  }
+
+  static ccl_always_inline float4 read(nanovdb::Vec3f r)
+  {
+    return make_float4(r[0], r[1], r[2], 1.0f); /* Vector value used as RGB, alpha set to 1. */
+  }
+
+  static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
+                                                    float x,
+                                                    float y,
+                                                    float z)
+  {
+    const nanovdb::Vec3f xyz(x, y, z);
+    return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz));
+  }
+
+  static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
+                                                   float x,
+                                                   float y,
+                                                   float z)
+  {
+    const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
+    return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz));
+  }
+
+  static float4 interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
+  {
+    int ix, iy, iz;
+    int nix, niy, niz;
+    int pix, piy, piz;
+    int nnix, nniy, nniz;
+    /* Tri-cubic b-spline interpolation. */
+    const float tx = svm_image_texture_frac(x - 0.5f, &ix);
+    const float ty = svm_image_texture_frac(y - 0.5f, &iy);
+    const float tz = svm_image_texture_frac(z - 0.5f, &iz);
+    pix = ix - 1;
+    piy = iy - 1;
+    piz = iz - 1;
+    nix = ix + 1;
+    niy = iy + 1;
+    niz = iz + 1;
+    nnix = ix + 2;
+    nniy = iy + 2;
+    nniz = iz + 2;
+
+    const int xc[4] = {pix, ix, nix, nnix};
+    const int yc[4] = {piy, iy, niy, nniy};
+    const int zc[4] = {piz, iz, niz, nniz};
+    float u[4], v[4], w[4];
+
+    /* Helper macros that keep the code a reasonable size and
+     * let the compiler inline all the matrix multiplications.
+     */
+#  define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
+#  define COL_TERM(col, row) \
+    (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+               u[3] * DATA(3, col, row)))
+#  define ROW_TERM(row) \
+    (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+    SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+    SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+    SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+    /* Actual interpolation: sum of the four weighted z-slices of the 4x4x4 neighborhood. */
+    return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#  undef COL_TERM
+#  undef ROW_TERM
+#  undef DATA
+  }
+
+  static ccl_always_inline float4
+  interp_3d(const TextureInfo &info, float x, float y, float z, int interp)
+  {
+    using namespace nanovdb;
+
+    NanoGrid<T> *const grid = (NanoGrid<T> *)info.data; /* info.data is the grid address. */
+    AccessorType acc = grid->getAccessor();
+
+    switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+      case INTERPOLATION_CLOSEST:
+        return interp_3d_closest(acc, x, y, z);
+      case INTERPOLATION_LINEAR:
+        return interp_3d_linear(acc, x, y, z);
+      default: /* Every other mode falls back to cubic filtering. */
+        return interp_3d_cubic(acc, x, y, z);
+    }
+  }
+};
+#endif /* WITH_NANOVDB */
+
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, int interp)
+{
+  const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+  if (info.use_transform_3d) {
+    Transform tfm = info.transform_3d;
+    P = transform_point(&tfm, P);
+  }
+
+  float x = P.x;
+  float y = P.y;
+  float z = P.z;
+
+  uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
+
+#ifdef WITH_NANOVDB
+  if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
+    return NanoVDBInterpolator<float>::interp_3d(info, x, y, z, interpolation);
+  }
+  else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+    return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, x, y, z, interpolation);
+  }
+  else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN) {
+    return NanoVDBInterpolator<nanovdb::FpN>::interp_3d(info, x, y, z, interpolation);
+  }
+  else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
+    return NanoVDBInterpolator<nanovdb::Fp16>::interp_3d(info, x, y, z, interpolation);
+  }
+#else
+  if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
+      info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
+      info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN ||
+      info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
+    return make_float4(
+        TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+  }
+#endif
+  else { /* Dense texture: scale the coordinate by the texture dimensions. */
+    x *= info.width;
+    y *= info.height;
+    z *= info.depth;
+  }
+
+  if (interpolation == INTERPOLATION_CLOSEST) {
+    /* Closest interpolation: read the single texel the scaled coordinate falls in. */
+    int ix, iy, iz;
+    svm_image_texture_frac(x, &ix);
+    svm_image_texture_frac(y, &iy);
+    svm_image_texture_frac(z, &iz);
+
+    return svm_image_texture_read_3d(id, ix, iy, iz);
+  }
+  else if (interpolation == INTERPOLATION_LINEAR) {
+    /* Trilinear interpolation: blend the 2x2x2 neighborhood with offsets tx, ty, tz. */
+    int ix, iy, iz;
+    float tx = svm_image_texture_frac(x - 0.5f, &ix);
+    float ty = svm_image_texture_frac(y - 0.5f, &iy);
+    float tz = svm_image_texture_frac(z - 0.5f, &iz);
+
+    float4 r;
+    r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz);
+    r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz);
+    r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz);
+    r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz);
+
+    r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz + 1);
+    r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz + 1);
+    r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz + 1);
+    r += tz * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz + 1);
+    return r;
+  }
+  else {
+    /* Tri-cubic interpolation: weighted sum over the 4x4x4 neighborhood around (ix, iy, iz). */
+    int ix, iy, iz;
+    float tx = svm_image_texture_frac(x - 0.5f, &ix);
+    float ty = svm_image_texture_frac(y - 0.5f, &iy);
+    float tz = svm_image_texture_frac(z - 0.5f, &iz);
+
+    float u[4], v[4], w[4];
+    SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+    SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+    SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+    float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    /* Loop counters are named dx/dy/dz so they no longer shadow the float x/y/z above. */
+    for (int dz = 0; dz < 4; dz++) {
+      for (int dy = 0; dy < 4; dy++) {
+        for (int dx = 0; dx < 4; dx++) {
+          float weight = u[dx] * v[dy] * w[dz];
+          r += weight * svm_image_texture_read_3d(id, ix + dx - 1, iy + dy - 1, iz + dz - 1);
+        }
+      }
+    }
+    return r;
+  }
+}
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
new file mode 100644
index 00000000000..097d21b963f
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -0,0 +1,929 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+/* clang-format off */
+#  include "kernel.h"
+#  include <iostream>
+#  include <map>
+#  include <set>
+
+#  include <CL/sycl.hpp>
+
+#  include "kernel/device/oneapi/compat.h"
+#  include "kernel/device/oneapi/globals.h"
+#  include "kernel/device/oneapi/kernel_templates.h"
+
+#  include "kernel/device/gpu/kernel.h"
+/* clang-format on */
+
+static OneAPIErrorCallback s_error_cb = nullptr;
+static void *s_error_user_ptr = nullptr;
+
+static std::vector<sycl::device> oneapi_available_devices();
+
+void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
+{
+  s_error_cb = cb;             /* Invoked with e.what() when a SYCL exception is caught. */
+  s_error_user_ptr = user_ptr; /* Passed back unchanged as the callback's second argument. */
+}
+
+void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
+{
+#  ifdef _DEBUG /* Debug-only sanity check; the function body is empty otherwise. */
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  sycl::info::device_type device_type =
+      queue->get_device().get_info<sycl::info::device::device_type>();
+  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
+  (void)usm_type; /* Keeps the variable "used" when assert() compiles to nothing. */
+  assert(usm_type == sycl::usm::alloc::device || /* Device USM is always accepted. */
+         ((device_type == sycl::info::device_type::host || /* Host USM needs one of these. */
+           device_type == sycl::info::device_type::cpu || allow_host) && /* `cpu`, not `is_cpu`. */
+          usm_type == sycl::usm::alloc::host));
+#  endif
+}
+
+bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
+{
+  bool finished_correct = true; /* Cleared only when a SYCL exception is caught below. */
+  try {
+    std::vector<sycl::device> devices = oneapi_available_devices();
+    if (device_index < 0 || static_cast<size_t>(device_index) >= devices.size()) {
+      return false; /* Index out of range: external_queue is left untouched. */
+    }
+    sycl::queue *created_queue = new sycl::queue(devices[device_index],
+                                                 sycl::property::queue::in_order());
+    external_queue = reinterpret_cast<SyclQueue *>(created_queue); /* Opaque handle for callers. */
+  }
+  catch (sycl::exception const &e) {
+    finished_correct = false;
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+  }
+  return finished_correct;
+}
+
+void oneapi_free_queue(SyclQueue *queue_)
+{
+  assert(queue_);
+  /* The opaque handle is really a heap-allocated sycl::queue (see oneapi_create_queue). */
+  delete reinterpret_cast<sycl::queue *>(queue_);
+}
+
+void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
+{
+  assert(queue_);
+  sycl::queue &sycl_queue = *reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::aligned_alloc_host(alignment, memory_size, sycl_queue); /* Alignment first. */
+}
+
+void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
+{
+  assert(queue_);
+  sycl::queue &sycl_queue = *reinterpret_cast<sycl::queue *>(queue_);
+  return sycl::malloc_device(memory_size, sycl_queue); /* Device USM for the queue's device. */
+}
+
+void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
+{
+  assert(queue_);
+  /* Debug builds assert that the pointer is device USM or (allowed here) host USM. */
+  oneapi_check_usm(queue_, usm_ptr, true);
+  sycl::free(usm_ptr, *reinterpret_cast<sycl::queue *>(queue_));
+}
+
+bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  oneapi_check_usm(queue_, dest, true);
+  oneapi_check_usm(queue_, src, true);
+  sycl::event mem_event = queue->memcpy(dest, src, num_bytes); /* Enqueue the copy. */
+#  ifdef WITH_CYCLES_DEBUG
+  try {
+    /* NOTE(@nsirgien) Waiting on the memory operation may give more precise error messages.
+     * Due to the impact on occupancy, it makes sense to enable it only during Cycles debug.
+     */
+    mem_event.wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+#  else /* Release path: block only when the host may read the result right away. */
+  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
+  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
+  bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
+                             src_type == sycl::usm::alloc::device;
+  bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
+                                          src_type == sycl::usm::alloc::unknown;
+  /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
+   * may not wait until the end of the transfer before using the memory.
+   * NOTE(review): unlike the debug path, an exception from wait() is not caught here. */
+  if (from_device_to_host || host_or_device_memop_with_offset)
+    mem_event.wait();
+  return true;
+#  endif
+}
+
+bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  oneapi_check_usm(queue_, usm_ptr, true);
+  sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); /* Enqueue the fill. */
+#  ifdef WITH_CYCLES_DEBUG
+  try {
+    /* NOTE(@nsirgien) Waiting on the memory operation may give more precise error messages.
+     * Due to the impact on occupancy, it makes sense to enable it only during Cycles debug.
+     */
+    mem_event.wait_and_throw();
+    return true;
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+#  else /* Release path: the fill is only enqueued, never waited on here. */
+  (void)mem_event; /* The event is intentionally unused in this configuration. */
+  return true;
+#  endif
+}
+
+bool oneapi_queue_synchronize(SyclQueue *queue_)
+{
+  assert(queue_);
+  bool synchronized = true; /* Cleared when the wait reports a SYCL exception. */
+  try {
+    reinterpret_cast<sycl::queue *>(queue_)->wait_and_throw();
+  }
+  catch (sycl::exception const &e) {
+    synchronized = false;
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+  }
+  return synchronized;
+}
+
+/* NOTE(@nsirgien): Executing this simple kernel checks basic functionality and
+ * also triggers runtime compilation of all existing oneAPI kernels. */
+bool oneapi_run_test_kernel(SyclQueue *queue_)
+{
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  size_t N = 8;
+  sycl::buffer<float, 1> A(N);
+  sycl::buffer<float, 1> B(N);
+
+  { /* Fill A on the host; the scope ends the host accessor before the kernel is submitted. */
+    sycl::host_accessor A_host_acc(A, sycl::write_only);
+    for (size_t i = (size_t)0; i < N; i++)
+      A_host_acc[i] = rand() % 32;
+  }
+
+  try {
+    queue->submit([&](sycl::handler &cgh) {
+      sycl::accessor A_acc(A, cgh, sycl::read_only);
+      sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
+
+      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
+    });
+    queue->wait_and_throw();
+
+    sycl::host_accessor A_host_acc(A, sycl::read_only);
+    sycl::host_accessor B_host_acc(B, sycl::read_only);
+
+    for (size_t i = (size_t)0; i < N; i++) {
+      float result = A_host_acc[i] + B_host_acc[i];
+      (void)result; /* NOTE(review): values are read back but never validated. */
+    }
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+
+  return true; /* Reached whenever no SYCL exception was thrown. */
+}
+
+bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
+{
+  (void)queue_; /* Unused: sizeof() is evaluated at compile time. */
+  kernel_global_size = sizeof(KernelGlobalsGPU);
+  return true;
+}
+
+void oneapi_set_global_memory(SyclQueue *queue_,
+                              void *kernel_globals,
+                              const char *memory_name,
+                              void *memory_device_pointer)
+{
+  assert(queue_);
+  assert(kernel_globals);
+  assert(memory_name);
+  assert(memory_device_pointer);
+  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
+  oneapi_check_usm(queue_, memory_device_pointer);
+  oneapi_check_usm(queue_, kernel_globals, true);
+  /* Store `memory_device_pointer` in the member of `globals` selected by `memory_name`. */
+  std::string matched_name(memory_name);
+  /* The chain starts with two explicit names; kernel/data_arrays.h presumably adds the rest. */
+/* Each KERNEL_DATA_ARRAY expansion assigns `globals->__<name>` when `#name` matches. */
+#  define KERNEL_DATA_ARRAY(type, name) \
+    else if (#name == matched_name) \
+    { \
+      globals->__##name = (type *)memory_device_pointer; \
+      return; \
+    }
+  if (false) {
+  }
+  else if ("integrator_state" == matched_name) {
+    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
+    return;
+  }
+  KERNEL_DATA_ARRAY(KernelData, data)
+#  include "kernel/data_arrays.h"
+  else
+  {
+    std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
+              << std::endl;
+    assert(false); /* Unknown names are only fatal when asserts are enabled. */
+  }
+#  undef KERNEL_DATA_ARRAY
+}
+
+/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
+/* TODO: Move below function to oneapi/queue.cpp. */
+size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
+                                          const DeviceKernel kernel,
+                                          const size_t kernel_global_size)
+{
+  assert(queue_);
+  (void)kernel_global_size;
+
+  /* Work-group size wished for `kernel`; 512 unless the kernel is listed in the switch. */
+  size_t wanted_size = 512;
+  switch (kernel) {
+    /* Path initialization, intersection and shading kernels. */
+    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+      wanted_size = 32;
+      break;
+
+    /* "Technical" kernels: path array building, state compaction, reset, split counting. */
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
+    case DEVICE_KERNEL_INTEGRATOR_RESET:
+    case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+      wanted_size = 1024;
+      break;
+    default:
+      break;
+  }
+
+  /* Never exceed what the device behind the queue supports. */
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue_)->get_device();
+  const size_t device_limit = device.get_info<sycl::info::device::max_work_group_size>();
+  return std::min(device_limit, wanted_size);
+}
+
+/* Submit the kernel identified by `kernel` (a DeviceKernel value) to the queue stored in
+ * `kernel_context`, unpacking its arguments from `args`. Returns false when the queue is null
+ * or when the submission throws a SYCL exception. */
+bool oneapi_enqueue_kernel(
+    KernelContext *kernel_context, int kernel, size_t global_size, void **args)
+{
+  bool success = true;
+  ::DeviceKernel device_kernel = (::DeviceKernel)kernel;
+  KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals;
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(kernel_context->queue);
+  assert(queue);
+  if (!queue) {
+    return false;
+  }
+
+  size_t local_size = oneapi_kernel_preferred_local_size(
+      kernel_context->queue, device_kernel, global_size);
+  assert(global_size % local_size == 0);
+
+  /* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we
+   * overwrite it outside of oneapi_kernel_preferred_local_size. */
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) {
+    local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
+  }
+
+  /* Kernels listed below need a specific number of work groups. */
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
+      device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
+      device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
+      device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
+      device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
+      device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
+      device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) {
+    int num_states = *((int *)(args[0]));
+    /* Round up to the next work-group. */
+    size_t groups_count = (num_states + local_size - 1) / local_size;
+    /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
+     * we extend work size to fit uniformity requirements. */
+    global_size = groups_count * local_size;
+
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+    if (queue->get_device().is_host()) {
+      global_size = 1;
+      local_size = 1;
+    }
+#  endif
+  }
+
+  /* Let the compiler throw an error if there are any kernels missing in this implementation. */
+#  if defined(_WIN32)
+#    pragma warning(error : 4062)
+#  elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic error "-Wswitch"
+#  endif
+
+  try {
+    queue->submit([&](sycl::handler &cgh) {
+      switch (device_kernel) {
+        case DEVICE_KERNEL_INTEGRATOR_RESET: {
+          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_intersect_subsurface);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_intersect_volume_stack);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_shade_surface_raytrace);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_queued_shadow_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_terminated_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_terminated_shadow_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_compact_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_compact_shadow_paths_array);
+          break;
+        }
+        case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_adaptive_sampling_convergence_check);
+          break;
+        }
+        case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x);
+          break;
+        }
+        case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y);
+          break;
+        }
+        case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: {
+          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace);
+          break;
+        }
+        case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background);
+          break;
+        }
+        case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_shader_eval_curve_shadow_transparency);
+          break;
+        }
+        case DEVICE_KERNEL_PREFIX_SUM: {
+          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum);
+          break;
+        }
+
+        /* clang-format off */
+    #  define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
+    case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \
+      oneapi_call(kg, cgh, \
+                            global_size, \
+                            local_size, \
+                            args, \
+                            oneapi_kernel_film_convert_##variant); \
+      break; \
+     }
+
+#  define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \
+      DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
+      DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba)
+
+      DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH);
+      DEVICE_KERNEL_FILM_CONVERT(mist, MIST);
+      DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
+      DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
+      DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
+      DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3);
+      DEVICE_KERNEL_FILM_CONVERT(motion, MOTION);
+      DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE);
+      DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER);
+      DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow,
+                                 SHADOW_CATCHER_MATTE_WITH_SHADOW);
+      DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED);
+      DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4);
+
+#  undef DEVICE_KERNEL_FILM_CONVERT
+#  undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL
+          /* clang-format on */
+
+        case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess);
+          break;
+        }
+        case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_filter_guiding_set_fake_albedo);
+          break;
+        }
+        case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess);
+          break;
+        }
+        case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess);
+          break;
+        }
+        case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_compact_shadow_states);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: {
+          oneapi_call(kg,
+                      cgh,
+                      global_size,
+                      local_size,
+                      args,
+                      oneapi_kernel_integrator_shadow_catcher_count_possible_splits);
+          break;
+        }
+        /* Unsupported kernels */
+        case DEVICE_KERNEL_NUM:
+        case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+          kernel_assert(0);
+          break;
+      }
+    });
+  }
+  catch (sycl::exception const &e) {
+    success = false; /* Report the failure even when no error callback is registered. */
+    if (s_error_cb)
+      s_error_cb(e.what(), s_error_user_ptr);
+  }
+
+#  if defined(_WIN32)
+#    pragma warning(default : 4062)
+#  elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
+#  endif
+  return success;
+}
+
+/* Compute-runtime (i.e. NEO) version is what gets returned by sycl/L0 on Windows
+ * since Windows driver 101.3268. */
+/* The same minimum compute-runtime version is currently required across Windows and Linux.
+ * For Windows driver 101.3268 (encoded here as 101 * 10000 + 3268), that version is 23570. */
+static const int lowest_supported_driver_version_win = 1013268;
+static const int lowest_supported_driver_version_neo = 23570;
+
+/* Extract a comparable build number from the driver version string of `device`: "a.b.c" gives
+ * c and "a.b.ccc.dddd" gives ccc * 10000 + dddd. Returns 0 if the string has another format. */
+static int parse_driver_build_version(const sycl::device &device)
+{
+  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
+  int driver_build_version = 0;
+  const auto report_unparsable_version = [&]() {
+    std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
+              << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
+              << " xx.xx.xxx.xxxx (Windows) for device \""
+              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
+  };
+
+  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
+  if (second_dot_position == std::string::npos) {
+    report_unparsable_version();
+    return driver_build_version;
+  }
+  try {
+    size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
+    if (third_dot_position != std::string::npos) {
+      const std::string &third_number_substr = driver_version.substr(
+          second_dot_position + 1, third_dot_position - second_dot_position - 1);
+      const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
+      if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
+        driver_build_version = std::stoi(third_number_substr) * 10000 +
+                               std::stoi(forth_number_substr);
+    }
+    else {
+      const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
+      driver_build_version = std::stoi(third_number_substr);
+    }
+  }
+  catch (const std::logic_error &) { /* invalid_argument or out_of_range from std::stoi(). */
+    report_unparsable_version();
+  }
+  return driver_build_version;
+}
+
+/* Enumerate the SYCL devices Cycles may use.
+ * OpenCL platforms are always skipped. By default only Level-Zero GPUs that pass the
+ * execution-unit and driver version checks below are returned; CYCLES_ONEAPI_ALL_DEVICES
+ * lifts those checks. */
+static std::vector<sycl::device> oneapi_available_devices()
+{
+  /* Defining CYCLES_ONEAPI_ALL_DEVICES (any value) bypasses the hardware and driver filters. */
+  const bool allow_all_devices = getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr;
+
+    /* Host device is useful only for debugging at the moment
+     * so we hide this device with default build settings. */
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+  const bool allow_host = true;
+#  else
+  const bool allow_host = false;
+#  endif
+
+  /* Devices other than GPUs only need to be enumerated when one of the switches above
+   * is active. */
+  const auto queried_device_type = (allow_all_devices || allow_host) ?
+                                       sycl::info::device_type::all :
+                                       sycl::info::device_type::gpu;
+
+  std::vector<sycl::device> available_devices;
+  for (const sycl::platform &platform : sycl::platform::get_platforms()) {
+    /* Ignore OpenCL platforms to avoid using the same devices through both Level-Zero and
+     * OpenCL. */
+    const auto backend = platform.get_backend();
+    if (backend == sycl::backend::opencl) {
+      continue;
+    }
+
+    for (const sycl::device &device : platform.get_devices(queried_device_type)) {
+      if (allow_all_devices) {
+        /* Still filter out the host device if the build doesn't support it. */
+        if (allow_host || !device.is_host()) {
+          available_devices.push_back(device);
+        }
+        continue;
+      }
+
+      /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
+       * assuming they have either more than 96 Execution Units or not 7 threads per EU.
+       * Official support can be broadened to older and smaller GPUs once ready.
+       * Everything that is not a Level-Zero GPU is skipped. */
+      if (!device.is_gpu() || backend != sycl::backend::ext_oneapi_level_zero) {
+        continue;
+      }
+
+      /* Filtered-out defaults in case these values aren't available through a too old L0
+       * runtime. */
+      int number_of_eus = 96;
+      int threads_per_eu = 7;
+      if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+        number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+      }
+      if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+        threads_per_eu =
+            device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+      }
+      /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
+      if (number_of_eus <= 96 && threads_per_eu == 7) {
+        continue;
+      }
+
+      /* Finally reject devices whose driver is too old. Build numbers above 100000 are
+       * checked against the Windows minimum, and every number against the NEO minimum. */
+      const int driver_build_version = parse_driver_build_version(device);
+      const bool is_old_windows_driver = driver_build_version > 100000 &&
+                                         driver_build_version <
+                                             lowest_supported_driver_version_win;
+      const bool is_old_neo_driver = driver_build_version < lowest_supported_driver_version_neo;
+      if (is_old_windows_driver || is_old_neo_driver) {
+        continue;
+      }
+
+      /* The device passed every filter. */
+      available_devices.push_back(device);
+    }
+  }
+
+  return available_devices;
+}
+
+/* Build a human-readable listing of SYCL device attributes for every available device.
+ * The returned string is allocated with strdup(); the caller has to free() it. */
+char *oneapi_device_capabilities()
+{
+  std::stringstream capabilities;
+
+  const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
+  for (const sycl::device &device : oneapi_devices) {
+    const std::string &name = device.get_info<sycl::info::device::name>();
+
+    capabilities << std::string("\t") << name << "\n";
+/* `attribute_name` is stringified, so it has to be passed as a bare token, not a string. */
+#  define WRITE_ATTR(attribute_name, attribute_variable) \
+    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
+                 << "\n";
+#  define GET_NUM_ATTR(attribute) \
+    WRITE_ATTR(attribute, (size_t)device.get_info<sycl::info::device ::attribute>())
+
+    GET_NUM_ATTR(vendor_id)
+    GET_NUM_ATTR(max_compute_units)
+    GET_NUM_ATTR(max_work_item_dimensions)
+
+    sycl::id<3> max_work_item_sizes =
+        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
+    WRITE_ATTR(max_work_item_sizes_dim0, ((size_t)max_work_item_sizes.get(0)))
+    WRITE_ATTR(max_work_item_sizes_dim1, ((size_t)max_work_item_sizes.get(1)))
+    WRITE_ATTR(max_work_item_sizes_dim2, ((size_t)max_work_item_sizes.get(2)))
+
+    GET_NUM_ATTR(max_work_group_size)
+    GET_NUM_ATTR(max_num_sub_groups)
+    GET_NUM_ATTR(sub_group_independent_forward_progress)
+
+    GET_NUM_ATTR(preferred_vector_width_char)
+    GET_NUM_ATTR(preferred_vector_width_short)
+    GET_NUM_ATTR(preferred_vector_width_int)
+    GET_NUM_ATTR(preferred_vector_width_long)
+    GET_NUM_ATTR(preferred_vector_width_float)
+    GET_NUM_ATTR(preferred_vector_width_double)
+    GET_NUM_ATTR(preferred_vector_width_half)
+
+    GET_NUM_ATTR(native_vector_width_char)
+    GET_NUM_ATTR(native_vector_width_short)
+    GET_NUM_ATTR(native_vector_width_int)
+    GET_NUM_ATTR(native_vector_width_long)
+    GET_NUM_ATTR(native_vector_width_float)
+    GET_NUM_ATTR(native_vector_width_double)
+    GET_NUM_ATTR(native_vector_width_half)
+
+    size_t max_clock_frequency =
+        (size_t)(device.is_host() ? (size_t)0 :
+                                    device.get_info<sycl::info::device::max_clock_frequency>());
+    WRITE_ATTR(max_clock_frequency, max_clock_frequency)
+
+    GET_NUM_ATTR(address_bits)
+    GET_NUM_ATTR(max_mem_alloc_size)
+
+    /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
+     * supported so we always return false, even if device supports HW texture usage acceleration.
+     */
+    bool image_support = false;
+    WRITE_ATTR(image_support, (size_t)image_support)
+
+    GET_NUM_ATTR(max_parameter_size)
+    GET_NUM_ATTR(mem_base_addr_align)
+    GET_NUM_ATTR(global_mem_size)
+    GET_NUM_ATTR(local_mem_size)
+    GET_NUM_ATTR(error_correction_support)
+    GET_NUM_ATTR(profiling_timer_resolution)
+    GET_NUM_ATTR(is_available)
+
+#  undef GET_NUM_ATTR
+#  undef WRITE_ATTR
+    capabilities << "\n";
+  }
+
+  return ::strdup(capabilities.str().c_str());
+}
+
+/* Free a malloc()-family allocation (presumably e.g. the oneapi_device_capabilities() string). */
+void oneapi_free(void *p)
+{
+  /* free() is a no-op for null pointers, so no explicit guard is needed. */
+  ::free(p);
+}
+
+/* Invoke `cb` once per available device with an id string, the device name and its index. */
+void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
+{
+  const std::vector<sycl::device> devices = oneapi_available_devices();
+  for (size_t index = 0; index < devices.size(); index++) {
+    const sycl::device &device = devices[index];
+    const std::string name = device.get_info<sycl::info::device::name>();
+    /* The id combines platform name, device name and, when available, the PCI address. */
+    std::string id = "ONEAPI_" + device.get_platform().get_info<sycl::info::platform::name>() +
+                     "_" + name;
+    if (device.has(sycl::aspect::ext_intel_pci_address)) {
+      id += "_" + device.get_info<sycl::info::device::ext_intel_pci_address>();
+    }
+    cb(id.c_str(), name.c_str(), (int)index, user_ptr);
+  }
+}
+
+/* Global memory size (SYCL `global_mem_size`) of the device the queue is bound to. */
+size_t oneapi_get_memcapacity(SyclQueue *queue)
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+  return device.get_info<sycl::info::device::global_mem_size>();
+}
+
+/* Execution unit count of the queue's device, or 0 when the device cannot report it. */
+int oneapi_get_num_multiprocessors(SyclQueue *queue)
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+  if (!device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+    return 0;
+  }
+  return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+}
+
+/* EU SIMD width times hardware threads per EU of the queue's device, or 0 if unavailable. */
+int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+  if (!device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) ||
+      !device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+    return 0;
+  }
+  return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
+         device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+}
+
+#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
new file mode 100644
index 00000000000..c5f853742ed
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#ifdef WITH_ONEAPI
+
+#  include <stddef.h>
+
+/* NOTE(@nsirgien): Should match underlying type in the declaration inside "kernel/types.h"
+ * TODO: use kernel/types.h directly. */
+enum DeviceKernel : int;
+
+#  ifndef CYCLES_KERNEL_ONEAPI_EXPORT
+#    ifdef _WIN32
+#      if defined(ONEAPI_EXPORT)
+#        define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllexport)
+#      else
+#        define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllimport)
+#      endif
+#    else
+#      define CYCLES_KERNEL_ONEAPI_EXPORT
+#    endif
+#  endif
+
+class SyclQueue;
+
+typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
+                                             const char *name,
+                                             int num,
+                                             void *user_ptr);
+
+typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
+
+struct KernelContext {
+  /* Opaque handle to the queue associated with the selected device. */
+  SyclQueue *queue;
+  /* Pointer to USM device memory holding all global/constant allocations of this device. */
+  void *kernel_globals;
+};
+
+/* Use extern "C" linkage so that the symbols can easily be loaded from the dynamic library
+ * at runtime. */
+#  ifdef __cplusplus
+extern "C" {
+#  endif
+
+#  define DLL_INTERFACE_CALL(function, return_type, ...) \
+    CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
+#  include "kernel/device/oneapi/dll_interface_template.h"
+#  undef DLL_INTERFACE_CALL
+
+#  ifdef __cplusplus
+}
+#  endif
+
+#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h
new file mode 100644
index 00000000000..0ae925cf748
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+/* Some macro magic to generate templates for kernel arguments.
+ * The resulting oneapi_call() template allows calling a SYCL/C++ kernel
+ * with typed arguments by only giving it the `void **args` array provided by Cycles.
+ * The template will automatically cast each entry from `void *` to the expected type. */
+
+/* When expanded by the preprocessor, the generated templates will look like this example: */
+#if 0
+template<typename T0, typename T1, typename T2>
+void oneapi_call(
+    KernelGlobalsGPU *kg,
+    sycl::handler &cgh,
+    size_t global_size,
+    size_t local_size,
+    void **args,
+    void (*func)(KernelGlobalsGPU *, size_t, size_t, sycl::handler &, T0, T1, T2))
+{
+  func(kg, global_size, local_size, cgh, *(T0 *)(args[0]), *(T1 *)(args[1]), *(T2 *)(args[2]));
+}
+#endif
+
+/* clang-format off */
+#define ONEAPI_TYP(x) typename T##x
+#define ONEAPI_CAST(x) *(T##x *)(args[x])
+#define ONEAPI_T(x) T##x
+
+#define ONEAPI_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, N,  ...) N
+#define ONEAPI_0(_call, ...)
+#define ONEAPI_1(_call, x) _call(x)
+#define ONEAPI_2(_call, x, ...) _call(x), ONEAPI_1(_call, __VA_ARGS__)
+#define ONEAPI_3(_call, x, ...) _call(x), ONEAPI_2(_call, __VA_ARGS__)
+#define ONEAPI_4(_call, x, ...) _call(x), ONEAPI_3(_call, __VA_ARGS__)
+#define ONEAPI_5(_call, x, ...) _call(x), ONEAPI_4(_call, __VA_ARGS__)
+#define ONEAPI_6(_call, x, ...) _call(x), ONEAPI_5(_call, __VA_ARGS__)
+#define ONEAPI_7(_call, x, ...) _call(x), ONEAPI_6(_call, __VA_ARGS__)
+#define ONEAPI_8(_call, x, ...) _call(x), ONEAPI_7(_call, __VA_ARGS__)
+#define ONEAPI_9(_call, x, ...) _call(x), ONEAPI_8(_call, __VA_ARGS__)
+#define ONEAPI_10(_call, x, ...) _call(x), ONEAPI_9(_call, __VA_ARGS__)
+#define ONEAPI_11(_call, x, ...) _call(x), ONEAPI_10(_call, __VA_ARGS__)
+#define ONEAPI_12(_call, x, ...) _call(x), ONEAPI_11(_call, __VA_ARGS__)
+#define ONEAPI_13(_call, x, ...) _call(x), ONEAPI_12(_call, __VA_ARGS__)
+#define ONEAPI_14(_call, x, ...) _call(x), ONEAPI_13(_call, __VA_ARGS__)
+#define ONEAPI_15(_call, x, ...) _call(x), ONEAPI_14(_call, __VA_ARGS__)
+#define ONEAPI_16(_call, x, ...) _call(x), ONEAPI_15(_call, __VA_ARGS__)
+#define ONEAPI_17(_call, x, ...) _call(x), ONEAPI_16(_call, __VA_ARGS__)
+#define ONEAPI_18(_call, x, ...) _call(x), ONEAPI_17(_call, __VA_ARGS__)
+#define ONEAPI_19(_call, x, ...) _call(x), ONEAPI_18(_call, __VA_ARGS__)
+#define ONEAPI_20(_call, x, ...) _call(x), ONEAPI_19(_call, __VA_ARGS__)
+#define ONEAPI_21(_call, x, ...) _call(x), ONEAPI_20(_call, __VA_ARGS__)
+
+#define ONEAPI_CALL_FOR(x, ...) \
+  ONEAPI_GET_NTH_ARG("ignored", \
+                     ##__VA_ARGS__, \
+                     ONEAPI_21, \
+                     ONEAPI_20, \
+                     ONEAPI_19, \
+                     ONEAPI_18, \
+                     ONEAPI_17, \
+                     ONEAPI_16, \
+                     ONEAPI_15, \
+                     ONEAPI_14, \
+                     ONEAPI_13, \
+                     ONEAPI_12, \
+                     ONEAPI_11, \
+                     ONEAPI_10, \
+                     ONEAPI_9, \
+                     ONEAPI_8, \
+                     ONEAPI_7, \
+                     ONEAPI_6, \
+                     ONEAPI_5, \
+                     ONEAPI_4, \
+                     ONEAPI_3, \
+                     ONEAPI_2, \
+                     ONEAPI_1, \
+                     ONEAPI_0) \
+  (x, ##__VA_ARGS__)
+
+/* This template automatically casts entries in the void **args array to the types requested by the kernel func.
+ * Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
+#define oneapi_template(...) \
+  template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \
+  void oneapi_call( \
+      KernelGlobalsGPU *kg, \
+      sycl::handler &cgh, \
+      size_t global_size, \
+      size_t local_size, \
+      void **args, \
+      void (*func)(KernelGlobalsGPU*, size_t, size_t, sycl::handler &, ONEAPI_CALL_FOR(ONEAPI_T, __VA_ARGS__))) \
+  { \
+        func(kg, \
+             global_size, \
+             local_size, \
+             cgh, \
+             ONEAPI_CALL_FOR(ONEAPI_CAST, __VA_ARGS__)); \
+  }
+
+oneapi_template(0)
+oneapi_template(0, 1)
+oneapi_template(0, 1, 2)
+oneapi_template(0, 1, 2, 3)
+oneapi_template(0, 1, 2, 3, 4)
+oneapi_template(0, 1, 2, 3, 4, 5)
+oneapi_template(0, 1, 2, 3, 4, 5, 6)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
+
+    /* clang-format on */
diff --git a/intern/cycles/kernel/device/optix/bvh.h b/intern/cycles/kernel/device/optix/bvh.h
new file mode 100644
index 00000000000..fb9907709ce
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/bvh.h
@@ -0,0 +1,659 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* OptiX implementation of ray-scene intersection. */
+
+#pragma once
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+
+#define OPTIX_DEFINE_ABI_VERSION_ONLY
+#include <optix_function_table.h>
+
+CCL_NAMESPACE_BEGIN
+
+/* Utilities. */
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
+}
+template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
+}
+/* Payload registers 6 and 7 carry the low and high 32 bits of the pointer respectively. */
+template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_6(), optixGetPayload_7());
+}
+
+ccl_device_forceinline int get_object_id()
+{
+#ifdef __OBJECT_MOTION__
+  /* Always take the instance ID from transform list entry 0, i.e. from the TLAS.
+   * There might be a motion transform node between TLAS and BLAS which does not have one. */
+  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
+#else
+  return optixGetInstanceId();
+#endif
+}
+
+/* Hit/miss functions. */
+
+extern "C" __global__ void __miss__kernel_optix_miss()
+{
+  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
+  optixSetPayload_5(PRIMITIVE_NONE); /* No primitive was hit. */
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_local_hit()
+{
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves and points. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+#ifdef __BVH_LOCAL__
+  const int object = get_object_id();
+  if (object != optixGetPayload_4() /* local_object */) {
+    /* Only intersect with matching object. */
+    return optixIgnoreIntersection();
+  }
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_local(ray->self, prim)) {
+    return optixIgnoreIntersection();
+  }
+
+  const uint max_hits = optixGetPayload_5();
+  if (max_hits == 0) {
+    /* Special case: no hit information is requested, just report that something was hit. */
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  int hit = 0;
+  uint *const lcg_state = get_payload_ptr_0<uint>();
+  LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
+
+  if (lcg_state) {
+    for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+      if (optixGetRayTmax() == local_isect->hits[i].t) {
+        return optixIgnoreIntersection(); /* A hit at this distance is already recorded. */
+      }
+    }
+
+    hit = local_isect->num_hits++;
+
+    if (local_isect->num_hits > max_hits) {
+      hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
+      if (hit >= max_hits) {
+        return optixIgnoreIntersection(); /* Random slot is out of range, drop this hit. */
+      }
+    }
+  }
+  else {
+    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
+      /* Record closest intersection only.
+       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
+       */
+      return optixIgnoreIntersection();
+    }
+
+    local_isect->num_hits = 1;
+  }
+
+  Intersection *isect = &local_isect->hits[hit];
+  isect->t = optixGetRayTmax();
+  isect->prim = prim;
+  isect->object = get_object_id();
+  isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
+
+  const float2 barycentrics = optixGetTriangleBarycentrics();
+  isect->u = barycentrics.x;
+  isect->v = barycentrics.y;
+
+  /* Record geometric normal. */
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0);
+  const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1);
+  const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+
+  /* Continue tracing (without this the trace call would return after the first hit). */
+  optixIgnoreIntersection();
+#endif
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
+{
+#ifdef __SHADOW_RECORD_ALL__
+  int prim = optixGetPrimitiveIndex();
+  const uint object = get_object_id();
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#  endif
+
+  float u = 0.0f, v = 0.0f;
+  int type = 0;
+  if (optixIsTriangleHit()) {
+    /* Triangle. */
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    u = barycentrics.x;
+    v = barycentrics.y;
+    type = kernel_data_fetch(objects, object).primitive_type;
+  }
+#  ifdef __HAIR__
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    /* Curve. */
+    u = __uint_as_float(optixGetAttribute_0());
+    v = __uint_as_float(optixGetAttribute_1());
+
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
+    type = segment.type;
+    prim = segment.prim;
+
+#    if OPTIX_ABI_VERSION < 55
+    /* Filter out curve end-caps. */
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+#    endif
+  }
+#  endif
+  else {
+    /* Point. */
+    type = kernel_data_fetch(objects, object).primitive_type;
+    u = 0.0f;
+    v = 0.0f;
+  }
+
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_shadow(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
+
+#  ifndef __TRANSPARENT_SHADOWS__
+  /* No transparent shadows support compiled in, make opaque. */
+  optixSetPayload_5(true);
+  return optixTerminateRay();
+#  else
+  const uint max_hits = optixGetPayload_3();
+  const uint num_hits_packed = optixGetPayload_2();
+  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
+  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
+
+  /* Hit limit reached or no transparent shadow: all light is blocked, stop immediately. */
+  if (num_hits >= max_hits ||
+      !(intersection_get_shader_flags(nullptr, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  /* Always use baked shadow transparency for curves. */
+  if (type & PRIMITIVE_CURVE) {
+    float throughput = __uint_as_float(optixGetPayload_1());
+    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+    optixSetPayload_1(__float_as_uint(throughput));
+    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
+
+    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+      optixSetPayload_5(true);
+      return optixTerminateRay();
+    }
+    else {
+      /* Continue tracing. */
+      optixIgnoreIntersection();
+      return;
+    }
+  }
+
+  /* Record transparent intersection. */
+  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
+
+  uint record_index = num_recorded_hits;
+
+  const IntegratorShadowState state = optixGetPayload_0();
+
+  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+  if (record_index >= max_record_hits) {
+    /* If maximum number of hits reached, find a hit to replace. */
+    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
+    uint max_recorded_hit = 0;
+
+    for (uint i = 1; i < max_record_hits; i++) {
+      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+      if (isect_t > max_recorded_t) {
+        max_recorded_t = isect_t;
+        max_recorded_hit = i;
+      }
+    }
+
+    if (optixGetRayTmax() >= max_recorded_t) {
+      /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the
+       * current hit anymore. */
+      return;
+    }
+
+    record_index = max_recorded_hit;
+  }
+
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
+
+  /* Continue tracing. */
+  optixIgnoreIntersection();
+#  endif /* __TRANSPARENT_SHADOWS__ */
+#endif   /* __SHADOW_RECORD_ALL__ */
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_volume_test()
+{
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves and points. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  const uint object = get_object_id();
+#ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  if ((kernel_data_fetch(object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
+    return optixIgnoreIntersection(); /* Only objects flagged as having a volume are kept. */
+  }
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
+#ifdef __HAIR__
+#  if OPTIX_ABI_VERSION < 55
+  if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
+    /* Filter out curve end-caps. */
+    const float u = __uint_as_float(optixGetAttribute_0());
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
+#  endif
+#endif
+
+  const uint object = get_object_id();
+  const uint visibility = optixGetPayload_4();
+#ifdef __VISIBILITY_FLAG__
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  int prim = optixGetPrimitiveIndex();
+  if (optixIsTriangleHit()) {
+    /* Triangle: the primitive index is used as is. */
+  }
+#ifdef __HAIR__
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    /* Curve: replace the segment index by the primitive index stored in the segment. */
+    prim = kernel_data_fetch(curve_segments, prim).prim;
+  }
+#endif
+
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+
+  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    if (intersection_skip_self_shadow(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
+    else {
+      /* Shadow ray early termination. */
+      return optixTerminateRay();
+    }
+  }
+  else {
+    if (intersection_skip_self(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
+  }
+}
+
+extern "C" __global__ void __closesthit__kernel_optix_hit()
+{
+  const int object = get_object_id();
+  const int prim = optixGetPrimitiveIndex();
+
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
+  optixSetPayload_4(object);                             /* Object ID */
+
+  if (optixIsTriangleHit()) {
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    optixSetPayload_1(__float_as_uint(barycentrics.x));
+    optixSetPayload_2(__float_as_uint(barycentrics.y));
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
+  }
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
+    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
+    optixSetPayload_2(optixGetAttribute_1());
+    optixSetPayload_3(segment.prim);
+    optixSetPayload_5(segment.type);
+  }
+  else {
+    optixSetPayload_1(0); /* Point: payloads 1 and 2 are reported as zero. */
+    optixSetPayload_2(0);
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
+  }
+}
+
+/* Custom primitive intersection functions. */
+
+#ifdef __HAIR__
+ccl_device_inline void optix_intersection_curve(const int prim, const int type)
+{
+  const int object = get_object_id();
+
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  const float3 ray_P = optixGetObjectRayOrigin();
+  const float3 ray_D = optixGetObjectRayDirection();
+  const float ray_tmin = optixGetRayTmin();
+
+#  ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+#  else
+  const float time = 0.0f;
+#  endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+
+  if (curve_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+    optixReportIntersection(isect.t,
+                            type & PRIMITIVE_ALL,     /* Hit kind */
+                            __float_as_int(isect.u),  /* Attribute_0 */
+                            __float_as_int(isect.v)); /* Attribute_1 */
+  }
+}
+
+extern "C" __global__ void __intersection__curve_ribbon()
+{
+  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, optixGetPrimitiveIndex());
+  const int prim = segment.prim;
+  const int type = segment.type;
+  if (type & PRIMITIVE_CURVE_RIBBON) { /* Segments of any other type report no hit here. */
+    optix_intersection_curve(prim, type);
+  }
+}
+
+#endif
+
+#ifdef __POINTCLOUD__
+extern "C" __global__ void __intersection__point()
+{
+  const int prim = optixGetPrimitiveIndex();
+  const int object = get_object_id();
+  const int type = kernel_data_fetch(objects, object).primitive_type;
+
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  const float3 ray_P = optixGetObjectRayOrigin();
+  const float3 ray_D = optixGetObjectRayDirection();
+  const float ray_tmin = optixGetRayTmin();
+
+#  ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+#  else
+  const float time = 0.0f;
+#  endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+
+  if (point_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+    optixReportIntersection(isect.t, type & PRIMITIVE_ALL); /* Second argument: hit kind. */
+  }
+}
+#endif
+
+/* Scene intersection. */
+
+ccl_device_intersect bool scene_intersect(KernelGlobals kg,
+                                          ccl_private const Ray *ray,
+                                          const uint visibility,
+                                          ccl_private Intersection *isect)
+{
+  uint p0 = 0;
+  uint p1 = 0;
+  uint p2 = 0;
+  uint p3 = 0;
+  uint p4 = visibility;
+  uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;         /* Low 32 bits of the ray pointer. */
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF; /* High 32 bits of the ray pointer. */
+  /* Use the low 8 visibility bits as ray mask, or the full mask if only higher bits are set. */
+  uint ray_mask = visibility & 0xFF;
+  uint ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             ray_flags,
+             0, /* SBT offset for PG_HITD */
+             0, /* SBT stride */
+             0, /* Miss SBT index */
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  isect->t = __uint_as_float(p0);
+  isect->u = __uint_as_float(p1);
+  isect->v = __uint_as_float(p2);
+  isect->prim = p3;
+  isect->object = p4;
+  isect->type = p5;
+
+  return p5 != PRIMITIVE_NONE;
+}
+
+#ifdef __BVH_LOCAL__
+ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
+                                                ccl_private const Ray *ray,
+                                                ccl_private LocalIntersection *local_isect,
+                                                int local_object,
+                                                ccl_private uint *lcg_state,
+                                                int max_hits)
+{
+  uint p0 = pointer_pack_to_uint_0(lcg_state);
+  uint p1 = pointer_pack_to_uint_1(lcg_state);
+  uint p2 = pointer_pack_to_uint_0(local_isect);
+  uint p3 = pointer_pack_to_uint_1(local_isect);
+  uint p4 = local_object;
+  uint p6 = pointer_pack_to_uint_0(ray);
+  uint p7 = pointer_pack_to_uint_1(ray);
+
+  /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
+  uint p5 = max_hits;
+
+  if (local_isect) {
+    local_isect->num_hits = 0; /* Initialize hit count to zero. */
+  }
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             0xFF,
+             /* Need to always call into __anyhit__kernel_optix_local_hit. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             2, /* SBT offset for PG_HITL */
+             0, /* SBT stride */
+             0, /* Miss SBT index */
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  return p5;
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
+                                                     ccl_private const Ray *ray,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     ccl_private uint *num_recorded_hits,
+                                                     ccl_private float *throughput)
+{
+  uint p0 = state;
+  uint p1 = __float_as_uint(1.0f); /* Throughput. */
+  uint p2 = 0;                     /* Packed number of recorded hits and total hits. */
+  uint p3 = max_hits;
+  uint p4 = visibility;
+  uint p5 = false;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+  /* Use the low 8 visibility bits as ray mask, or the full mask if only higher bits are set. */
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             1, /* SBT offset for PG_HITS */
+             0, /* SBT stride */
+             0, /* Miss SBT index */
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  *num_recorded_hits = uint16_unpack_from_uint_0(p2);
+  *throughput = __uint_as_float(p1);
+
+  return p5;
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint visibility)
+{
+  uint p0 = 0;
+  uint p1 = 0;
+  uint p2 = 0;
+  uint p3 = 0;
+  uint p4 = visibility;
+  uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+  /* Use the low 8 visibility bits as ray mask, or the full mask if only higher bits are set. */
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_volume_test. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             3, /* SBT offset for PG_HITV */
+             0, /* SBT stride */
+             0, /* Miss SBT index */
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  isect->t = __uint_as_float(p0);
+  isect->u = __uint_as_float(p1);
+  isect->v = __uint_as_float(p2);
+  isect->prim = p3;
+  isect->object = p4;
+  isect->type = p5;
+
+  return p5 != PRIMITIVE_NONE;
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index aa4a6321a8b..1a11a533b7e 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -8,7 +8,6 @@
 #include <optix.h>
 
 #define __KERNEL_GPU__
-#define __KERNEL_GPU_RAYTRACING__
 #define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
 #define __KERNEL_OPTIX__
 #define CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
index bb752c531f0..7af2e421378 100644
--- a/intern/cycles/kernel/device/optix/globals.h
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -28,21 +28,21 @@ struct KernelParamsOptiX {
 
   /* Global scene data and textures */
   KernelData data;
-#define KERNEL_TEX(type, name) const type *name;
-#include "kernel/textures.h"
+#define KERNEL_DATA_ARRAY(type, name) const type *name;
+#include "kernel/data_arrays.h"
 
   /* Integrator state */
-  IntegratorStateGPU __integrator_state;
+  IntegratorStateGPU integrator_state;
 };
 
 #ifdef __NVCC__
-extern "C" static __constant__ KernelParamsOptiX __params;
+extern "C" static __constant__ KernelParamsOptiX kernel_params;
 #endif
 
 /* Abstraction macros */
-#define kernel_data __params.data
-#define kernel_tex_array(t) __params.t
-#define kernel_tex_fetch(t, index) __params.t[(index)]
-#define kernel_integrator_state __params.__integrator_state
+#define kernel_data kernel_params.data
+#define kernel_data_array(name) kernel_params.name
+#define kernel_data_fetch(name, index) kernel_params.name[(index)]
+#define kernel_integrator_state kernel_params.integrator_state
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 9843b2e99be..6abb5aeacb9 100644
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -20,469 +20,39 @@
 #include "kernel/integrator/intersect_volume_stack.h"
 // clang-format on
 
-#define OPTIX_DEFINE_ABI_VERSION_ONLY
-#include <optix_function_table.h>
-
-template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
-{
-  return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
-}
-template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
-{
-  return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
-}
-
-template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
-{
-  return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6());
-}
-
-ccl_device_forceinline int get_object_id()
-{
-#ifdef __OBJECT_MOTION__
-  /* Always get the instance ID from the TLAS
-   * There might be a motion transform node between TLAS and BLAS which does not have one. */
-  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
-#else
-  return optixGetInstanceId();
-#endif
-}
-
 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
 {
   const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
-                                                       global_index;
-  integrator_intersect_closest(nullptr, path_index, __params.render_buffer);
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_intersect_closest(nullptr, path_index, kernel_params.render_buffer);
 }
 
 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
 {
   const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
-                                                       global_index;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
   integrator_intersect_shadow(nullptr, path_index);
 }
 
 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
 {
   const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
-                                                       global_index;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
   integrator_intersect_subsurface(nullptr, path_index);
 }
 
 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
 {
   const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
-                                                       global_index;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
   integrator_intersect_volume_stack(nullptr, path_index);
 }
 
-extern "C" __global__ void __miss__kernel_optix_miss()
-{
-  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
-  optixSetPayload_5(PRIMITIVE_NONE);
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_local_hit()
-{
-#if defined(__HAIR__) || defined(__POINTCLOUD__)
-  if (!optixIsTriangleHit()) {
-    /* Ignore curves and points. */
-    return optixIgnoreIntersection();
-  }
-#endif
-
-#ifdef __BVH_LOCAL__
-  const int object = get_object_id();
-  if (object != optixGetPayload_4() /* local_object */) {
-    /* Only intersect with matching object. */
-    return optixIgnoreIntersection();
-  }
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self_local(ray->self, prim)) {
-    return optixIgnoreIntersection();
-  }
-
-  const uint max_hits = optixGetPayload_5();
-  if (max_hits == 0) {
-    /* Special case for when no hit information is requested, just report that something was hit */
-    optixSetPayload_5(true);
-    return optixTerminateRay();
-  }
-
-  int hit = 0;
-  uint *const lcg_state = get_payload_ptr_0<uint>();
-  LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
-
-  if (lcg_state) {
-    for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
-      if (optixGetRayTmax() == local_isect->hits[i].t) {
-        return optixIgnoreIntersection();
-      }
-    }
-
-    hit = local_isect->num_hits++;
-
-    if (local_isect->num_hits > max_hits) {
-      hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
-      if (hit >= max_hits) {
-        return optixIgnoreIntersection();
-      }
-    }
-  }
-  else {
-    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      /* Record closest intersection only.
-       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
-       */
-      return optixIgnoreIntersection();
-    }
-
-    local_isect->num_hits = 1;
-  }
-
-  Intersection *isect = &local_isect->hits[hit];
-  isect->t = optixGetRayTmax();
-  isect->prim = prim;
-  isect->object = get_object_id();
-  isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type;
-
-  const float2 barycentrics = optixGetTriangleBarycentrics();
-  isect->u = 1.0f - barycentrics.y - barycentrics.x;
-  isect->v = barycentrics.x;
-
-  /* Record geometric normal. */
-  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0);
-  const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1);
-  const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
-
-  /* Continue tracing (without this the trace call would return after the first hit). */
-  optixIgnoreIntersection();
-#endif
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
-{
-#ifdef __SHADOW_RECORD_ALL__
-  int prim = optixGetPrimitiveIndex();
-  const uint object = get_object_id();
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#  endif
-
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self_shadow(ray->self, object, prim)) {
-    return optixIgnoreIntersection();
-  }
-
-  float u = 0.0f, v = 0.0f;
-  int type = 0;
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    u = 1.0f - barycentrics.y - barycentrics.x;
-    v = barycentrics.x;
-    type = kernel_tex_fetch(__objects, object).primitive_type;
-  }
-#  ifdef __HAIR__
-  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
-    u = __uint_as_float(optixGetAttribute_0());
-    v = __uint_as_float(optixGetAttribute_1());
-
-    const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
-    type = segment.type;
-    prim = segment.prim;
-
-#    if OPTIX_ABI_VERSION < 55
-    /* Filter out curve endcaps. */
-    if (u == 0.0f || u == 1.0f) {
-      return optixIgnoreIntersection();
-    }
-#    endif
-  }
-#  endif
-  else {
-    type = kernel_tex_fetch(__objects, object).primitive_type;
-    u = 0.0f;
-    v = 0.0f;
-  }
-
-#  ifndef __TRANSPARENT_SHADOWS__
-  /* No transparent shadows support compiled in, make opaque. */
-  optixSetPayload_5(true);
-  return optixTerminateRay();
-#  else
-  const uint max_hits = optixGetPayload_3();
-  const uint num_hits_packed = optixGetPayload_2();
-  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
-  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
-
-  /* If no transparent shadows, all light is blocked and we can stop immediately. */
-  if (num_hits >= max_hits ||
-      !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
-    optixSetPayload_5(true);
-    return optixTerminateRay();
-  }
-
-  /* Always use baked shadow transparency for curves. */
-  if (type & PRIMITIVE_CURVE) {
-    float throughput = __uint_as_float(optixGetPayload_1());
-    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
-    optixSetPayload_1(__float_as_uint(throughput));
-    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
-
-    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
-      optixSetPayload_5(true);
-      return optixTerminateRay();
-    }
-    else {
-      /* Continue tracing. */
-      optixIgnoreIntersection();
-      return;
-    }
-  }
-
-  /* Record transparent intersection. */
-  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
-
-  uint record_index = num_recorded_hits;
-
-  const IntegratorShadowState state = optixGetPayload_0();
-
-  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
-  if (record_index >= max_record_hits) {
-    /* If maximum number of hits reached, find a hit to replace. */
-    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
-    uint max_recorded_hit = 0;
-
-    for (int i = 1; i < max_record_hits; i++) {
-      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
-      if (isect_t > max_recorded_t) {
-        max_recorded_t = isect_t;
-        max_recorded_hit = i;
-      }
-    }
-
-    if (optixGetRayTmax() >= max_recorded_t) {
-      /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the
-       * current hit anymore. */
-      return;
-    }
-
-    record_index = max_recorded_hit;
-  }
-
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
-
-  /* Continue tracing. */
-  optixIgnoreIntersection();
-#  endif /* __TRANSPARENT_SHADOWS__ */
-#endif   /* __SHADOW_RECORD_ALL__ */
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_volume_test()
-{
-#if defined(__HAIR__) || defined(__POINTCLOUD__)
-  if (!optixIsTriangleHit()) {
-    /* Ignore curves. */
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  const uint object = get_object_id();
-#ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
-    return optixIgnoreIntersection();
-  }
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self(ray->self, object, prim)) {
-    return optixIgnoreIntersection();
-  }
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
-{
-#ifdef __HAIR__
-#  if OPTIX_ABI_VERSION < 55
-  if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
-    /* Filter out curve endcaps. */
-    const float u = __uint_as_float(optixGetAttribute_0());
-    if (u == 0.0f || u == 1.0f) {
-      return optixIgnoreIntersection();
-    }
-  }
-#  endif
-#endif
-
-  const uint object = get_object_id();
-  const uint visibility = optixGetPayload_4();
-#ifdef __VISIBILITY_FLAG__
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-
-  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    if (intersection_skip_self_shadow(ray->self, object, prim)) {
-      return optixIgnoreIntersection();
-    }
-    else {
-      /* Shadow ray early termination. */
-      return optixTerminateRay();
-    }
-  }
-  else {
-    if (intersection_skip_self(ray->self, object, prim)) {
-      return optixIgnoreIntersection();
-    }
-  }
-}
-
-extern "C" __global__ void __closesthit__kernel_optix_hit()
-{
-  const int object = get_object_id();
-  const int prim = optixGetPrimitiveIndex();
-
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
-  optixSetPayload_4(object);
-
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
-    optixSetPayload_2(__float_as_uint(barycentrics.x));
-    optixSetPayload_3(prim);
-    optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
-  }
-  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
-    const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim);
-    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
-    optixSetPayload_2(optixGetAttribute_1());
-    optixSetPayload_3(segment.prim);
-    optixSetPayload_5(segment.type);
-  }
-  else {
-    optixSetPayload_1(0);
-    optixSetPayload_2(0);
-    optixSetPayload_3(prim);
-    optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type);
-  }
-}
-
-#ifdef __HAIR__
-ccl_device_inline void optix_intersection_curve(const int prim, const int type)
-{
-  const int object = get_object_id();
-
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
-    return;
-  }
-#  endif
-
-  float3 P = optixGetObjectRayOrigin();
-  float3 dir = optixGetObjectRayDirection();
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
-#  ifdef __OBJECT_MOTION__
-  const float time = optixGetRayTime();
-#  else
-  const float time = 0.0f;
-#  endif
-
-  Intersection isect;
-  isect.t = optixGetRayTmax();
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
-
-  if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
-    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
-    optixReportIntersection(isect.t / len,
-                            type & PRIMITIVE_ALL,
-                            __float_as_int(isect.u),  /* Attribute_0 */
-                            __float_as_int(isect.v)); /* Attribute_1 */
-  }
-}
-
-extern "C" __global__ void __intersection__curve_ribbon()
-{
-  const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex());
-  const int prim = segment.prim;
-  const int type = segment.type;
-  if (type & PRIMITIVE_CURVE_RIBBON) {
-    optix_intersection_curve(prim, type);
-  }
-}
-
-#endif
-
-#ifdef __POINTCLOUD__
-extern "C" __global__ void __intersection__point()
-{
-  const int prim = optixGetPrimitiveIndex();
-  const int object = get_object_id();
-  const int type = kernel_tex_fetch(__objects, object).primitive_type;
-
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) {
-    return;
-  }
-#  endif
-
-  float3 P = optixGetObjectRayOrigin();
-  float3 dir = optixGetObjectRayDirection();
-
-  /* The direction is not normalized by default, the point intersection routine expects that. */
-  float len;
-  dir = normalize_len(dir, &len);
-
-#  ifdef __OBJECT_MOTION__
-  const float time = optixGetRayTime();
-#  else
-  const float time = 0.0f;
-#  endif
-
-  Intersection isect;
-  isect.t = optixGetRayTmax();
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX) {
-    isect.t *= len;
-  }
-
-  if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
-    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
-    optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL);
-  }
-}
-#endif
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
index 3bd57bc0f1a..41e6224f6da 100644
--- a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -11,15 +11,17 @@
 extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
 {
   const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
-                                                       global_index;
-  integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_surface_raytrace(nullptr, path_index, kernel_params.render_buffer);
 }
 
 extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_mnee()
 {
   const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
-                                                       global_index;
-  integrator_shade_surface_mnee(nullptr, path_index, __params.render_buffer);
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_surface_mnee(nullptr, path_index, kernel_params.render_buffer);
 }
diff --git a/intern/cycles/kernel/film/adaptive_sampling.h b/intern/cycles/kernel/film/adaptive_sampling.h
index 16867c39d99..d28c87747c3 100644
--- a/intern/cycles/kernel/film/adaptive_sampling.h
+++ b/intern/cycles/kernel/film/adaptive_sampling.h
@@ -3,15 +3,15 @@
 
 #pragma once
 
-#include "kernel/film/write_passes.h"
+#include "kernel/film/write.h"
 
 CCL_NAMESPACE_BEGIN
 
 /* Check whether the pixel has converged and should not be sampled anymore. */
 
-ccl_device_forceinline bool kernel_need_sample_pixel(KernelGlobals kg,
-                                                     ConstIntegratorState state,
-                                                     ccl_global float *render_buffer)
+ccl_device_forceinline bool film_need_sample_pixel(KernelGlobals kg,
+                                                   ConstIntegratorState state,
+                                                   ccl_global float *render_buffer)
 {
   if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
     return true;
@@ -28,14 +28,14 @@ ccl_device_forceinline bool kernel_need_sample_pixel(KernelGlobals kg,
 
 /* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
 
-ccl_device bool kernel_adaptive_sampling_convergence_check(KernelGlobals kg,
-                                                           ccl_global float *render_buffer,
-                                                           int x,
-                                                           int y,
-                                                           float threshold,
-                                                           bool reset,
-                                                           int offset,
-                                                           int stride)
+ccl_device bool film_adaptive_sampling_convergence_check(KernelGlobals kg,
+                                                         ccl_global float *render_buffer,
+                                                         int x,
+                                                         int y,
+                                                         float threshold,
+                                                         bool reset,
+                                                         int offset,
+                                                         int stride)
 {
   kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
   kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
@@ -78,13 +78,13 @@ ccl_device bool kernel_adaptive_sampling_convergence_check(KernelGlobals kg,
 /* This is a simple box filter in two passes.
  * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
 
-ccl_device void kernel_adaptive_sampling_filter_x(KernelGlobals kg,
-                                                  ccl_global float *render_buffer,
-                                                  int y,
-                                                  int start_x,
-                                                  int width,
-                                                  int offset,
-                                                  int stride)
+ccl_device void film_adaptive_sampling_filter_x(KernelGlobals kg,
+                                                ccl_global float *render_buffer,
+                                                int y,
+                                                int start_x,
+                                                int width,
+                                                int offset,
+                                                int stride)
 {
   kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
 
@@ -111,13 +111,13 @@ ccl_device void kernel_adaptive_sampling_filter_x(KernelGlobals kg,
   }
 }
 
-ccl_device void kernel_adaptive_sampling_filter_y(KernelGlobals kg,
-                                                  ccl_global float *render_buffer,
-                                                  int x,
-                                                  int start_y,
-                                                  int height,
-                                                  int offset,
-                                                  int stride)
+ccl_device void film_adaptive_sampling_filter_y(KernelGlobals kg,
+                                                ccl_global float *render_buffer,
+                                                int x,
+                                                int start_y,
+                                                int height,
+                                                int offset,
+                                                int stride)
 {
   kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
 
diff --git a/intern/cycles/kernel/film/aov_passes.h b/intern/cycles/kernel/film/aov_passes.h
new file mode 100644
index 00000000000..3fbb250340f
--- /dev/null
+++ b/intern/cycles/kernel/film/aov_passes.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/film/write.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Write the scalar AOV `value` into the render buffer.
+ * The target is the buffer returned by film_pass_pixel_render_buffer() for `state`, at float
+ * offset `pass_aov_value + aov_id`.
+ * NOTE(review): `aov_id` is applied as a raw float offset; presumably pre-scaled by callers. */
+ccl_device_inline void film_write_aov_pass_value(KernelGlobals kg,
+                                                 ConstIntegratorState state,
+                                                 ccl_global float *ccl_restrict render_buffer,
+                                                 const int aov_id,
+                                                 const float value)
+{
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+  film_write_pass_float(buffer + kernel_data.film.pass_aov_value + aov_id, value);
+}
+
+/* Write the AOV `color` into the render buffer as a float4 whose fourth component is 1.0f.
+ * The target is the buffer returned by film_pass_pixel_render_buffer() for `state`, at float
+ * offset `pass_aov_color + aov_id`.
+ * NOTE(review): `aov_id` is applied as a raw float offset; presumably pre-scaled by callers. */
+ccl_device_inline void film_write_aov_pass_color(KernelGlobals kg,
+                                                 ConstIntegratorState state,
+                                                 ccl_global float *ccl_restrict render_buffer,
+                                                 const int aov_id,
+                                                 const float3 color)
+{
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+  film_write_pass_float4(buffer + kernel_data.film.pass_aov_color + aov_id,
+                         make_float4(color.x, color.y, color.z, 1.0f));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/id_passes.h b/intern/cycles/kernel/film/cryptomatte_passes.h
index c8317512bb2..4765777e7e2 100644
--- a/intern/cycles/kernel/film/id_passes.h
+++ b/intern/cycles/kernel/film/cryptomatte_passes.h
@@ -8,15 +8,15 @@ CCL_NAMESPACE_BEGIN
 /* Element of ID pass stored in the render buffers.
  * It is `float2` semantically, but it must be unaligned since the offset of ID passes in the
  * render buffers might not meet expected by compiler alignment. */
-typedef struct IDPassBufferElement {
+typedef struct CryptoPassBufferElement {
   float x;
   float y;
-} IDPassBufferElement;
+} CryptoPassBufferElement;
 
-ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
-                                             int num_slots,
-                                             float id,
-                                             float weight)
+ccl_device_inline void film_write_cryptomatte_slots(ccl_global float *buffer,
+                                                    int num_slots,
+                                                    float id,
+                                                    float weight)
 {
   kernel_assert(id != ID_NONE);
   if (weight == 0.0f) {
@@ -24,7 +24,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
   }
 
   for (int slot = 0; slot < num_slots; slot++) {
-    ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
+    ccl_global CryptoPassBufferElement *id_buffer = (ccl_global CryptoPassBufferElement *)buffer;
 #ifdef __ATOMIC_PASS_WRITE__
     /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
     if (id_buffer[slot].x == ID_NONE) {
@@ -60,9 +60,9 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
   }
 }
 
-ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
+ccl_device_inline void film_sort_cryptomatte_slots(ccl_global float *buffer, int num_slots)
 {
-  ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
+  ccl_global CryptoPassBufferElement *id_buffer = (ccl_global CryptoPassBufferElement *)buffer;
   for (int slot = 1; slot < num_slots; ++slot) {
     if (id_buffer[slot].x == ID_NONE) {
       return;
@@ -70,7 +70,7 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
     /* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
     int i = slot;
     while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
-      const IDPassBufferElement swap = id_buffer[i];
+      const CryptoPassBufferElement swap = id_buffer[i];
       id_buffer[i] = id_buffer[i - 1];
       id_buffer[i - 1] = swap;
       --i;
@@ -79,15 +79,15 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
 }
 
 /* post-sorting for Cryptomatte */
-ccl_device_inline void kernel_cryptomatte_post(KernelGlobals kg,
-                                               ccl_global float *render_buffer,
-                                               int pixel_index)
+ccl_device_inline void film_cryptomatte_post(KernelGlobals kg,
+                                             ccl_global float *render_buffer,
+                                             int pixel_index)
 {
   const int pass_stride = kernel_data.film.pass_stride;
   const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride;
   ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset +
                                          kernel_data.film.pass_cryptomatte;
-  kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+  film_sort_cryptomatte_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/data_passes.h b/intern/cycles/kernel/film/data_passes.h
new file mode 100644
index 00000000000..efdf616749f
--- /dev/null
+++ b/intern/cycles/kernel/film/data_passes.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/camera/camera.h"
+
+#include "kernel/film/cryptomatte_passes.h"
+#include "kernel/film/write.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline size_t film_write_cryptomatte_pass(ccl_global float *ccl_restrict buffer,
+                                                     size_t depth,
+                                                     float id,
+                                                     float matte_weight)
+{
+  film_write_cryptomatte_slots(buffer, depth * 2, id, matte_weight);
+  return depth * 4;
+}
+
+ccl_device_inline void film_write_data_passes(KernelGlobals kg,
+                                              IntegratorState state,
+                                              ccl_private const ShaderData *sd,
+                                              ccl_global float *ccl_restrict render_buffer)
+{
+#ifdef __PASSES__
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+
+  if (!(path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
+    return;
+  }
+
+  const int flag = kernel_data.film.pass_flag;
+
+  if (!(flag & PASS_ANY)) {
+    return;
+  }
+
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+
+  if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
+    if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f ||
+        average(surface_shader_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
+      if (INTEGRATOR_STATE(state, path, sample) == 0) {
+        if (flag & PASSMASK(DEPTH)) {
+          const float depth = camera_z_depth(kg, sd->P);
+          film_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
+        }
+        if (flag & PASSMASK(OBJECT_ID)) {
+          const float id = object_pass_id(kg, sd->object);
+          film_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
+        }
+        if (flag & PASSMASK(MATERIAL_ID)) {
+          const float id = shader_pass_id(kg, sd);
+          film_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
+        }
+        if (flag & PASSMASK(POSITION)) {
+          const float3 position = sd->P;
+          film_write_pass_float3(buffer + kernel_data.film.pass_position, position);
+        }
+      }
+
+      if (flag & PASSMASK(NORMAL)) {
+        const float3 normal = surface_shader_average_normal(kg, sd);
+        film_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
+      }
+      if (flag & PASSMASK(ROUGHNESS)) {
+        const float roughness = surface_shader_average_roughness(sd);
+        film_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness);
+      }
+      if (flag & PASSMASK(UV)) {
+        const float3 uv = primitive_uv(kg, sd);
+        film_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
+      }
+      if (flag & PASSMASK(MOTION)) {
+        const float4 speed = primitive_motion_vector(kg, sd);
+        film_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
+        film_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
+      }
+
+      INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SINGLE_PASS_DONE;
+    }
+  }
+
+  if (kernel_data.film.cryptomatte_passes) {
+    const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+    const float matte_weight = average(throughput) *
+                               (1.0f - average(surface_shader_transparency(kg, sd)));
+    if (matte_weight > 0.0f) {
+      ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+      if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
+        const float id = object_cryptomatte_id(kg, sd->object);
+        cryptomatte_buffer += film_write_cryptomatte_pass(
+            cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
+      }
+      if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
+        const float id = kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).cryptomatte_id;
+        cryptomatte_buffer += film_write_cryptomatte_pass(
+            cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
+      }
+      if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
+        const float id = object_cryptomatte_asset_id(kg, sd->object);
+        cryptomatte_buffer += film_write_cryptomatte_pass(
+            cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
+      }
+    }
+  }
+
+  if (flag & PASSMASK(DIFFUSE_COLOR)) {
+    const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_diffuse_color,
+                             surface_shader_diffuse(kg, sd) * throughput);
+  }
+  if (flag & PASSMASK(GLOSSY_COLOR)) {
+    const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_glossy_color,
+                             surface_shader_glossy(kg, sd) * throughput);
+  }
+  if (flag & PASSMASK(TRANSMISSION_COLOR)) {
+    const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_transmission_color,
+                             surface_shader_transmission(kg, sd) * throughput);
+  }
+  if (flag & PASSMASK(MIST)) {
+    /* Bring depth into 0..1 range. */
+    const float mist_start = kernel_data.film.mist_start;
+    const float mist_inv_depth = kernel_data.film.mist_inv_depth;
+
+    const float depth = camera_distance(kg, sd->P);
+    float mist = saturatef((depth - mist_start) * mist_inv_depth);
+
+    /* Falloff */
+    const float mist_falloff = kernel_data.film.mist_falloff;
+
+    if (mist_falloff == 1.0f)
+      ;
+    else if (mist_falloff == 2.0f)
+      mist = mist * mist;
+    else if (mist_falloff == 0.5f)
+      mist = sqrtf(mist);
+    else
+      mist = powf(mist, mist_falloff);
+
+    /* Modulate by transparency */
+    const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+    const Spectrum alpha = surface_shader_alpha(kg, sd);
+    const float mist_output = (1.0f - mist) * average(throughput * alpha);
+
+    /* Note that the final value we want in the render buffer is 1 - mist_output.
+     * To avoid having to track this in the integrator state, the negation is done
+     * after rendering. */
+    film_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output);
+  }
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/denoising_passes.h b/intern/cycles/kernel/film/denoising_passes.h
new file mode 100644
index 00000000000..dfc21d787f2
--- /dev/null
+++ b/intern/cycles/kernel/film/denoising_passes.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/film/write.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __DENOISING_FEATURES__
+ccl_device_forceinline void film_write_denoising_features_surface(KernelGlobals kg,
+                                                                  IntegratorState state,
+                                                                  ccl_private const ShaderData *sd,
+                                                                  ccl_global float *ccl_restrict
+                                                                      render_buffer)
+{
+  if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_DENOISING_FEATURES)) {
+    return;
+  }
+
+  /* Skip implicitly transparent surfaces: nothing to write for shaders flagged volume-only. */
+  if (sd->flag & SD_HAS_ONLY_VOLUME) {
+    return;
+  }
+
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+
+  if (kernel_data.film.pass_denoising_depth != PASS_UNUSED) {
+    const Spectrum denoising_feature_throughput = INTEGRATOR_STATE(
+        state, path, denoising_feature_throughput);
+    const float depth = sd->ray_length - INTEGRATOR_STATE(state, ray, tmin);
+    const float denoising_depth = ensure_finite(average(denoising_feature_throughput) * depth);
+    film_write_pass_float(buffer + kernel_data.film.pass_denoising_depth, denoising_depth);
+  }
+
+  float3 normal = zero_float3();
+  Spectrum diffuse_albedo = zero_spectrum();
+  Spectrum specular_albedo = zero_spectrum();
+  float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+      continue;
+    }
+
+    /* All BSDF/BSSRDF closures feed the normal; only rough (diffuse-like) ones feed the albedo. */
+    normal += sc->N * sc->sample_weight;
+    sum_weight += sc->sample_weight;
+
+    Spectrum closure_albedo = sc->weight;
+    /* Closures that include a Fresnel term typically have weights close to 1 even though their
+     * actual contribution is significantly lower.
+     * To account for this, we scale their weight by the average Fresnel factor (the same is also
+     * done for the sample weight in the BSDF setup, so we don't need to scale that here). */
+    if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
+      ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)sc;
+      closure_albedo *= bsdf->extra->fresnel_color;
+    }
+    else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
+      ccl_private PrincipledSheenBsdf *bsdf = (ccl_private PrincipledSheenBsdf *)sc;
+      closure_albedo *= bsdf->avg_value;
+    }
+    else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
+      closure_albedo *= bsdf_principled_hair_albedo(sc);
+    }
+    else if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
+      /* BSSRDF already accounts for weight; adding retro-reflection to albedo would double up. */
+      ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)
+          sc;
+      if (bsdf->components == PRINCIPLED_DIFFUSE_RETRO_REFLECTION) {
+        continue;
+      }
+    }
+
+    if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
+      diffuse_albedo += closure_albedo;
+      sum_nonspecular_weight += sc->sample_weight;
+    }
+    else {
+      specular_albedo += closure_albedo;
+    }
+  }
+
+  /* Write features now, or wait for next bounce if 75% or more sample weight is specular-like. */
+  if ((sum_weight == 0.0f) || (sum_nonspecular_weight * 4.0f > sum_weight)) {
+    if (sum_weight != 0.0f) {
+      normal /= sum_weight;
+    }
+
+    if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+      /* Transform normal into camera space. */
+      const Transform worldtocamera = kernel_data.cam.worldtocamera;
+      normal = transform_direction(&worldtocamera, normal);
+
+      const float3 denoising_normal = ensure_finite(normal);
+      film_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+    }
+
+    if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+      const Spectrum denoising_feature_throughput = INTEGRATOR_STATE(
+          state, path, denoising_feature_throughput);
+      const Spectrum denoising_albedo = ensure_finite(denoising_feature_throughput *
+                                                      diffuse_albedo);
+      film_write_pass_spectrum(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+    }
+
+    INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+  }
+  else {
+    INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) *= specular_albedo;
+  }
+}
+
+ccl_device_forceinline void film_write_denoising_features_volume(KernelGlobals kg,
+                                                                 IntegratorState state,
+                                                                 const Spectrum albedo,
+                                                                 const bool scatter,
+                                                                 ccl_global float *ccl_restrict
+                                                                     render_buffer)
+{
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+  const Spectrum denoising_feature_throughput = INTEGRATOR_STATE(
+      state, path, denoising_feature_throughput);
+
+  if (scatter && kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+    /* Assume scatter is sufficiently diffuse to stop writing denoising features. */
+    INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+
+    /* Write view direction as normal: fixed -Z, presumably in camera space as for surfaces. */
+    const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f);
+    film_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+  }
+
+  if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+    /* Write albedo: feature throughput times albedo, passed through ensure_finite(). */
+    const Spectrum denoising_albedo = ensure_finite(denoising_feature_throughput * albedo);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+  }
+}
+#endif /* __DENOISING_FEATURES__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/accumulate.h b/intern/cycles/kernel/film/light_passes.h
index e10acfd7eb5..b45b5305119 100644
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/light_passes.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "kernel/film/adaptive_sampling.h"
-#include "kernel/film/write_passes.h"
+#include "kernel/film/write.h"
 
 #include "kernel/integrator/shadow_catcher.h"
 
@@ -21,10 +21,10 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void bsdf_eval_init(ccl_private BsdfEval *eval,
                                       const ClosureType closure_type,
-                                      float3 value)
+                                      Spectrum value)
 {
-  eval->diffuse = zero_float3();
-  eval->glossy = zero_float3();
+  eval->diffuse = zero_spectrum();
+  eval->glossy = zero_spectrum();
 
   if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
     eval->diffuse = value;
@@ -38,7 +38,7 @@ ccl_device_inline void bsdf_eval_init(ccl_private BsdfEval *eval,
 
 ccl_device_inline void bsdf_eval_accum(ccl_private BsdfEval *eval,
                                        const ClosureType closure_type,
-                                       float3 value)
+                                       Spectrum value)
 {
   if (CLOSURE_IS_BSDF_DIFFUSE(closure_type)) {
     eval->diffuse += value;
@@ -62,30 +62,30 @@ ccl_device_inline void bsdf_eval_mul(ccl_private BsdfEval *eval, float value)
   eval->sum *= value;
 }
 
-ccl_device_inline void bsdf_eval_mul3(ccl_private BsdfEval *eval, float3 value)
+ccl_device_inline void bsdf_eval_mul(ccl_private BsdfEval *eval, Spectrum value)
 {
   eval->diffuse *= value;
   eval->glossy *= value;
   eval->sum *= value;
 }
 
-ccl_device_inline float3 bsdf_eval_sum(ccl_private const BsdfEval *eval)
+ccl_device_inline Spectrum bsdf_eval_sum(ccl_private const BsdfEval *eval)
 {
   return eval->sum;
 }
 
-ccl_device_inline float3 bsdf_eval_pass_diffuse_weight(ccl_private const BsdfEval *eval)
+ccl_device_inline Spectrum bsdf_eval_pass_diffuse_weight(ccl_private const BsdfEval *eval)
 {
   /* Ratio of diffuse weight to recover proportions for writing to render pass.
    * We assume reflection, transmission and volume scatter to be exclusive. */
-  return safe_divide_float3_float3(eval->diffuse, eval->sum);
+  return safe_divide(eval->diffuse, eval->sum);
 }
 
-ccl_device_inline float3 bsdf_eval_pass_glossy_weight(ccl_private const BsdfEval *eval)
+ccl_device_inline Spectrum bsdf_eval_pass_glossy_weight(ccl_private const BsdfEval *eval)
 {
   /* Ratio of glossy weight to recover proportions for writing to render pass.
    * We assume reflection, transmission and volume scatter to be exclusive. */
-  return safe_divide_float3_float3(eval->glossy, eval->sum);
+  return safe_divide(eval->glossy, eval->sum);
 }
 
 /* --------------------------------------------------------------------
@@ -95,17 +95,17 @@ ccl_device_inline float3 bsdf_eval_pass_glossy_weight(ccl_private const BsdfEval
  * to render buffers instead of using per-thread memory, and to avoid the
  * impact of clamping on other contributions. */
 
-ccl_device_forceinline void kernel_accum_clamp(KernelGlobals kg, ccl_private float3 *L, int bounce)
+ccl_device_forceinline void film_clamp_light(KernelGlobals kg, ccl_private Spectrum *L, int bounce)
 {
 #ifdef __KERNEL_DEBUG_NAN__
-  if (!isfinite3_safe(*L)) {
+  if (!isfinite_safe(*L)) {
     kernel_assert(!"Cycles sample with non-finite value detected");
   }
 #endif
   /* Make sure all components are finite, allowing the contribution to be usable by adaptive
    * sampling convergence check, but also to make it so render result never causes issues with
    * post-processing. */
-  *L = ensure_finite3(*L);
+  *L = ensure_finite(*L);
 
 #ifdef __CLAMP_SAMPLE__
   float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
@@ -121,55 +121,49 @@ ccl_device_forceinline void kernel_accum_clamp(KernelGlobals kg, ccl_private flo
  * Pass accumulation utilities.
  */
 
-/* Get pointer to pixel in render buffer. */
-ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
-    KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer)
-{
-  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
-  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
-                                        kernel_data.film.pass_stride;
-  return render_buffer + render_buffer_offset;
-}
-
 /* --------------------------------------------------------------------
  * Adaptive sampling.
  */
 
-ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
-                                          ConstIntegratorState state,
-                                          ccl_global float *ccl_restrict render_buffer,
-                                          int sample,
-                                          int sample_offset)
+ccl_device_inline int film_write_sample(KernelGlobals kg,
+                                        ConstIntegratorState state,
+                                        ccl_global float *ccl_restrict render_buffer,
+                                        int sample,
+                                        int sample_offset)
 {
   if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
     return sample;
   }
 
-  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
 
   return atomic_fetch_and_add_uint32(
              (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
          sample_offset;
 }
 
-ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
-                                             const int sample,
-                                             const float3 contribution,
-                                             ccl_global float *ccl_restrict buffer)
+ccl_device void film_write_adaptive_buffer(KernelGlobals kg,
+                                           const int sample,
+                                           const Spectrum contribution,
+                                           ccl_global float *ccl_restrict buffer)
 {
-  /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
-   * criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
-   * Carlo global illumination" except that here it is applied per pixel and not in hierarchical
-   * tiles. */
+  /* Adaptive Sampling. Fill the additional buffer with only one half of the samples and
+   * calculate our stopping criteria. This is the heuristic from "A hierarchical automatic
+   * stopping condition for Monte Carlo global illumination" except that here it is applied
+   * per pixel and not in hierarchical tiles. */
 
   if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
     return;
   }
 
-  if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
-    kernel_write_pass_float4(
-        buffer + kernel_data.film.pass_adaptive_aux_buffer,
-        make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f));
+  if (sample_is_class_A(kernel_data.integrator.sampling_pattern, sample)) {
+    const float3 contribution_rgb = spectrum_to_rgb(contribution);
+
+    film_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
+                           make_float4(contribution_rgb.x * 2.0f,
+                                       contribution_rgb.y * 2.0f,
+                                       contribution_rgb.z * 2.0f,
+                                       0.0f));
   }
 }
 
@@ -184,10 +178,10 @@ ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
  * Returns truth if the contribution is fully handled here and is not to be added to the other
  * passes (like combined, adaptive sampling). */
 
-ccl_device bool kernel_accum_shadow_catcher(KernelGlobals kg,
-                                            const uint32_t path_flag,
-                                            const float3 contribution,
-                                            ccl_global float *ccl_restrict buffer)
+ccl_device bool film_write_shadow_catcher(KernelGlobals kg,
+                                          const uint32_t path_flag,
+                                          const Spectrum contribution,
+                                          ccl_global float *ccl_restrict buffer)
 {
   if (!kernel_data.integrator.has_shadow_catcher) {
     return false;
@@ -198,7 +192,7 @@ ccl_device bool kernel_accum_shadow_catcher(KernelGlobals kg,
 
   /* Matte pass. */
   if (kernel_shadow_catcher_is_matte_path(path_flag)) {
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution);
     /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive
      * sampling is based on how noisy the combined pass is as if there were no catchers in the
      * scene. */
@@ -206,18 +200,18 @@ ccl_device bool kernel_accum_shadow_catcher(KernelGlobals kg,
 
   /* Shadow catcher pass. */
   if (kernel_shadow_catcher_is_object_pass(path_flag)) {
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_shadow_catcher, contribution);
     return true;
   }
 
   return false;
 }
 
-ccl_device bool kernel_accum_shadow_catcher_transparent(KernelGlobals kg,
-                                                        const uint32_t path_flag,
-                                                        const float3 contribution,
-                                                        const float transparent,
-                                                        ccl_global float *ccl_restrict buffer)
+ccl_device bool film_write_shadow_catcher_transparent(KernelGlobals kg,
+                                                      const uint32_t path_flag,
+                                                      const Spectrum contribution,
+                                                      const float transparent,
+                                                      ccl_global float *ccl_restrict buffer)
 {
   if (!kernel_data.integrator.has_shadow_catcher) {
     return false;
@@ -232,9 +226,11 @@ ccl_device bool kernel_accum_shadow_catcher_transparent(KernelGlobals kg,
 
   /* Matte pass. */
   if (kernel_shadow_catcher_is_matte_path(path_flag)) {
-    kernel_write_pass_float4(
+    const float3 contribution_rgb = spectrum_to_rgb(contribution);
+
+    film_write_pass_float4(
         buffer + kernel_data.film.pass_shadow_catcher_matte,
-        make_float4(contribution.x, contribution.y, contribution.z, transparent));
+        make_float4(contribution_rgb.x, contribution_rgb.y, contribution_rgb.z, transparent));
     /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive
      * sampling is based on how noisy the combined pass is as if there were no catchers in the
      * scene. */
@@ -245,17 +241,17 @@ ccl_device bool kernel_accum_shadow_catcher_transparent(KernelGlobals kg,
     /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the
      * calculation and the alpha channel of the pass contains numbers of samples contributed to a
      * pixel of the pass. */
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_shadow_catcher, contribution);
     return true;
   }
 
   return false;
 }
 
-ccl_device void kernel_accum_shadow_catcher_transparent_only(KernelGlobals kg,
-                                                             const uint32_t path_flag,
-                                                             const float transparent,
-                                                             ccl_global float *ccl_restrict buffer)
+ccl_device void film_write_shadow_catcher_transparent_only(KernelGlobals kg,
+                                                           const uint32_t path_flag,
+                                                           const float transparent,
+                                                           ccl_global float *ccl_restrict buffer)
 {
   if (!kernel_data.integrator.has_shadow_catcher) {
     return;
@@ -265,10 +261,29 @@ ccl_device void kernel_accum_shadow_catcher_transparent_only(KernelGlobals kg,
 
   /* Matte pass. */
   if (kernel_shadow_catcher_is_matte_path(path_flag)) {
-    kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent);
+    film_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent);
   }
 }
 
+/* Write shadow catcher sample count and matte transparency on a bounce off the catcher object. */
+ccl_device_forceinline void film_write_shadow_catcher_bounce_data(
+    KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
+{
+  kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
+  kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+
+  /* Count one sample for the shadow catcher object. */
+  film_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
+
+  /* Since the split is done, the sample does not contribute to the matte, so accumulate its
+   * average throughput as transparency at offset 3 of the matte pass. */
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+  film_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
+                        average(throughput));
+}
+
 #endif /* __SHADOW_CATCHER__ */
 
 /* --------------------------------------------------------------------
@@ -276,54 +291,55 @@ ccl_device void kernel_accum_shadow_catcher_transparent_only(KernelGlobals kg,
  */
 
 /* Write combined pass. */
-ccl_device_inline void kernel_accum_combined_pass(KernelGlobals kg,
-                                                  const uint32_t path_flag,
-                                                  const int sample,
-                                                  const float3 contribution,
-                                                  ccl_global float *ccl_restrict buffer)
+ccl_device_inline void film_write_combined_pass(KernelGlobals kg,
+                                                const uint32_t path_flag,
+                                                const int sample,
+                                                const Spectrum contribution,
+                                                ccl_global float *ccl_restrict buffer)
 {
 #ifdef __SHADOW_CATCHER__
-  if (kernel_accum_shadow_catcher(kg, path_flag, contribution, buffer)) {
+  if (film_write_shadow_catcher(kg, path_flag, contribution, buffer)) {
     return;
   }
 #endif
 
   if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution);
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_combined, contribution);
   }
 
-  kernel_accum_adaptive_buffer(kg, sample, contribution, buffer);
+  film_write_adaptive_buffer(kg, sample, contribution, buffer);
 }
 
 /* Write combined pass with transparency. */
-ccl_device_inline void kernel_accum_combined_transparent_pass(KernelGlobals kg,
-                                                              const uint32_t path_flag,
-                                                              const int sample,
-                                                              const float3 contribution,
-                                                              const float transparent,
-                                                              ccl_global float *ccl_restrict
-                                                                  buffer)
+ccl_device_inline void film_write_combined_transparent_pass(KernelGlobals kg,
+                                                            const uint32_t path_flag,
+                                                            const int sample,
+                                                            const Spectrum contribution,
+                                                            const float transparent,
+                                                            ccl_global float *ccl_restrict buffer)
 {
 #ifdef __SHADOW_CATCHER__
-  if (kernel_accum_shadow_catcher_transparent(kg, path_flag, contribution, transparent, buffer)) {
+  if (film_write_shadow_catcher_transparent(kg, path_flag, contribution, transparent, buffer)) {
     return;
   }
 #endif
 
   if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
-    kernel_write_pass_float4(
+    const float3 contribution_rgb = spectrum_to_rgb(contribution);
+
+    film_write_pass_float4(
         buffer + kernel_data.film.pass_combined,
-        make_float4(contribution.x, contribution.y, contribution.z, transparent));
+        make_float4(contribution_rgb.x, contribution_rgb.y, contribution_rgb.z, transparent));
   }
 
-  kernel_accum_adaptive_buffer(kg, sample, contribution, buffer);
+  film_write_adaptive_buffer(kg, sample, contribution, buffer);
 }
 
 /* Write background or emission to appropriate pass. */
-ccl_device_inline void kernel_accum_emission_or_background_pass(
+ccl_device_inline void film_write_emission_or_background_pass(
     KernelGlobals kg,
     ConstIntegratorState state,
-    float3 contribution,
+    Spectrum contribution,
     ccl_global float *ccl_restrict buffer,
     const int pass,
     const int lightgroup = LIGHTGROUP_NONE)
@@ -340,16 +356,16 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(
 #  ifdef __DENOISING_FEATURES__
   if (path_flag & PATH_RAY_DENOISING_FEATURES) {
     if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
-      const float3 denoising_feature_throughput = INTEGRATOR_STATE(
+      const Spectrum denoising_feature_throughput = INTEGRATOR_STATE(
           state, path, denoising_feature_throughput);
-      const float3 denoising_albedo = denoising_feature_throughput * contribution;
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+      const Spectrum denoising_albedo = denoising_feature_throughput * contribution;
+      film_write_pass_spectrum(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
     }
   }
 #  endif /* __DENOISING_FEATURES__ */
 
   if (lightgroup != LIGHTGROUP_NONE && kernel_data.film.pass_lightgroup != PASS_UNUSED) {
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_lightgroup + 3 * lightgroup,
+    film_write_pass_spectrum(buffer + kernel_data.film.pass_lightgroup + 3 * lightgroup,
                              contribution);
   }
 
@@ -366,15 +382,15 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(
 
     if (path_flag & PATH_RAY_SURFACE_PASS) {
       /* Indirectly visible through reflection. */
-      const float3 diffuse_weight = INTEGRATOR_STATE(state, path, pass_diffuse_weight);
-      const float3 glossy_weight = INTEGRATOR_STATE(state, path, pass_glossy_weight);
+      const Spectrum diffuse_weight = INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+      const Spectrum glossy_weight = INTEGRATOR_STATE(state, path, pass_glossy_weight);
 
       /* Glossy */
       const int glossy_pass_offset = ((INTEGRATOR_STATE(state, path, bounce) == 1) ?
                                           kernel_data.film.pass_glossy_direct :
                                           kernel_data.film.pass_glossy_indirect);
       if (glossy_pass_offset != PASS_UNUSED) {
-        kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
+        film_write_pass_spectrum(buffer + glossy_pass_offset, glossy_weight * contribution);
       }
 
       /* Transmission */
@@ -385,8 +401,8 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(
       if (transmission_pass_offset != PASS_UNUSED) {
         /* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
          * GPU memory. */
-        const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
-        kernel_write_pass_float3(buffer + transmission_pass_offset,
+        const Spectrum transmission_weight = one_spectrum() - diffuse_weight - glossy_weight;
+        film_write_pass_spectrum(buffer + transmission_pass_offset,
                                  transmission_weight * contribution);
       }
 
@@ -408,19 +424,19 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(
 
   /* Single write call for GPU coherence. */
   if (pass_offset != PASS_UNUSED) {
-    kernel_write_pass_float3(buffer + pass_offset, contribution);
+    film_write_pass_spectrum(buffer + pass_offset, contribution);
   }
 #endif /* __PASSES__ */
 }
 
 /* Write light contribution to render buffer. */
-ccl_device_inline void kernel_accum_light(KernelGlobals kg,
-                                          ConstIntegratorShadowState state,
-                                          ccl_global float *ccl_restrict render_buffer)
+ccl_device_inline void film_write_direct_light(KernelGlobals kg,
+                                               ConstIntegratorShadowState state,
+                                               ccl_global float *ccl_restrict render_buffer)
 {
   /* The throughput for shadow paths already contains the light shader evaluation. */
-  float3 contribution = INTEGRATOR_STATE(state, shadow_path, throughput);
-  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, shadow_path, bounce));
+  Spectrum contribution = INTEGRATOR_STATE(state, shadow_path, throughput);
+  film_clamp_light(kg, &contribution, INTEGRATOR_STATE(state, shadow_path, bounce));
 
   const uint32_t render_pixel_index = INTEGRATOR_STATE(state, shadow_path, render_pixel_index);
   const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
@@ -433,17 +449,17 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
   /* Ambient occlusion. */
   if (path_flag & PATH_RAY_SHADOW_FOR_AO) {
     if ((kernel_data.kernel_features & KERNEL_FEATURE_AO_PASS) && (path_flag & PATH_RAY_CAMERA)) {
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, contribution);
+      film_write_pass_spectrum(buffer + kernel_data.film.pass_ao, contribution);
     }
     if (kernel_data.kernel_features & KERNEL_FEATURE_AO_ADDITIVE) {
-      const float3 ao_weight = INTEGRATOR_STATE(state, shadow_path, unshadowed_throughput);
-      kernel_accum_combined_pass(kg, path_flag, sample, contribution * ao_weight, buffer);
+      const Spectrum ao_weight = INTEGRATOR_STATE(state, shadow_path, unshadowed_throughput);
+      film_write_combined_pass(kg, path_flag, sample, contribution * ao_weight, buffer);
     }
     return;
   }
 
   /* Direct light shadow. */
-  kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
+  film_write_combined_pass(kg, path_flag, sample, contribution, buffer);
 
 #ifdef __PASSES__
   if (kernel_data.film.light_pass_flag & PASS_ANY) {
@@ -458,7 +474,7 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
     /* Write lightgroup pass. LIGHTGROUP_NONE is ~0 so decode from unsigned to signed */
     const int lightgroup = (int)(INTEGRATOR_STATE(state, shadow_path, lightgroup)) - 1;
     if (lightgroup != LIGHTGROUP_NONE && kernel_data.film.pass_lightgroup != PASS_UNUSED) {
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_lightgroup + 3 * lightgroup,
+      film_write_pass_spectrum(buffer + kernel_data.film.pass_lightgroup + 3 * lightgroup,
                                contribution);
     }
 
@@ -467,15 +483,15 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
 
       if (path_flag & PATH_RAY_SURFACE_PASS) {
         /* Indirectly visible through reflection. */
-        const float3 diffuse_weight = INTEGRATOR_STATE(state, shadow_path, pass_diffuse_weight);
-        const float3 glossy_weight = INTEGRATOR_STATE(state, shadow_path, pass_glossy_weight);
+        const Spectrum diffuse_weight = INTEGRATOR_STATE(state, shadow_path, pass_diffuse_weight);
+        const Spectrum glossy_weight = INTEGRATOR_STATE(state, shadow_path, pass_glossy_weight);
 
         /* Glossy */
         const int glossy_pass_offset = ((INTEGRATOR_STATE(state, shadow_path, bounce) == 0) ?
                                             kernel_data.film.pass_glossy_direct :
                                             kernel_data.film.pass_glossy_indirect);
         if (glossy_pass_offset != PASS_UNUSED) {
-          kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_weight * contribution);
+          film_write_pass_spectrum(buffer + glossy_pass_offset, glossy_weight * contribution);
         }
 
         /* Transmission */
@@ -486,8 +502,8 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
         if (transmission_pass_offset != PASS_UNUSED) {
           /* Transmission is what remains if not diffuse and glossy, not stored explicitly to save
            * GPU memory. */
-          const float3 transmission_weight = one_float3() - diffuse_weight - glossy_weight;
-          kernel_write_pass_float3(buffer + transmission_pass_offset,
+          const Spectrum transmission_weight = one_spectrum() - diffuse_weight - glossy_weight;
+          film_write_pass_spectrum(buffer + transmission_pass_offset,
                                    transmission_weight * contribution);
         }
 
@@ -508,19 +524,19 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
 
       /* Single write call for GPU coherence. */
       if (pass_offset != PASS_UNUSED) {
-        kernel_write_pass_float3(buffer + pass_offset, contribution);
+        film_write_pass_spectrum(buffer + pass_offset, contribution);
       }
     }
 
     /* Write shadow pass. */
     if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
         (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
-      const float3 unshadowed_throughput = INTEGRATOR_STATE(
+      const Spectrum unshadowed_throughput = INTEGRATOR_STATE(
           state, shadow_path, unshadowed_throughput);
-      const float3 shadowed_throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
-      const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) *
-                            kernel_data.film.pass_shadow_scale;
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow);
+      const Spectrum shadowed_throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
+      const Spectrum shadow = safe_divide(shadowed_throughput, unshadowed_throughput) *
+                              kernel_data.film.pass_shadow_scale;
+      film_write_pass_spectrum(buffer + kernel_data.film.pass_shadow, shadow);
     }
   }
 #endif
@@ -531,78 +547,96 @@ ccl_device_inline void kernel_accum_light(KernelGlobals kg,
  * Note that we accumulate transparency = 1 - alpha in the render buffer.
  * Otherwise we'd have to write alpha on path termination, which happens
  * in many places. */
-ccl_device_inline void kernel_accum_transparent(KernelGlobals kg,
-                                                ConstIntegratorState state,
-                                                const uint32_t path_flag,
-                                                const float transparent,
-                                                ccl_global float *ccl_restrict buffer)
+ccl_device_inline void film_write_transparent(KernelGlobals kg,
+                                              ConstIntegratorState state,
+                                              const uint32_t path_flag,
+                                              const float transparent,
+                                              ccl_global float *ccl_restrict buffer)
 {
   if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
-    kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent);
+    film_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent);
   }
 
-  kernel_accum_shadow_catcher_transparent_only(kg, path_flag, transparent, buffer);
+  film_write_shadow_catcher_transparent_only(kg, path_flag, transparent, buffer);
 }
 
 /* Write holdout to render buffer. */
-ccl_device_inline void kernel_accum_holdout(KernelGlobals kg,
-                                            ConstIntegratorState state,
-                                            const uint32_t path_flag,
-                                            const float transparent,
-                                            ccl_global float *ccl_restrict render_buffer)
+ccl_device_inline void film_write_holdout(KernelGlobals kg,
+                                          ConstIntegratorState state,
+                                          const uint32_t path_flag,
+                                          const float transparent,
+                                          ccl_global float *ccl_restrict render_buffer)
 {
-  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
-  kernel_accum_transparent(kg, state, path_flag, transparent, buffer);
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+  film_write_transparent(kg, state, path_flag, transparent, buffer);
 }
 
 /* Write background contribution to render buffer.
  *
- * Includes transparency, matching kernel_accum_transparent. */
-ccl_device_inline void kernel_accum_background(KernelGlobals kg,
-                                               ConstIntegratorState state,
-                                               const float3 L,
-                                               const float transparent,
-                                               const bool is_transparent_background_ray,
-                                               ccl_global float *ccl_restrict render_buffer)
+ * Includes transparency, matching film_write_transparent. */
+ccl_device_inline void film_write_background(KernelGlobals kg,
+                                             ConstIntegratorState state,
+                                             const Spectrum L,
+                                             const float transparent,
+                                             const bool is_transparent_background_ray,
+                                             ccl_global float *ccl_restrict render_buffer)
 {
-  float3 contribution = float3(INTEGRATOR_STATE(state, path, throughput)) * L;
-  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
+  Spectrum contribution = INTEGRATOR_STATE(state, path, throughput) * L;
+  film_clamp_light(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
 
-  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
 
   if (is_transparent_background_ray) {
-    kernel_accum_transparent(kg, state, path_flag, transparent, buffer);
+    film_write_transparent(kg, state, path_flag, transparent, buffer);
   }
   else {
     const int sample = INTEGRATOR_STATE(state, path, sample);
-    kernel_accum_combined_transparent_pass(
-        kg, path_flag, sample, contribution, transparent, buffer);
-  }
-  kernel_accum_emission_or_background_pass(kg,
-                                           state,
-                                           contribution,
-                                           buffer,
-                                           kernel_data.film.pass_background,
-                                           kernel_data.background.lightgroup);
+    film_write_combined_transparent_pass(kg, path_flag, sample, contribution, transparent, buffer);
+  }
+  film_write_emission_or_background_pass(kg,
+                                         state,
+                                         contribution,
+                                         buffer,
+                                         kernel_data.film.pass_background,
+                                         kernel_data.background.lightgroup);
 }
 
 /* Write emission to render buffer. */
-ccl_device_inline void kernel_accum_emission(KernelGlobals kg,
-                                             ConstIntegratorState state,
-                                             const float3 L,
-                                             ccl_global float *ccl_restrict render_buffer,
-                                             const int lightgroup = LIGHTGROUP_NONE)
+ccl_device_inline void film_write_volume_emission(KernelGlobals kg,
+                                                  ConstIntegratorState state,
+                                                  const Spectrum L,
+                                                  ccl_global float *ccl_restrict render_buffer,
+                                                  const int lightgroup = LIGHTGROUP_NONE)
+{
+  Spectrum contribution = L;
+  film_clamp_light(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
+
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
+  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
+  const int sample = INTEGRATOR_STATE(state, path, sample);
+
+  film_write_combined_pass(kg, path_flag, sample, contribution, buffer);
+  film_write_emission_or_background_pass(
+      kg, state, contribution, buffer, kernel_data.film.pass_emission, lightgroup);
+}
+
+ccl_device_inline void film_write_surface_emission(KernelGlobals kg,
+                                                   ConstIntegratorState state,
+                                                   const Spectrum L,
+                                                   const float mis_weight,
+                                                   ccl_global float *ccl_restrict render_buffer,
+                                                   const int lightgroup = LIGHTGROUP_NONE)
 {
-  float3 contribution = L;
-  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
+  Spectrum contribution = INTEGRATOR_STATE(state, path, throughput) * L * mis_weight;
+  film_clamp_light(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);
 
-  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
+  ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
   const int sample = INTEGRATOR_STATE(state, path, sample);
 
-  kernel_accum_combined_pass(kg, path_flag, sample, contribution, buffer);
-  kernel_accum_emission_or_background_pass(
+  film_write_combined_pass(kg, path_flag, sample, contribution, buffer);
+  film_write_emission_or_background_pass(
       kg, state, contribution, buffer, kernel_data.film.pass_emission, lightgroup);
 }
 
diff --git a/intern/cycles/kernel/film/passes.h b/intern/cycles/kernel/film/passes.h
deleted file mode 100644
index 773f5726850..00000000000
--- a/intern/cycles/kernel/film/passes.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#pragma once
-
-#include "kernel/geom/geom.h"
-
-#include "kernel/film/id_passes.h"
-#include "kernel/film/write_passes.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Get pointer to pixel in render buffer. */
-ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer(
-    KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer)
-{
-  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
-  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
-                                        kernel_data.film.pass_stride;
-  return render_buffer + render_buffer_offset;
-}
-
-#ifdef __DENOISING_FEATURES__
-
-ccl_device_forceinline void kernel_write_denoising_features_surface(
-    KernelGlobals kg,
-    IntegratorState state,
-    ccl_private const ShaderData *sd,
-    ccl_global float *ccl_restrict render_buffer)
-{
-  if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_DENOISING_FEATURES)) {
-    return;
-  }
-
-  /* Skip implicitly transparent surfaces. */
-  if (sd->flag & SD_HAS_ONLY_VOLUME) {
-    return;
-  }
-
-  ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
-
-  if (kernel_data.film.pass_denoising_depth != PASS_UNUSED) {
-    const float3 denoising_feature_throughput = INTEGRATOR_STATE(
-        state, path, denoising_feature_throughput);
-    const float denoising_depth = ensure_finite(average(denoising_feature_throughput) *
-                                                sd->ray_length);
-    kernel_write_pass_float(buffer + kernel_data.film.pass_denoising_depth, denoising_depth);
-  }
-
-  float3 normal = zero_float3();
-  float3 diffuse_albedo = zero_float3();
-  float3 specular_albedo = zero_float3();
-  float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-      continue;
-    }
-
-    /* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
-    normal += sc->N * sc->sample_weight;
-    sum_weight += sc->sample_weight;
-
-    float3 closure_albedo = sc->weight;
-    /* Closures that include a Fresnel term typically have weights close to 1 even though their
-     * actual contribution is significantly lower.
-     * To account for this, we scale their weight by the average fresnel factor (the same is also
-     * done for the sample weight in the BSDF setup, so we don't need to scale that here). */
-    if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
-      ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)sc;
-      closure_albedo *= bsdf->extra->fresnel_color;
-    }
-    else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
-      ccl_private PrincipledSheenBsdf *bsdf = (ccl_private PrincipledSheenBsdf *)sc;
-      closure_albedo *= bsdf->avg_value;
-    }
-    else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
-      closure_albedo *= bsdf_principled_hair_albedo(sc);
-    }
-    else if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
-      /* BSSRDF already accounts for weight, retro-reflection would double up. */
-      ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)
-          sc;
-      if (bsdf->components == PRINCIPLED_DIFFUSE_RETRO_REFLECTION) {
-        continue;
-      }
-    }
-
-    if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
-      diffuse_albedo += closure_albedo;
-      sum_nonspecular_weight += sc->sample_weight;
-    }
-    else {
-      specular_albedo += closure_albedo;
-    }
-  }
-
-  /* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
-  if ((sum_weight == 0.0f) || (sum_nonspecular_weight * 4.0f > sum_weight)) {
-    if (sum_weight != 0.0f) {
-      normal /= sum_weight;
-    }
-
-    if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
-      /* Transform normal into camera space. */
-      const Transform worldtocamera = kernel_data.cam.worldtocamera;
-      normal = transform_direction(&worldtocamera, normal);
-
-      const float3 denoising_normal = ensure_finite3(normal);
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
-    }
-
-    if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
-      const float3 denoising_feature_throughput = INTEGRATOR_STATE(
-          state, path, denoising_feature_throughput);
-      const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput *
-                                                     diffuse_albedo);
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
-    }
-
-    INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
-  }
-  else {
-    INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) *= specular_albedo;
-  }
-}
-
-ccl_device_forceinline void kernel_write_denoising_features_volume(KernelGlobals kg,
-                                                                   IntegratorState state,
-                                                                   const float3 albedo,
-                                                                   const bool scatter,
-                                                                   ccl_global float *ccl_restrict
-                                                                       render_buffer)
-{
-  ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
-  const float3 denoising_feature_throughput = INTEGRATOR_STATE(
-      state, path, denoising_feature_throughput);
-
-  if (scatter && kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
-    /* Assume scatter is sufficiently diffuse to stop writing denoising features. */
-    INTEGRATOR_STATE_WRITE(state, path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
-
-    /* Write view direction as normal. */
-    const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f);
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
-  }
-
-  if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
-    /* Write albedo. */
-    const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo);
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
-  }
-}
-#endif /* __DENOISING_FEATURES__ */
-
-ccl_device_inline size_t kernel_write_id_pass(ccl_global float *ccl_restrict buffer,
-                                              size_t depth,
-                                              float id,
-                                              float matte_weight)
-{
-  kernel_write_id_slots(buffer, depth * 2, id, matte_weight);
-  return depth * 4;
-}
-
-ccl_device_inline void kernel_write_data_passes(KernelGlobals kg,
-                                                IntegratorState state,
-                                                ccl_private const ShaderData *sd,
-                                                ccl_global float *ccl_restrict render_buffer)
-{
-#ifdef __PASSES__
-  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-
-  if (!(path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
-    return;
-  }
-
-  const int flag = kernel_data.film.pass_flag;
-
-  if (!(flag & PASS_ANY)) {
-    return;
-  }
-
-  ccl_global float *buffer = kernel_pass_pixel_render_buffer(kg, state, render_buffer);
-
-  if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-    if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f ||
-        average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
-      if (INTEGRATOR_STATE(state, path, sample) == 0) {
-        if (flag & PASSMASK(DEPTH)) {
-          const float depth = camera_z_depth(kg, sd->P);
-          kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
-        }
-        if (flag & PASSMASK(OBJECT_ID)) {
-          const float id = object_pass_id(kg, sd->object);
-          kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
-        }
-        if (flag & PASSMASK(MATERIAL_ID)) {
-          const float id = shader_pass_id(kg, sd);
-          kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
-        }
-        if (flag & PASSMASK(POSITION)) {
-          const float3 position = sd->P;
-          kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position);
-        }
-      }
-
-      if (flag & PASSMASK(NORMAL)) {
-        const float3 normal = shader_bsdf_average_normal(kg, sd);
-        kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
-      }
-      if (flag & PASSMASK(ROUGHNESS)) {
-        const float roughness = shader_bsdf_average_roughness(sd);
-        kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness);
-      }
-      if (flag & PASSMASK(UV)) {
-        const float3 uv = primitive_uv(kg, sd);
-        kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
-      }
-      if (flag & PASSMASK(MOTION)) {
-        const float4 speed = primitive_motion_vector(kg, sd);
-        kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
-        kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
-      }
-
-      INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SINGLE_PASS_DONE;
-    }
-  }
-
-  if (kernel_data.film.cryptomatte_passes) {
-    const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-    const float matte_weight = average(throughput) *
-                               (1.0f - average(shader_bsdf_transparency(kg, sd)));
-    if (matte_weight > 0.0f) {
-      ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
-      if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
-        const float id = object_cryptomatte_id(kg, sd->object);
-        cryptomatte_buffer += kernel_write_id_pass(
-            cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
-      }
-      if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
-        const float id = shader_cryptomatte_id(kg, sd->shader);
-        cryptomatte_buffer += kernel_write_id_pass(
-            cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
-      }
-      if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
-        const float id = object_cryptomatte_asset_id(kg, sd->object);
-        cryptomatte_buffer += kernel_write_id_pass(
-            cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
-      }
-    }
-  }
-
-  if (flag & PASSMASK(DIFFUSE_COLOR)) {
-    const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color,
-                             shader_bsdf_diffuse(kg, sd) * throughput);
-  }
-  if (flag & PASSMASK(GLOSSY_COLOR)) {
-    const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color,
-                             shader_bsdf_glossy(kg, sd) * throughput);
-  }
-  if (flag & PASSMASK(TRANSMISSION_COLOR)) {
-    const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-    kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
-                             shader_bsdf_transmission(kg, sd) * throughput);
-  }
-  if (flag & PASSMASK(MIST)) {
-    /* Bring depth into 0..1 range. */
-    const float mist_start = kernel_data.film.mist_start;
-    const float mist_inv_depth = kernel_data.film.mist_inv_depth;
-
-    const float depth = camera_distance(kg, sd->P);
-    float mist = saturatef((depth - mist_start) * mist_inv_depth);
-
-    /* Falloff */
-    const float mist_falloff = kernel_data.film.mist_falloff;
-
-    if (mist_falloff == 1.0f)
-      ;
-    else if (mist_falloff == 2.0f)
-      mist = mist * mist;
-    else if (mist_falloff == 0.5f)
-      mist = sqrtf(mist);
-    else
-      mist = powf(mist, mist_falloff);
-
-    /* Modulate by transparency */
-    const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-    const float3 alpha = shader_bsdf_alpha(kg, sd);
-    const float mist_output = (1.0f - mist) * average(throughput * alpha);
-
-    /* Note that the final value in the render buffer we want is 1 - mist_output,
-     * to avoid having to tracking this in the Integrator state we do the negation
-     * after rendering. */
-    kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output);
-  }
-#endif
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/film/read.h b/intern/cycles/kernel/film/read.h
index a0236909f4b..108f992e29d 100644
--- a/intern/cycles/kernel/film/read.h
+++ b/intern/cycles/kernel/film/read.h
@@ -1,6 +1,10 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
+/* Functions for retrieving render passes for display or output. Reading from
+ * the raw render buffer and normalizing based on the number of samples,
+ * computing alpha, compositing shadow catchers, etc. */
+
 #pragma once
 
 CCL_NAMESPACE_BEGIN
@@ -235,6 +239,21 @@ ccl_device_inline void film_get_pass_pixel_float3(ccl_global const KernelFilmCon
   pixel[0] = f.x;
   pixel[1] = f.y;
   pixel[2] = f.z;
+
+  /* Optional alpha channel. */
+  if (kfilm_convert->num_components >= 4) {
+    if (kfilm_convert->pass_combined != PASS_UNUSED) {
+      float scale, scale_exposure;
+      film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+      ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined;
+      const float alpha = in_combined[3] * scale;
+      pixel[3] = film_transparency_to_alpha(alpha);
+    }
+    else {
+      pixel[3] = 1.0f;
+    }
+  }
 }
 
 /* --------------------------------------------------------------------
diff --git a/intern/cycles/kernel/film/write_passes.h b/intern/cycles/kernel/film/write.h
index 9148d73518f..c630a522ee3 100644
--- a/intern/cycles/kernel/film/write_passes.h
+++ b/intern/cycles/kernel/film/write.h
@@ -3,13 +3,26 @@
 
 #pragma once
 
+#include "kernel/util/color.h"
+
 #ifdef __KERNEL_GPU__
 #  define __ATOMIC_PASS_WRITE__
 #endif
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value)
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *film_pass_pixel_render_buffer(
+    KernelGlobals kg, ConstIntegratorState state, ccl_global float *ccl_restrict render_buffer)
+{
+  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
+  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+                                        kernel_data.film.pass_stride;
+  return render_buffer + render_buffer_offset;
+}
+
+/* Write to pixel. */
+ccl_device_inline void film_write_pass_float(ccl_global float *ccl_restrict buffer, float value)
 {
 #ifdef __ATOMIC_PASS_WRITE__
   atomic_add_and_fetch_float(buffer, value);
@@ -18,8 +31,7 @@ ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict bu
 #endif
 }
 
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer,
-                                                float3 value)
+ccl_device_inline void film_write_pass_float3(ccl_global float *ccl_restrict buffer, float3 value)
 {
 #ifdef __ATOMIC_PASS_WRITE__
   ccl_global float *buf_x = buffer + 0;
@@ -36,8 +48,13 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict b
 #endif
 }
 
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer,
-                                                float4 value)
+ccl_device_inline void film_write_pass_spectrum(ccl_global float *ccl_restrict buffer,
+                                                Spectrum value)
+{
+  film_write_pass_float3(buffer, spectrum_to_rgb(value));
+}
+
+ccl_device_inline void film_write_pass_float4(ccl_global float *ccl_restrict buffer, float4 value)
 {
 #ifdef __ATOMIC_PASS_WRITE__
   ccl_global float *buf_x = buffer + 0;
diff --git a/intern/cycles/kernel/geom/attribute.h b/intern/cycles/kernel/geom/attribute.h
index 774b25a76ff..3a0ee1b09d1 100644
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -16,14 +16,14 @@ CCL_NAMESPACE_BEGIN
 
 /* Patch index for triangle, -1 if not subdivision triangle */
 
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, int prim)
 {
-  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+  return (prim != PRIM_NONE) ? kernel_data_fetch(tri_patch, prim) : ~0;
 }
 
-ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
+ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, int prim, int type)
 {
-  if ((sd->type & PRIMITIVE_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
+  if ((type & PRIMITIVE_TRIANGLE) && subd_triangle_patch(kg, prim) != ~0) {
     return ATTR_PRIM_SUBD;
   }
   else {
@@ -42,21 +42,20 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
 
 ccl_device_inline uint object_attribute_map_offset(KernelGlobals kg, int object)
 {
-  return kernel_tex_fetch(__objects, object).attribute_map_offset;
+  return kernel_data_fetch(objects, object).attribute_map_offset;
 }
 
-ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals kg,
-                                                     ccl_private const ShaderData *sd,
-                                                     uint id)
+ccl_device_inline AttributeDescriptor
+find_attribute(KernelGlobals kg, int object, int prim, int type, uint64_t id)
 {
-  if (sd->object == OBJECT_NONE) {
+  if (object == OBJECT_NONE) {
     return attribute_not_found();
   }
 
   /* for SVM, find attribute by unique id */
-  uint attr_offset = object_attribute_map_offset(kg, sd->object);
-  attr_offset += attribute_primitive_type(kg, sd);
-  AttributeMap attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+  uint attr_offset = object_attribute_map_offset(kg, object);
+  attr_offset += attribute_primitive_type(kg, prim, type);
+  AttributeMap attr_map = kernel_data_fetch(attributes_map, attr_offset);
 
   while (attr_map.id != id) {
     if (UNLIKELY(attr_map.id == ATTR_STD_NONE)) {
@@ -71,13 +70,13 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals kg,
     else {
       attr_offset += ATTR_PRIM_TYPES;
     }
-    attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
+    attr_map = kernel_data_fetch(attributes_map, attr_offset);
   }
 
   AttributeDescriptor desc;
   desc.element = (AttributeElement)attr_map.element;
 
-  if (sd->prim == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH &&
+  if (prim == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH &&
       desc.element != ATTR_ELEMENT_VOXEL && desc.element != ATTR_ELEMENT_OBJECT) {
     return attribute_not_found();
   }
@@ -91,17 +90,22 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals kg,
   return desc;
 }
 
+ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals kg,
+                                                     ccl_private const ShaderData *sd,
+                                                     uint64_t id)
+{
+  return find_attribute(kg, sd->object, sd->prim, sd->type, id);
+}
+
 /* Transform matrix attribute on meshes */
 
-ccl_device Transform primitive_attribute_matrix(KernelGlobals kg,
-                                                ccl_private const ShaderData *sd,
-                                                const AttributeDescriptor desc)
+ccl_device Transform primitive_attribute_matrix(KernelGlobals kg, const AttributeDescriptor desc)
 {
   Transform tfm;
 
-  tfm.x = kernel_tex_fetch(__attributes_float4, desc.offset + 0);
-  tfm.y = kernel_tex_fetch(__attributes_float4, desc.offset + 1);
-  tfm.z = kernel_tex_fetch(__attributes_float4, desc.offset + 2);
+  tfm.x = kernel_data_fetch(attributes_float4, desc.offset + 0);
+  tfm.y = kernel_data_fetch(attributes_float4, desc.offset + 1);
+  tfm.z = kernel_data_fetch(attributes_float4, desc.offset + 2);
 
   return tfm;
 }
diff --git a/intern/cycles/kernel/geom/curve.h b/intern/cycles/kernel/geom/curve.h
index 4dbc6d4f6db..e243adfde21 100644
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@@ -23,12 +23,12 @@ ccl_device float curve_attribute_float(KernelGlobals kg,
                                        ccl_private float *dy)
 {
   if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
-    KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
+    KernelCurve curve = kernel_data_fetch(curves, sd->prim);
     int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
     int k1 = k0 + 1;
 
-    float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
-    float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);
+    float f0 = kernel_data_fetch(attributes_float, desc.offset + k0);
+    float f1 = kernel_data_fetch(attributes_float, desc.offset + k1);
 
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -50,7 +50,7 @@ ccl_device float curve_attribute_float(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                 desc.offset;
-      return kernel_tex_fetch(__attributes_float, offset);
+      return kernel_data_fetch(attributes_float, offset);
     }
     else {
       return 0.0f;
@@ -65,12 +65,12 @@ ccl_device float2 curve_attribute_float2(KernelGlobals kg,
                                          ccl_private float2 *dy)
 {
   if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
-    KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
+    KernelCurve curve = kernel_data_fetch(curves, sd->prim);
     int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
     int k1 = k0 + 1;
 
-    float2 f0 = kernel_tex_fetch(__attributes_float2, desc.offset + k0);
-    float2 f1 = kernel_tex_fetch(__attributes_float2, desc.offset + k1);
+    float2 f0 = kernel_data_fetch(attributes_float2, desc.offset + k0);
+    float2 f1 = kernel_data_fetch(attributes_float2, desc.offset + k1);
 
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -96,7 +96,7 @@ ccl_device float2 curve_attribute_float2(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                 desc.offset;
-      return kernel_tex_fetch(__attributes_float2, offset);
+      return kernel_data_fetch(attributes_float2, offset);
     }
     else {
       return make_float2(0.0f, 0.0f);
@@ -111,12 +111,12 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
                                          ccl_private float3 *dy)
 {
   if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
-    KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
+    KernelCurve curve = kernel_data_fetch(curves, sd->prim);
     int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
     int k1 = k0 + 1;
 
-    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
-    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+    float3 f0 = kernel_data_fetch(attributes_float3, desc.offset + k0);
+    float3 f1 = kernel_data_fetch(attributes_float3, desc.offset + k1);
 
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -138,7 +138,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                 desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_data_fetch(attributes_float3, offset);
     }
     else {
       return make_float3(0.0f, 0.0f, 0.0f);
@@ -153,12 +153,12 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
                                          ccl_private float4 *dy)
 {
   if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
-    KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
+    KernelCurve curve = kernel_data_fetch(curves, sd->prim);
     int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
     int k1 = k0 + 1;
 
-    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + k0);
-    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + k1);
+    float4 f0 = kernel_data_fetch(attributes_float4, desc.offset + k0);
+    float4 f1 = kernel_data_fetch(attributes_float4, desc.offset + k1);
 
 #  ifdef __RAY_DIFFERENTIALS__
     if (dx)
@@ -180,7 +180,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                 desc.offset;
-      return kernel_tex_fetch(__attributes_float4, offset);
+      return kernel_data_fetch(attributes_float4, offset);
     }
     else {
       return zero_float4();
@@ -195,15 +195,15 @@ ccl_device float curve_thickness(KernelGlobals kg, ccl_private const ShaderData
   float r = 0.0f;
 
   if (sd->type & PRIMITIVE_CURVE) {
-    KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
+    KernelCurve curve = kernel_data_fetch(curves, sd->prim);
     int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
     int k1 = k0 + 1;
 
     float4 P_curve[2];
 
     if (!(sd->type & PRIMITIVE_MOTION)) {
-      P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-      P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+      P_curve[0] = kernel_data_fetch(curve_keys, k0);
+      P_curve[1] = kernel_data_fetch(curve_keys, k1);
     }
     else {
       motion_curve_keys_linear(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
@@ -232,14 +232,14 @@ ccl_device float curve_random(KernelGlobals kg, ccl_private const ShaderData *sd
 
 ccl_device float3 curve_motion_center_location(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
-  KernelCurve curve = kernel_tex_fetch(__curves, sd->prim);
+  KernelCurve curve = kernel_data_fetch(curves, sd->prim); /* Curve record for sd->prim. */
   int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
   int k1 = k0 + 1;
 
   float4 P_curve[2];
 
-  P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-  P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+  P_curve[0] = kernel_data_fetch(curve_keys, k0); /* Weighted by (1 - sd->u) below. */
+  P_curve[1] = kernel_data_fetch(curve_keys, k1); /* Weighted by sd->u below. */
 
   return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
 }
diff --git a/intern/cycles/kernel/geom/curve_intersect.h b/intern/cycles/kernel/geom/curve_intersect.h
index e1a1f9c02c5..97644aacaa8 100644
--- a/intern/cycles/kernel/geom/curve_intersect.h
+++ b/intern/cycles/kernel/geom/curve_intersect.h
@@ -72,7 +72,7 @@ ccl_device_inline float sqr_point_to_line_distance(const float3 PmQ0, const floa
 ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
                                           const float3 cylinder_end,
                                           const float cylinder_radius,
-                                          const float3 ray_dir,
+                                          const float3 ray_D,
                                           ccl_private float2 *t_o,
                                           ccl_private float *u0_o,
                                           ccl_private float3 *Ng0_o,
@@ -82,7 +82,7 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
   /* Calculate quadratic equation to solve. */
   const float rl = 1.0f / len(cylinder_end - cylinder_start);
   const float3 P0 = cylinder_start, dP = (cylinder_end - cylinder_start) * rl;
-  const float3 O = -P0, dO = ray_dir;
+  const float3 O = -P0, dO = ray_D;
 
   const float dOdO = dot(dO, dO);
   const float OdO = dot(dO, O);
@@ -123,7 +123,7 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
   /* Calculates u and Ng for near hit. */
   {
     *u0_o = (t0 * dOz + Oz) * rl;
-    const float3 Pr = t0 * ray_dir;
+    const float3 Pr = t0 * ray_D;
     const float3 Pl = (*u0_o) * (cylinder_end - cylinder_start) + cylinder_start;
     *Ng0_o = Pr - Pl;
   }
@@ -131,7 +131,7 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
   /* Calculates u and Ng for far hit. */
   {
     *u1_o = (t1 * dOz + Oz) * rl;
-    const float3 Pr = t1 * ray_dir;
+    const float3 Pr = t1 * ray_D;
     const float3 Pl = (*u1_o) * (cylinder_end - cylinder_start) + cylinder_start;
     *Ng1_o = Pr - Pl;
   }
@@ -141,10 +141,10 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
   return true;
 }
 
-ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, const float3 ray_dir)
+ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, const float3 ray_D)
 {
   const float3 O = -P;
-  const float3 D = ray_dir;
+  const float3 D = ray_D;
   const float ON = dot(O, N);
   const float DN = dot(D, N);
   const float min_rcp_input = 1e-18f;
@@ -155,8 +155,9 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co
   return make_float2(lower, upper);
 }
 
-ccl_device bool curve_intersect_iterative(const float3 ray_dir,
-                                          ccl_private float *ray_tfar,
+ccl_device bool curve_intersect_iterative(const float3 ray_D,
+                                          const float ray_tmin,
+                                          ccl_private float *ray_tmax,
                                           const float dt,
                                           const float4 curve[4],
                                           float u,
@@ -164,7 +165,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
                                           const bool use_backfacing,
                                           ccl_private Intersection *isect)
 {
-  const float length_ray_dir = len(ray_dir);
+  const float length_ray_D = len(ray_D);
 
   /* Error of curve evaluations is proportional to largest coordinate. */
   const float4 box_min = min(min(curve[0], curve[1]), min(curve[2], curve[3]));
@@ -175,9 +176,9 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
   const float radius_max = box_max.w;
 
   for (int i = 0; i < CURVE_NUM_JACOBIAN_ITERATIONS; i++) {
-    const float3 Q = ray_dir * t;
-    const float3 dQdt = ray_dir;
-    const float Q_err = 16.0f * FLT_EPSILON * length_ray_dir * t;
+    const float3 Q = ray_D * t;
+    const float3 dQdt = ray_D;
+    const float Q_err = 16.0f * FLT_EPSILON * length_ray_D * t;
 
     const float4 P4 = catmull_rom_basis_eval(curve, u);
     const float4 dPdu4 = catmull_rom_basis_derivative(curve, u);
@@ -220,7 +221,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
 
     if (fabsf(f) < f_err && fabsf(g) < g_err) {
       t += dt;
-      if (!(0.0f <= t && t <= *ray_tfar)) {
+      if (!(t >= ray_tmin && t <= *ray_tmax)) {
         return false; /* Rejects NaNs */
       }
       if (!(u >= 0.0f && u <= 1.0f)) {
@@ -232,12 +233,12 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
       const float3 U = dradiusdu * R + dPdu;
       const float3 V = cross(dPdu, R);
       const float3 Ng = cross(V, U);
-      if (!use_backfacing && dot(ray_dir, Ng) > 0.0f) {
+      if (!use_backfacing && dot(ray_D, Ng) > 0.0f) {
         return false;
       }
 
       /* Record intersection. */
-      *ray_tfar = t;
+      *ray_tmax = t;
       isect->t = t;
       isect->u = u;
       isect->v = 0.0f;
@@ -248,16 +249,17 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
   return false;
 }
 
-ccl_device bool curve_intersect_recursive(const float3 ray_orig,
-                                          const float3 ray_dir,
-                                          float ray_tfar,
+ccl_device bool curve_intersect_recursive(const float3 ray_P,
+                                          const float3 ray_D,
+                                          const float ray_tmin,
+                                          float ray_tmax,
                                           float4 curve[4],
                                           ccl_private Intersection *isect)
 {
   /* Move ray closer to make intersection stable. */
   const float3 center = float4_to_float3(0.25f * (curve[0] + curve[1] + curve[2] + curve[3]));
-  const float dt = dot(center - ray_orig, ray_dir) / dot(ray_dir, ray_dir);
-  const float3 ref = ray_orig + ray_dir * dt;
+  const float dt = dot(center - ray_P, ray_D) / dot(ray_D, ray_D);
+  const float3 ref = ray_P + ray_D * dt;
   const float4 ref4 = make_float4(ref.x, ref.y, ref.z, 0.0f);
   curve[0] -= ref4;
   curve[1] -= ref4;
@@ -320,7 +322,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
       valid = cylinder_intersect(float4_to_float3(P0),
                                  float4_to_float3(P3),
                                  r_outer,
-                                 ray_dir,
+                                 ray_D,
                                  &tc_outer,
                                  &u_outer0,
                                  &Ng_outer0,
@@ -331,13 +333,12 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
       }
 
       /* Intersect with cap-planes. */
-      float2 tp = make_float2(-dt, ray_tfar - dt);
+      float2 tp = make_float2(ray_tmin - dt, ray_tmax - dt);
       tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
-      const float2 h0 = half_plane_intersect(
-          float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
+      const float2 h0 = half_plane_intersect(float4_to_float3(P0), float4_to_float3(dP0du), ray_D);
       tp = make_float2(max(tp.x, h0.x), min(tp.y, h0.y));
       const float2 h1 = half_plane_intersect(
-          float4_to_float3(P3), -float4_to_float3(dP3du), ray_dir);
+          float4_to_float3(P3), -float4_to_float3(dP3du), ray_D);
       tp = make_float2(max(tp.x, h1.x), min(tp.y, h1.y));
       valid = tp.x <= tp.y;
       if (!valid) {
@@ -357,7 +358,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
       const bool valid_inner = cylinder_intersect(float4_to_float3(P0),
                                                   float4_to_float3(P3),
                                                   r_inner,
-                                                  ray_dir,
+                                                  ray_D,
                                                   &tc_inner,
                                                   &u_inner0,
                                                   &Ng_inner0,
@@ -367,9 +368,9 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
       /* At the unstable area we subdivide deeper. */
 #  if 0
       const bool unstable0 = (!valid_inner) |
-                             (fabsf(dot(normalize(ray_dir), normalize(Ng_inner0))) < 0.3f);
+                             (fabsf(dot(normalize(ray_D), normalize(Ng_inner0))) < 0.3f);
       const bool unstable1 = (!valid_inner) |
-                             (fabsf(dot(normalize(ray_dir), normalize(Ng_inner1))) < 0.3f);
+                             (fabsf(dot(normalize(ray_D), normalize(Ng_inner1))) < 0.3f);
 #  else
       /* On the GPU appears to be a little faster if always enabled. */
       (void)valid_inner;
@@ -394,19 +395,20 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
                                           CURVE_NUM_BEZIER_SUBDIVISIONS;
         if (depth >= termDepth) {
           found |= curve_intersect_iterative(
-              ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
+              ray_D, ray_tmin, &ray_tmax, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
         }
         else {
           recurse = true;
         }
       }
 
-      if (valid1 && (tp1.x + dt <= ray_tfar)) {
+      const float t1 = tp1.x + dt;
+      if (valid1 && (t1 >= ray_tmin && t1 <= ray_tmax)) {
         const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
                                           CURVE_NUM_BEZIER_SUBDIVISIONS;
         if (depth >= termDepth) {
           found |= curve_intersect_iterative(
-              ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+              ray_D, ray_tmin, &ray_tmax, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
         }
         else {
           recurse = true;
@@ -456,7 +458,8 @@ ccl_device_inline bool cylinder_culling_test(const float2 p1, const float2 p2, c
  * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two
  * triangles gets intersected.
  */
-ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar,
+ccl_device_inline bool ribbon_intersect_quad(const float ray_tmin,
+                                             const float ray_tmax,
                                              const float3 quad_v0,
                                              const float3 quad_v1,
                                              const float3 quad_v2,
@@ -497,7 +500,7 @@ ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar,
 
   /* Perform depth test? */
   const float t = rcpDen * dot(v0, Ng);
-  if (!(0.0f <= t && t <= ray_tfar)) {
+  if (!(t >= ray_tmin && t <= ray_tmax)) {
     return false;
   }
 
@@ -515,13 +518,16 @@ ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar,
   return true;
 }
 
-ccl_device_inline void ribbon_ray_space(const float3 ray_dir, float3 ray_space[3])
+ccl_device_inline void ribbon_ray_space(const float3 ray_D,
+                                        const float ray_D_invlen, /* ribbon_intersect passes 1 / len(ray_D). */
+                                        float3 ray_space[3])
 {
-  const float3 dx0 = make_float3(0, ray_dir.z, -ray_dir.y);
-  const float3 dx1 = make_float3(-ray_dir.z, 0, ray_dir.x);
+  const float3 D = ray_D * ray_D_invlen; /* ray_D rescaled by the supplied inverse length. */
+  const float3 dx0 = make_float3(0, D.z, -D.y); /* Candidate axis: dot(dx0, D) == 0. */
+  const float3 dx1 = make_float3(-D.z, 0, D.x); /* Candidate axis: dot(dx1, D) == 0. */
   ray_space[0] = normalize(dot(dx0, dx0) > dot(dx1, dx1) ? dx0 : dx1);
-  ray_space[1] = normalize(cross(ray_dir, ray_space[0]));
-  ray_space[2] = ray_dir;
+  ray_space[1] = normalize(cross(D, ray_space[0])); /* Perpendicular to D and ray_space[0]. */
+  ray_space[2] = D * ray_D_invlen; /* = ray_D * ray_D_invlen^2 (not unit length); presumably keeps hit t in ray_D units -- confirm. */
 }
 
 ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
@@ -533,15 +539,17 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
 }
 
 ccl_device_inline bool ribbon_intersect(const float3 ray_org,
-                                        const float3 ray_dir,
-                                        float ray_tfar,
+                                        const float3 ray_D,
+                                        const float ray_tmin,
+                                        float ray_tmax,
                                         const int N,
                                         float4 curve[4],
                                         ccl_private Intersection *isect)
 {
   /* Transform control points into ray space. */
+  const float ray_D_invlen = 1.0f / len(ray_D);
   float3 ray_space[3];
-  ribbon_ray_space(ray_dir, ray_space);
+  ribbon_ray_space(ray_D, ray_D_invlen, ray_space);
 
   curve[0] = ribbon_to_ray_space(ray_space, ray_org, curve[0]);
   curve[1] = ribbon_to_ray_space(ray_space, ray_org, curve[1]);
@@ -555,7 +563,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
   /* Evaluate first point and radius scaled normal direction. */
   float4 p0 = catmull_rom_basis_eval(curve, 0.0f);
   float3 dp0dt = float4_to_float3(catmull_rom_basis_derivative(curve, 0.0f));
-  if (max3(fabs(dp0dt)) < eps) {
+  if (reduce_max(fabs(dp0dt)) < eps) {
     const float4 p1 = catmull_rom_basis_eval(curve, step_size);
     dp0dt = float4_to_float3(p1 - p0);
   }
@@ -570,7 +578,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
 
     /* Evaluate next point. */
     float3 dp1dt = float4_to_float3(catmull_rom_basis_derivative(curve, u + step_size));
-    dp1dt = (max3(fabs(dp1dt)) < eps) ? float4_to_float3(p1 - p0) : dp1dt;
+    dp1dt = (reduce_max(fabs(dp1dt)) < eps) ? float4_to_float3(p1 - p0) : dp1dt;
     const float3 wn1 = normalize(make_float3(dp1dt.y, -dp1dt.x, 0.0f)) * p1.w;
 
     if (valid) {
@@ -582,21 +590,21 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
 
       /* Intersect quad. */
       float vu, vv, vt;
-      bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt);
+      bool valid0 = ribbon_intersect_quad(ray_tmin, ray_tmax, lp0, lp1, up1, up0, &vu, &vv, &vt);
 
       if (valid0) {
         /* ignore self intersections */
         const float avoidance_factor = 2.0f;
         if (avoidance_factor != 0.0f) {
           float r = mix(p0.w, p1.w, vu);
-          valid0 = vt > avoidance_factor * r;
+          valid0 = vt > avoidance_factor * r * ray_D_invlen;
         }
 
         if (valid0) {
           vv = 2.0f * vv - 1.0f;
 
           /* Record intersection. */
-          ray_tfar = vt;
+          ray_tmax = vt;
           isect->t = vt;
           isect->u = u + vu * step_size;
           isect->v = vv;
@@ -614,8 +622,9 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
 
 ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
                                             ccl_private Intersection *isect,
-                                            const float3 P,
-                                            const float3 dir,
+                                            const float3 ray_P,
+                                            const float3 ray_D,
+                                            const float tmin,
                                             const float tmax,
                                             int object,
                                             int prim,
@@ -624,7 +633,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
 {
   const bool is_motion = (type & PRIMITIVE_MOTION);
 
-  KernelCurve kcurve = kernel_tex_fetch(__curves, prim);
+  KernelCurve kcurve = kernel_data_fetch(curves, prim);
 
   int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(type);
   int k1 = k0 + 1;
@@ -633,10 +642,10 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
 
   float4 curve[4];
   if (!is_motion) {
-    curve[0] = kernel_tex_fetch(__curve_keys, ka);
-    curve[1] = kernel_tex_fetch(__curve_keys, k0);
-    curve[2] = kernel_tex_fetch(__curve_keys, k1);
-    curve[3] = kernel_tex_fetch(__curve_keys, kb);
+    curve[0] = kernel_data_fetch(curve_keys, ka);
+    curve[1] = kernel_data_fetch(curve_keys, k0);
+    curve[2] = kernel_data_fetch(curve_keys, k1);
+    curve[3] = kernel_data_fetch(curve_keys, kb);
   }
   else {
     motion_curve_keys(kg, object, prim, time, ka, k0, k1, kb, curve);
@@ -645,7 +654,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
   if (type & PRIMITIVE_CURVE_RIBBON) {
     /* todo: adaptive number of subdivisions could help performance here. */
     const int subdivisions = kernel_data.bvh.curve_subdivisions;
-    if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
+    if (ribbon_intersect(ray_P, ray_D, tmin, tmax, subdivisions, curve, isect)) {
       isect->prim = prim;
       isect->object = object;
       isect->type = type;
@@ -655,7 +664,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
     return false;
   }
   else {
-    if (curve_intersect_recursive(P, dir, tmax, curve, isect)) {
+    if (curve_intersect_recursive(ray_P, ray_D, tmin, tmax, curve, isect)) {
       isect->prim = prim;
       isect->object = object;
       isect->type = type;
@@ -682,7 +691,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
     D = safe_normalize_len(D, &t);
   }
 
-  KernelCurve kcurve = kernel_tex_fetch(__curves, isect_prim);
+  KernelCurve kcurve = kernel_data_fetch(curves, isect_prim);
 
   int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
   int k1 = k0 + 1;
@@ -692,10 +701,10 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
   float4 P_curve[4];
 
   if (!(sd->type & PRIMITIVE_MOTION)) {
-    P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-    P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-    P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-    P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+    P_curve[0] = kernel_data_fetch(curve_keys, ka);
+    P_curve[1] = kernel_data_fetch(curve_keys, k0);
+    P_curve[2] = kernel_data_fetch(curve_keys, k1);
+    P_curve[3] = kernel_data_fetch(curve_keys, kb);
   }
   else {
     motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
@@ -729,7 +738,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
     /* NOTE: It is possible that P will be the same as P_inside (precision issues, or very small
      * radius). In this case use the view direction to approximate the normal. */
     const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
-    const float3 N = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;
+    const float3 N = (!isequal(P, P_inside)) ? normalize(P - P_inside) : -sd->I;
 
     sd->N = N;
     sd->v = 0.0f;
@@ -750,7 +759,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
   sd->P = P;
   sd->Ng = (sd->type & PRIMITIVE_CURVE_RIBBON) ? sd->I : sd->N;
   sd->dPdv = cross(sd->dPdu, sd->Ng);
-  sd->shader = kernel_tex_fetch(__curves, sd->prim).shader_id;
+  sd->shader = kernel_data_fetch(curves, sd->prim).shader_id;
 }
 
 #endif
diff --git a/intern/cycles/kernel/geom/motion_curve.h b/intern/cycles/kernel/geom/motion_curve.h
index b5289b6dda1..448e4b95e0b 100644
--- a/intern/cycles/kernel/geom/motion_curve.h
+++ b/intern/cycles/kernel/geom/motion_curve.h
@@ -27,8 +27,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg,
 {
   if (step == numsteps) {
     /* center step: regular key location */
-    keys[0] = kernel_tex_fetch(__curve_keys, k0);
-    keys[1] = kernel_tex_fetch(__curve_keys, k1);
+    keys[0] = kernel_data_fetch(curve_keys, k0);
+    keys[1] = kernel_data_fetch(curve_keys, k1);
   }
   else {
     /* center step is not stored in this array */
@@ -37,8 +37,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg,
 
     offset += step * numkeys;
 
-    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
+    keys[0] = kernel_data_fetch(attributes_float4, offset + k0);
+    keys[1] = kernel_data_fetch(attributes_float4, offset + k1);
   }
 }
 
@@ -83,10 +83,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg,
 {
   if (step == numsteps) {
     /* center step: regular key location */
-    keys[0] = kernel_tex_fetch(__curve_keys, k0);
-    keys[1] = kernel_tex_fetch(__curve_keys, k1);
-    keys[2] = kernel_tex_fetch(__curve_keys, k2);
-    keys[3] = kernel_tex_fetch(__curve_keys, k3);
+    keys[0] = kernel_data_fetch(curve_keys, k0);
+    keys[1] = kernel_data_fetch(curve_keys, k1);
+    keys[2] = kernel_data_fetch(curve_keys, k2);
+    keys[3] = kernel_data_fetch(curve_keys, k3);
   }
   else {
     /* center step is not stored in this array */
@@ -95,10 +95,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg,
 
     offset += step * numkeys;
 
-    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
-    keys[2] = kernel_tex_fetch(__attributes_float4, offset + k2);
-    keys[3] = kernel_tex_fetch(__attributes_float4, offset + k3);
+    keys[0] = kernel_data_fetch(attributes_float4, offset + k0);
+    keys[1] = kernel_data_fetch(attributes_float4, offset + k1);
+    keys[2] = kernel_data_fetch(attributes_float4, offset + k2);
+    keys[3] = kernel_data_fetch(attributes_float4, offset + k3);
   }
 }
 
diff --git a/intern/cycles/kernel/geom/motion_point.h b/intern/cycles/kernel/geom/motion_point.h
index c1952ab090a..4916ae702ff 100644
--- a/intern/cycles/kernel/geom/motion_point.h
+++ b/intern/cycles/kernel/geom/motion_point.h
@@ -19,7 +19,7 @@ motion_point_for_step(KernelGlobals kg, int offset, int numkeys, int numsteps, i
 {
   if (step == numsteps) {
     /* center step: regular key location */
-    return kernel_tex_fetch(__points, prim);
+    return kernel_data_fetch(points, prim);
   }
   else {
     /* center step is not stored in this array */
@@ -28,7 +28,7 @@ motion_point_for_step(KernelGlobals kg, int offset, int numkeys, int numsteps, i
 
     offset += step * numkeys;
 
-    return kernel_tex_fetch(__attributes_float4, offset + prim);
+    return kernel_data_fetch(attributes_float4, offset + prim);
   }
 }
 
diff --git a/intern/cycles/kernel/geom/motion_triangle.h b/intern/cycles/kernel/geom/motion_triangle.h
index a87eb11f4f4..06308071700 100644
--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@@ -30,9 +30,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 {
   if (step == numsteps) {
     /* center step: regular vertex location */
-    verts[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
-    verts[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
-    verts[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+    verts[0] = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
+    verts[1] = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
+    verts[2] = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
   }
   else {
     /* center step not store in this array */
@@ -41,9 +41,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 
     offset += step * numverts;
 
-    verts[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
-    verts[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
-    verts[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
+    verts[0] = kernel_data_fetch(attributes_float3, offset + tri_vindex.x);
+    verts[1] = kernel_data_fetch(attributes_float3, offset + tri_vindex.y);
+    verts[2] = kernel_data_fetch(attributes_float3, offset + tri_vindex.z);
   }
 }
 
@@ -57,9 +57,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
 {
   if (step == numsteps) {
     /* center step: regular vertex location */
-    normals[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
-    normals[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
-    normals[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
+    normals[0] = kernel_data_fetch(tri_vnormal, tri_vindex.x);
+    normals[1] = kernel_data_fetch(tri_vnormal, tri_vindex.y);
+    normals[2] = kernel_data_fetch(tri_vnormal, tri_vindex.z);
   }
   else {
     /* center step is not stored in this array */
@@ -68,9 +68,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
 
     offset += step * numverts;
 
-    normals[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
-    normals[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
-    normals[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
+    normals[0] = kernel_data_fetch(attributes_float3, offset + tri_vindex.x);
+    normals[1] = kernel_data_fetch(attributes_float3, offset + tri_vindex.y);
+    normals[2] = kernel_data_fetch(attributes_float3, offset + tri_vindex.z);
   }
 }
 
@@ -92,7 +92,7 @@ ccl_device_inline void motion_triangle_vertices(
 
   /* fetch vertex coordinates */
   float3 next_verts[3];
-  uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
+  uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
 
   motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
   motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
@@ -121,7 +121,7 @@ ccl_device_inline void motion_triangle_vertices_and_normals(
 
   /* Fetch vertex coordinates. */
   float3 next_verts[3];
-  uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
+  uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
 
   motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
   motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
@@ -167,7 +167,7 @@ ccl_device_inline float3 motion_triangle_smooth_normal(
 
   /* fetch normals */
   float3 normals[3], next_normals[3];
-  uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
+  uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
 
   motion_triangle_normals_for_step(kg, tri_vindex, offset, numverts, numsteps, step, normals);
   motion_triangle_normals_for_step(
diff --git a/intern/cycles/kernel/geom/motion_triangle_intersect.h b/intern/cycles/kernel/geom/motion_triangle_intersect.h
index fb951fa151d..b30ee7258dc 100644
--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -27,8 +27,8 @@ ccl_device_inline float3 motion_triangle_point_from_uv(KernelGlobals kg,
                                                        const float v,
                                                        float3 verts[3])
 {
-  float w = 1.0f - u - v;
-  float3 P = u * verts[0] + v * verts[1] + w * verts[2];
+  /* This appears to give slightly better precision than interpolating with w = (1 - u - v). */
+  float3 P = verts[0] + u * (verts[1] - verts[0]) + v * (verts[2] - verts[0]);
 
   if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
     const Transform tfm = object_get_transform(kg, sd);
@@ -46,6 +46,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
                                                  ccl_private Intersection *isect,
                                                  float3 P,
                                                  float3 dir,
+                                                 float tmin,
                                                  float tmax,
                                                  float time,
                                                  uint visibility,
@@ -58,12 +59,12 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
   motion_triangle_vertices(kg, object, prim, time, verts);
   /* Ray-triangle intersection, unoptimized. */
   float t, u, v;
-  if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
+  if (ray_triangle_intersect(P, dir, tmin, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
     /* Visibility flag test. we do it here under the assumption
      * that most triangles are culled by node flags.
      */
-    if (kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
+    if (kernel_data_fetch(prim_visibility, prim_addr) & visibility)
 #endif
     {
       isect->t = t;
@@ -92,6 +93,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
                                                        int object,
                                                        int prim,
                                                        int prim_addr,
+                                                       float tmin,
                                                        float tmax,
                                                        ccl_private uint *lcg_state,
                                                        int max_hits)
@@ -101,7 +103,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
   motion_triangle_vertices(kg, object, prim, time, verts);
   /* Ray-triangle intersection, unoptimized. */
   float t, u, v;
-  if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
+  if (!ray_triangle_intersect(P, dir, tmin, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
     return false;
   }
 
diff --git a/intern/cycles/kernel/geom/motion_triangle_shader.h b/intern/cycles/kernel/geom/motion_triangle_shader.h
index 2b2bb858816..413a61b380a 100644
--- a/intern/cycles/kernel/geom/motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/motion_triangle_shader.h
@@ -31,7 +31,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals kg,
                                                       bool is_local)
 {
   /* Get shader. */
-  sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+  sd->shader = kernel_data_fetch(tri_shader, sd->prim);
   /* Get motion info. */
   /* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
    * can we de-duplicate something here?
@@ -47,7 +47,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals kg,
   kernel_assert(offset != ATTR_STD_NOT_FOUND);
   /* Fetch vertex coordinates. */
   float3 verts[3], next_verts[3];
-  uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+  uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
   motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
   motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step + 1, next_verts);
   /* Interpolate between steps. */
@@ -68,8 +68,8 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals kg,
   sd->N = Ng;
   /* Compute derivatives of P w.r.t. uv. */
 #ifdef __DPDU__
-  sd->dPdu = (verts[0] - verts[2]);
-  sd->dPdv = (verts[1] - verts[2]);
+  sd->dPdu = (verts[1] - verts[0]);
+  sd->dPdv = (verts[2] - verts[0]);
 #endif
   /* Compute smooth normal. */
   if (sd->shader & SHADER_SMOOTH_NORMAL) {
@@ -89,7 +89,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals kg,
     float u = sd->u;
     float v = sd->v;
     float w = 1.0f - u - v;
-    sd->N = (u * normals[0] + v * normals[1] + w * normals[2]);
+    sd->N = (w * normals[0] + u * normals[1] + v * normals[2]);
   }
 }
 
diff --git a/intern/cycles/kernel/geom/object.h b/intern/cycles/kernel/geom/object.h
index 3faab7fa905..14ceb636e2e 100644
--- a/intern/cycles/kernel/geom/object.h
+++ b/intern/cycles/kernel/geom/object.h
@@ -31,10 +31,10 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals kg,
                                                    enum ObjectTransform type)
 {
   if (type == OBJECT_INVERSE_TRANSFORM) {
-    return kernel_tex_fetch(__objects, object).itfm;
+    return kernel_data_fetch(objects, object).itfm;
   }
   else {
-    return kernel_tex_fetch(__objects, object).tfm;
+    return kernel_data_fetch(objects, object).tfm;
   }
 }
 
@@ -43,10 +43,10 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals kg,
 ccl_device_inline Transform lamp_fetch_transform(KernelGlobals kg, int lamp, bool inverse)
 {
   if (inverse) {
-    return kernel_tex_fetch(__lights, lamp).itfm;
+    return kernel_data_fetch(lights, lamp).itfm;
   }
   else {
-    return kernel_tex_fetch(__lights, lamp).tfm;
+    return kernel_data_fetch(lights, lamp).tfm;
   }
 }
 
@@ -57,7 +57,7 @@ ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals kg,
                                                                enum ObjectVectorTransform type)
 {
   int offset = object * OBJECT_MOTION_PASS_SIZE + (int)type;
-  return kernel_tex_fetch(__object_motion_pass, offset);
+  return kernel_data_fetch(object_motion_pass, offset);
 }
 
 /* Motion blurred object transformations */
@@ -65,9 +65,9 @@ ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals kg,
 #ifdef __OBJECT_MOTION__
 ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals kg, int object, float time)
 {
-  const uint motion_offset = kernel_tex_fetch(__objects, object).motion_offset;
-  ccl_global const DecomposedTransform *motion = &kernel_tex_fetch(__object_motion, motion_offset);
-  const uint num_steps = kernel_tex_fetch(__objects, object).numsteps * 2 + 1;
+  const uint motion_offset = kernel_data_fetch(objects, object).motion_offset;
+  ccl_global const DecomposedTransform *motion = &kernel_data_fetch(object_motion, motion_offset);
+  const uint num_steps = kernel_data_fetch(objects, object).numsteps * 2 + 1;
 
   Transform tfm;
   transform_motion_array_interpolate(&tfm, motion, num_steps, time);
@@ -80,13 +80,13 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals kg,
                                                                float time,
                                                                ccl_private Transform *itfm)
 {
-  int object_flag = kernel_tex_fetch(__object_flag, object);
+  int object_flag = kernel_data_fetch(object_flag, object);
   if (object_flag & SD_OBJECT_MOTION) {
     /* if we do motion blur */
     Transform tfm = object_fetch_transform_motion(kg, object, time);
 
     if (itfm)
-      *itfm = transform_quick_inverse(tfm);
+      *itfm = transform_inverse(tfm);
 
     return tfm;
   }
@@ -259,7 +259,7 @@ ccl_device_inline float3 object_color(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return make_float3(0.0f, 0.0f, 0.0f);
 
-  ccl_global const KernelObject *kobject = &kernel_tex_fetch(__objects, object);
+  ccl_global const KernelObject *kobject = &kernel_data_fetch(objects, object);
   return make_float3(kobject->color[0], kobject->color[1], kobject->color[2]);
 }
 
@@ -270,7 +270,7 @@ ccl_device_inline float object_alpha(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return 0.0f;
 
-  return kernel_tex_fetch(__objects, object).alpha;
+  return kernel_data_fetch(objects, object).alpha;
 }
 
 /* Pass ID number of object */
@@ -280,7 +280,7 @@ ccl_device_inline float object_pass_id(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return 0.0f;
 
-  return kernel_tex_fetch(__objects, object).pass_id;
+  return kernel_data_fetch(objects, object).pass_id;
 }
 
 /* Lightgroup of lamp */
@@ -290,7 +290,7 @@ ccl_device_inline int lamp_lightgroup(KernelGlobals kg, int lamp)
   if (lamp == LAMP_NONE)
     return LIGHTGROUP_NONE;
 
-  return kernel_tex_fetch(__lights, lamp).lightgroup;
+  return kernel_data_fetch(lights, lamp).lightgroup;
 }
 
 /* Lightgroup of object */
@@ -300,7 +300,7 @@ ccl_device_inline int object_lightgroup(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return LIGHTGROUP_NONE;
 
-  return kernel_tex_fetch(__objects, object).lightgroup;
+  return kernel_data_fetch(objects, object).lightgroup;
 }
 
 /* Per lamp random number for shader variation */
@@ -310,7 +310,7 @@ ccl_device_inline float lamp_random_number(KernelGlobals kg, int lamp)
   if (lamp == LAMP_NONE)
     return 0.0f;
 
-  return kernel_tex_fetch(__lights, lamp).random;
+  return kernel_data_fetch(lights, lamp).random;
 }
 
 /* Per object random number for shader variation */
@@ -320,7 +320,7 @@ ccl_device_inline float object_random_number(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return 0.0f;
 
-  return kernel_tex_fetch(__objects, object).random_number;
+  return kernel_data_fetch(objects, object).random_number;
 }
 
 /* Particle ID from which this object was generated */
@@ -330,7 +330,7 @@ ccl_device_inline int object_particle_id(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return 0;
 
-  return kernel_tex_fetch(__objects, object).particle_index;
+  return kernel_data_fetch(objects, object).particle_index;
 }
 
 /* Generated texture coordinate on surface from where object was instanced */
@@ -340,7 +340,7 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return make_float3(0.0f, 0.0f, 0.0f);
 
-  ccl_global const KernelObject *kobject = &kernel_tex_fetch(__objects, object);
+  ccl_global const KernelObject *kobject = &kernel_data_fetch(objects, object);
   return make_float3(
       kobject->dupli_generated[0], kobject->dupli_generated[1], kobject->dupli_generated[2]);
 }
@@ -352,7 +352,7 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return make_float3(0.0f, 0.0f, 0.0f);
 
-  ccl_global const KernelObject *kobject = &kernel_tex_fetch(__objects, object);
+  ccl_global const KernelObject *kobject = &kernel_data_fetch(objects, object);
   return make_float3(kobject->dupli_uv[0], kobject->dupli_uv[1], 0.0f);
 }
 
@@ -365,13 +365,13 @@ ccl_device_inline void object_motion_info(KernelGlobals kg,
                                           ccl_private int *numkeys)
 {
   if (numkeys) {
-    *numkeys = kernel_tex_fetch(__objects, object).numkeys;
+    *numkeys = kernel_data_fetch(objects, object).numkeys;
   }
 
   if (numsteps)
-    *numsteps = kernel_tex_fetch(__objects, object).numsteps;
+    *numsteps = kernel_data_fetch(objects, object).numsteps;
   if (numverts)
-    *numverts = kernel_tex_fetch(__objects, object).numverts;
+    *numverts = kernel_data_fetch(objects, object).numverts;
 }
 
 /* Offset to an objects patch map */
@@ -381,7 +381,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return 0;
 
-  return kernel_tex_fetch(__objects, object).patch_map_offset;
+  return kernel_data_fetch(objects, object).patch_map_offset;
 }
 
 /* Volume step size */
@@ -392,7 +392,7 @@ ccl_device_inline float object_volume_density(KernelGlobals kg, int object)
     return 1.0f;
   }
 
-  return kernel_tex_fetch(__objects, object).volume_density;
+  return kernel_data_fetch(objects, object).volume_density;
 }
 
 ccl_device_inline float object_volume_step_size(KernelGlobals kg, int object)
@@ -401,14 +401,14 @@ ccl_device_inline float object_volume_step_size(KernelGlobals kg, int object)
     return kernel_data.background.volume_step_size;
   }
 
-  return kernel_tex_fetch(__object_volume_step, object);
+  return kernel_data_fetch(object_volume_step, object);
 }
 
 /* Pass ID for shader */
 
 ccl_device int shader_pass_id(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
-  return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
+  return kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).pass_id;
 }
 
 /* Cryptomatte ID */
@@ -418,7 +418,7 @@ ccl_device_inline float object_cryptomatte_id(KernelGlobals kg, int object)
   if (object == OBJECT_NONE)
     return 0.0f;
 
-  return kernel_tex_fetch(__objects, object).cryptomatte_object;
+  return kernel_data_fetch(objects, object).cryptomatte_object;
 }
 
 ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals kg, int object)
@@ -426,49 +426,49 @@ ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals kg, int object
   if (object == OBJECT_NONE)
     return 0;
 
-  return kernel_tex_fetch(__objects, object).cryptomatte_asset;
+  return kernel_data_fetch(objects, object).cryptomatte_asset;
 }
 
 /* Particle data from which object was instanced */
 
 ccl_device_inline uint particle_index(KernelGlobals kg, int particle)
 {
-  return kernel_tex_fetch(__particles, particle).index;
+  return kernel_data_fetch(particles, particle).index;
 }
 
 ccl_device float particle_age(KernelGlobals kg, int particle)
 {
-  return kernel_tex_fetch(__particles, particle).age;
+  return kernel_data_fetch(particles, particle).age;
 }
 
 ccl_device float particle_lifetime(KernelGlobals kg, int particle)
 {
-  return kernel_tex_fetch(__particles, particle).lifetime;
+  return kernel_data_fetch(particles, particle).lifetime;
 }
 
 ccl_device float particle_size(KernelGlobals kg, int particle)
 {
-  return kernel_tex_fetch(__particles, particle).size;
+  return kernel_data_fetch(particles, particle).size;
 }
 
 ccl_device float4 particle_rotation(KernelGlobals kg, int particle)
 {
-  return kernel_tex_fetch(__particles, particle).rotation;
+  return kernel_data_fetch(particles, particle).rotation;
 }
 
 ccl_device float3 particle_location(KernelGlobals kg, int particle)
 {
-  return float4_to_float3(kernel_tex_fetch(__particles, particle).location);
+  return float4_to_float3(kernel_data_fetch(particles, particle).location);
 }
 
 ccl_device float3 particle_velocity(KernelGlobals kg, int particle)
 {
-  return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity);
+  return float4_to_float3(kernel_data_fetch(particles, particle).velocity);
 }
 
 ccl_device float3 particle_angular_velocity(KernelGlobals kg, int particle)
 {
-  return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity);
+  return float4_to_float3(kernel_data_fetch(particles, particle).angular_velocity);
 }
 
 /* Object intersection in BVH */
@@ -488,127 +488,54 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline float bvh_instance_push(KernelGlobals kg,
-                                          int object,
-                                          ccl_private const Ray *ray,
-                                          ccl_private float3 *P,
-                                          ccl_private float3 *dir,
-                                          ccl_private float3 *idir)
-{
-  Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-
-  *P = transform_point(&tfm, ray->P);
-
-  float len;
-  *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
-  *idir = bvh_inverse_direction(*dir);
-
-  return len;
-}
-
-/* Transform ray to exit static object in BVH. */
-
-ccl_device_inline float bvh_instance_pop(KernelGlobals kg,
+ccl_device_inline void bvh_instance_push(KernelGlobals kg,
                                          int object,
                                          ccl_private const Ray *ray,
                                          ccl_private float3 *P,
                                          ccl_private float3 *dir,
-                                         ccl_private float3 *idir,
-                                         float t)
-{
-  if (t != FLT_MAX) {
-    Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-    t /= len(transform_direction(&tfm, ray->D));
-  }
-
-  *P = ray->P;
-  *dir = bvh_clamp_direction(ray->D);
-  *idir = bvh_inverse_direction(*dir);
-
-  return t;
-}
-
-/* Same as above, but returns scale factor to apply to multiple intersection distances */
-
-ccl_device_inline void bvh_instance_pop_factor(KernelGlobals kg,
-                                               int object,
-                                               ccl_private const Ray *ray,
-                                               ccl_private float3 *P,
-                                               ccl_private float3 *dir,
-                                               ccl_private float3 *idir,
-                                               ccl_private float *t_fac)
+                                         ccl_private float3 *idir)
 {
   Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-  *t_fac = 1.0f / len(transform_direction(&tfm, ray->D));
 
-  *P = ray->P;
-  *dir = bvh_clamp_direction(ray->D);
+  *P = transform_point(&tfm, ray->P);
+
+  *dir = bvh_clamp_direction(transform_direction(&tfm, ray->D));
   *idir = bvh_inverse_direction(*dir);
 }
 
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline float bvh_instance_motion_push(KernelGlobals kg,
-                                                 int object,
-                                                 ccl_private const Ray *ray,
-                                                 ccl_private float3 *P,
-                                                 ccl_private float3 *dir,
-                                                 ccl_private float3 *idir,
-                                                 ccl_private Transform *itfm)
-{
-  object_fetch_transform_motion_test(kg, object, ray->time, itfm);
-
-  *P = transform_point(itfm, ray->P);
-
-  float len;
-  *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
-  *idir = bvh_inverse_direction(*dir);
-
-  return len;
-}
-
-/* Transform ray to exit motion blurred object in BVH. */
-
-ccl_device_inline float bvh_instance_motion_pop(KernelGlobals kg,
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals kg,
                                                 int object,
                                                 ccl_private const Ray *ray,
                                                 ccl_private float3 *P,
                                                 ccl_private float3 *dir,
-                                                ccl_private float3 *idir,
-                                                float t,
-                                                ccl_private Transform *itfm)
+                                                ccl_private float3 *idir)
 {
-  if (t != FLT_MAX) {
-    t /= len(transform_direction(itfm, ray->D));
-  }
+  Transform tfm; /* Passed as the `itfm` output below, so it holds the inverse transform. */
+  object_fetch_transform_motion_test(kg, object, ray->time, &tfm);
 
-  *P = ray->P;
-  *dir = bvh_clamp_direction(ray->D);
-  *idir = bvh_inverse_direction(*dir);
+  *P = transform_point(&tfm, ray->P);
 
-  return t;
+  *dir = bvh_clamp_direction(transform_direction(&tfm, ray->D));
+  *idir = bvh_inverse_direction(*dir);
 }
 
-/* Same as above, but returns scale factor to apply to multiple intersection distances */
+#endif
+
+/* Transform ray to exit an object in BVH (shared by static and motion blurred objects). */
 
-ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals kg,
-                                                      int object,
-                                                      ccl_private const Ray *ray,
-                                                      ccl_private float3 *P,
-                                                      ccl_private float3 *dir,
-                                                      ccl_private float3 *idir,
-                                                      ccl_private float *t_fac,
-                                                      ccl_private Transform *itfm)
+ccl_device_inline void bvh_instance_pop(ccl_private const Ray *ray,
+                                        ccl_private float3 *P,
+                                        ccl_private float3 *dir,
+                                        ccl_private float3 *idir)
 {
-  *t_fac = 1.0f / len(transform_direction(itfm, ray->D));
   *P = ray->P;
   *dir = bvh_clamp_direction(ray->D);
   *idir = bvh_inverse_direction(*dir);
 }
 
-#endif
-
 /* TODO: This can be removed when we know if no devices will require explicit
  * address space qualifiers for this case. */
 
diff --git a/intern/cycles/kernel/geom/patch.h b/intern/cycles/kernel/geom/patch.h
index 1c63a00e30d..ec98ddf51f0 100644
--- a/intern/cycles/kernel/geom/patch.h
+++ b/intern/cycles/kernel/geom/patch.h
@@ -62,7 +62,7 @@ patch_map_find_patch(KernelGlobals kg, int object, int patch, float u, float v)
     int quadrant = patch_map_resolve_quadrant(median, &u, &v);
     kernel_assert(quadrant >= 0);
 
-    uint child = kernel_tex_fetch(__patches, node + quadrant);
+    uint child = kernel_data_fetch(patches, node + quadrant);
 
     /* is the quadrant a hole? */
     if (!(child & PATCH_MAP_NODE_IS_SET)) {
@@ -73,9 +73,9 @@ patch_map_find_patch(KernelGlobals kg, int object, int patch, float u, float v)
     uint index = child & PATCH_MAP_NODE_INDEX_MASK;
 
     if (child & PATCH_MAP_NODE_IS_LEAF) {
-      handle.array_index = kernel_tex_fetch(__patches, index + 0);
-      handle.patch_index = kernel_tex_fetch(__patches, index + 1);
-      handle.vert_index = kernel_tex_fetch(__patches, index + 2);
+      handle.array_index = kernel_data_fetch(patches, index + 0);
+      handle.patch_index = kernel_data_fetch(patches, index + 1);
+      handle.vert_index = kernel_data_fetch(patches, index + 2);
 
       return handle;
     }
@@ -189,11 +189,11 @@ ccl_device_inline int patch_eval_indices(KernelGlobals kg,
                                          int channel,
                                          int indices[PATCH_MAX_CONTROL_VERTS])
 {
-  int index_base = kernel_tex_fetch(__patches, handle->array_index + 2) + handle->vert_index;
+  int index_base = kernel_data_fetch(patches, handle->array_index + 2) + handle->vert_index;
 
   /* XXX: regular patches only */
   for (int i = 0; i < 16; i++) {
-    indices[i] = kernel_tex_fetch(__patches, index_base + i);
+    indices[i] = kernel_data_fetch(patches, index_base + i);
   }
 
   return 16;
@@ -209,7 +209,7 @@ ccl_device_inline void patch_eval_basis(KernelGlobals kg,
                                         float weights_du[PATCH_MAX_CONTROL_VERTS],
                                         float weights_dv[PATCH_MAX_CONTROL_VERTS])
 {
-  uint patch_bits = kernel_tex_fetch(__patches, handle->patch_index + 1); /* read patch param */
+  uint patch_bits = kernel_data_fetch(patches, handle->patch_index + 1); /* read patch param */
   float d_scale = 1 << patch_eval_depth(patch_bits);
 
   bool non_quad_root = (patch_bits >> 4) & 0x1;
@@ -287,7 +287,7 @@ ccl_device float patch_eval_float(KernelGlobals kg,
     *dv = 0.0f;
 
   for (int i = 0; i < num_control; i++) {
-    float v = kernel_tex_fetch(__attributes_float, offset + indices[i]);
+    float v = kernel_data_fetch(attributes_float, offset + indices[i]);
 
     val += v * weights[i];
     if (du)
@@ -324,7 +324,7 @@ ccl_device float2 patch_eval_float2(KernelGlobals kg,
     *dv = make_float2(0.0f, 0.0f);
 
   for (int i = 0; i < num_control; i++) {
-    float2 v = kernel_tex_fetch(__attributes_float2, offset + indices[i]);
+    float2 v = kernel_data_fetch(attributes_float2, offset + indices[i]);
 
     val += v * weights[i];
     if (du)
@@ -361,7 +361,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals kg,
     *dv = make_float3(0.0f, 0.0f, 0.0f);
 
   for (int i = 0; i < num_control; i++) {
-    float3 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
+    float3 v = kernel_data_fetch(attributes_float3, offset + indices[i]);
 
     val += v * weights[i];
     if (du)
@@ -398,7 +398,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals kg,
     *dv = zero_float4();
 
   for (int i = 0; i < num_control; i++) {
-    float4 v = kernel_tex_fetch(__attributes_float4, offset + indices[i]);
+    float4 v = kernel_data_fetch(attributes_float4, offset + indices[i]);
 
     val += v * weights[i];
     if (du)
@@ -436,7 +436,7 @@ ccl_device float4 patch_eval_uchar4(KernelGlobals kg,
 
   for (int i = 0; i < num_control; i++) {
     float4 v = color_srgb_to_linear_v4(
-        color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, offset + indices[i])));
+        color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, offset + indices[i])));
 
     val += v * weights[i];
     if (du)
diff --git a/intern/cycles/kernel/geom/point.h b/intern/cycles/kernel/geom/point.h
index ee7eca9e0c6..726d829c329 100644
--- a/intern/cycles/kernel/geom/point.h
+++ b/intern/cycles/kernel/geom/point.h
@@ -26,7 +26,7 @@ ccl_device float point_attribute_float(KernelGlobals kg,
 #  endif
 
   if (desc.element == ATTR_ELEMENT_VERTEX) {
-    return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
+    return kernel_data_fetch(attributes_float, desc.offset + sd->prim);
   }
   else {
     return 0.0f;
@@ -47,7 +47,7 @@ ccl_device float2 point_attribute_float2(KernelGlobals kg,
 #  endif
 
   if (desc.element == ATTR_ELEMENT_VERTEX) {
-    return kernel_tex_fetch(__attributes_float2, desc.offset + sd->prim);
+    return kernel_data_fetch(attributes_float2, desc.offset + sd->prim);
   }
   else {
     return make_float2(0.0f, 0.0f);
@@ -68,7 +68,7 @@ ccl_device float3 point_attribute_float3(KernelGlobals kg,
 #  endif
 
   if (desc.element == ATTR_ELEMENT_VERTEX) {
-    return kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim);
+    return kernel_data_fetch(attributes_float3, desc.offset + sd->prim);
   }
   else {
     return make_float3(0.0f, 0.0f, 0.0f);
@@ -89,7 +89,7 @@ ccl_device float4 point_attribute_float4(KernelGlobals kg,
 #  endif
 
   if (desc.element == ATTR_ELEMENT_VERTEX) {
-    return kernel_tex_fetch(__attributes_float4, desc.offset + sd->prim);
+    return kernel_data_fetch(attributes_float4, desc.offset + sd->prim);
   }
   else {
     return zero_float4();
@@ -104,7 +104,7 @@ ccl_device float3 point_position(KernelGlobals kg, ccl_private const ShaderData
     /* World space center. */
     float3 P = (sd->type & PRIMITIVE_MOTION) ?
                    float4_to_float3(motion_point(kg, sd->object, sd->prim, sd->time)) :
-                   float4_to_float3(kernel_tex_fetch(__points, sd->prim));
+                   float4_to_float3(kernel_data_fetch(points, sd->prim));
 
     if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
       object_position_transform(kg, sd, &P);
@@ -122,7 +122,7 @@ ccl_device float point_radius(KernelGlobals kg, ccl_private const ShaderData *sd
 {
   if (sd->type & PRIMITIVE_POINT) {
     /* World space radius. */
-    const float r = kernel_tex_fetch(__points, sd->prim).w;
+    const float r = kernel_data_fetch(points, sd->prim).w;
 
     if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
       return r;
@@ -155,7 +155,7 @@ ccl_device float point_random(KernelGlobals kg, ccl_private const ShaderData *sd
 
 ccl_device float3 point_motion_center_location(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
-  return float4_to_float3(kernel_tex_fetch(__points, sd->prim));
+  return float4_to_float3(kernel_data_fetch(points, sd->prim));
 }
 
 #endif /* __POINTCLOUD__ */
diff --git a/intern/cycles/kernel/geom/point_intersect.h b/intern/cycles/kernel/geom/point_intersect.h
index c7ae72bb488..15fb814c58d 100644
--- a/intern/cycles/kernel/geom/point_intersect.h
+++ b/intern/cycles/kernel/geom/point_intersect.h
@@ -9,17 +9,21 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __POINTCLOUD__
 
-ccl_device_forceinline bool point_intersect_test(
-    const float4 point, const float3 P, const float3 dir, const float tmax, ccl_private float *t)
+ccl_device_forceinline bool point_intersect_test(const float4 point,
+                                                 const float3 ray_P,
+                                                 const float3 ray_D,
+                                                 const float ray_tmin,
+                                                 const float ray_tmax,
+                                                 ccl_private float *t)
 {
   const float3 center = float4_to_float3(point);
   const float radius = point.w;
 
-  const float rd2 = 1.0f / dot(dir, dir);
+  const float rd2 = 1.0f / dot(ray_D, ray_D);
 
-  const float3 c0 = center - P;
-  const float projC0 = dot(c0, dir) * rd2;
-  const float3 perp = c0 - projC0 * dir;
+  const float3 c0 = center - ray_P;
+  const float projC0 = dot(c0, ray_D) * rd2;
+  const float3 perp = c0 - projC0 * ray_D;
   const float l2 = dot(perp, perp);
   const float r2 = radius * radius;
   if (!(l2 <= r2)) {
@@ -28,12 +32,12 @@ ccl_device_forceinline bool point_intersect_test(
 
   const float td = sqrt((r2 - l2) * rd2);
   const float t_front = projC0 - td;
-  const bool valid_front = (0.0f <= t_front) & (t_front <= tmax);
+  const bool valid_front = (ray_tmin <= t_front) & (t_front <= ray_tmax);
 
   /* Always back-face culling for now. */
 #  if 0
   const float t_back = projC0 + td;
-  const bool valid_back = (0.0f <= t_back) & (t_back <= tmax);
+  const bool valid_back = (ray_tmin <= t_back) & (t_back <= ray_tmax);
 
   /* check if there is a first hit */
   const bool valid_first = valid_front | valid_back;
@@ -54,18 +58,19 @@ ccl_device_forceinline bool point_intersect_test(
 
 ccl_device_forceinline bool point_intersect(KernelGlobals kg,
                                             ccl_private Intersection *isect,
-                                            const float3 P,
-                                            const float3 dir,
-                                            const float tmax,
+                                            const float3 ray_P,
+                                            const float3 ray_D,
+                                            const float ray_tmin,
+                                            const float ray_tmax,
                                             const int object,
                                             const int prim,
                                             const float time,
                                             const int type)
 {
   const float4 point = (type & PRIMITIVE_MOTION) ? motion_point(kg, object, prim, time) :
-                                                   kernel_tex_fetch(__points, prim);
+                                                   kernel_data_fetch(points, prim);
 
-  if (!point_intersect_test(point, P, dir, tmax, &isect->t)) {
+  if (!point_intersect_test(point, ray_P, ray_D, ray_tmin, ray_tmax, &isect->t)) {
     return false;
   }
 
@@ -82,7 +87,7 @@ ccl_device_inline void point_shader_setup(KernelGlobals kg,
                                           ccl_private const Intersection *isect,
                                           ccl_private const Ray *ray)
 {
-  sd->shader = kernel_tex_fetch(__points_shader, isect->prim);
+  sd->shader = kernel_data_fetch(points_shader, isect->prim);
   sd->P = ray->P + ray->D * isect->t;
 
   /* Texture coordinates, zero for now. */
@@ -94,7 +99,7 @@ ccl_device_inline void point_shader_setup(KernelGlobals kg,
   /* Compute point center for normal. */
   float3 center = float4_to_float3((isect->type & PRIMITIVE_MOTION) ?
                                        motion_point(kg, sd->object, sd->prim, sd->time) :
-                                       kernel_tex_fetch(__points, sd->prim));
+                                       kernel_data_fetch(points, sd->prim));
   if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
     object_position_transform_auto(kg, sd, &center);
   }
diff --git a/intern/cycles/kernel/geom/primitive.h b/intern/cycles/kernel/geom/primitive.h
index 9b4b61fbd84..04b04ff5985 100644
--- a/intern/cycles/kernel/geom/primitive.h
+++ b/intern/cycles/kernel/geom/primitive.h
@@ -18,14 +18,14 @@ CCL_NAMESPACE_BEGIN
  * attributes for performance, mainly for GPU performance to avoid bringing in
  * heavy volume interpolation code. */
 
-ccl_device_inline float primitive_surface_attribute_float(KernelGlobals kg,
-                                                          ccl_private const ShaderData *sd,
-                                                          const AttributeDescriptor desc,
-                                                          ccl_private float *dx,
-                                                          ccl_private float *dy)
+ccl_device_forceinline float primitive_surface_attribute_float(KernelGlobals kg,
+                                                               ccl_private const ShaderData *sd,
+                                                               const AttributeDescriptor desc,
+                                                               ccl_private float *dx,
+                                                               ccl_private float *dy)
 {
   if (sd->type & PRIMITIVE_TRIANGLE) {
-    if (subd_triangle_patch(kg, sd) == ~0)
+    if (subd_triangle_patch(kg, sd->prim) == ~0)
       return triangle_attribute_float(kg, sd, desc, dx, dy);
     else
       return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
@@ -49,14 +49,14 @@ ccl_device_inline float primitive_surface_attribute_float(KernelGlobals kg,
   }
 }
 
-ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals kg,
-                                                            ccl_private const ShaderData *sd,
-                                                            const AttributeDescriptor desc,
-                                                            ccl_private float2 *dx,
-                                                            ccl_private float2 *dy)
+ccl_device_forceinline float2 primitive_surface_attribute_float2(KernelGlobals kg,
+                                                                 ccl_private const ShaderData *sd,
+                                                                 const AttributeDescriptor desc,
+                                                                 ccl_private float2 *dx,
+                                                                 ccl_private float2 *dy)
 {
   if (sd->type & PRIMITIVE_TRIANGLE) {
-    if (subd_triangle_patch(kg, sd) == ~0)
+    if (subd_triangle_patch(kg, sd->prim) == ~0)
       return triangle_attribute_float2(kg, sd, desc, dx, dy);
     else
       return subd_triangle_attribute_float2(kg, sd, desc, dx, dy);
@@ -80,14 +80,14 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals kg,
   }
 }
 
-ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals kg,
-                                                            ccl_private const ShaderData *sd,
-                                                            const AttributeDescriptor desc,
-                                                            ccl_private float3 *dx,
-                                                            ccl_private float3 *dy)
+ccl_device_forceinline float3 primitive_surface_attribute_float3(KernelGlobals kg,
+                                                                 ccl_private const ShaderData *sd,
+                                                                 const AttributeDescriptor desc,
+                                                                 ccl_private float3 *dx,
+                                                                 ccl_private float3 *dy)
 {
   if (sd->type & PRIMITIVE_TRIANGLE) {
-    if (subd_triangle_patch(kg, sd) == ~0)
+    if (subd_triangle_patch(kg, sd->prim) == ~0)
       return triangle_attribute_float3(kg, sd, desc, dx, dy);
     else
       return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
@@ -118,7 +118,7 @@ ccl_device_forceinline float4 primitive_surface_attribute_float4(KernelGlobals k
                                                                  ccl_private float4 *dy)
 {
   if (sd->type & PRIMITIVE_TRIANGLE) {
-    if (subd_triangle_patch(kg, sd) == ~0)
+    if (subd_triangle_patch(kg, sd->prim) == ~0)
       return triangle_attribute_float4(kg, sd, desc, dx, dy);
     else
       return subd_triangle_attribute_float4(kg, sd, desc, dx, dy);
@@ -149,15 +149,15 @@ ccl_device_forceinline float4 primitive_surface_attribute_float4(KernelGlobals k
  * attributes for performance, mainly for GPU performance to avoid bringing in
  * heavy volume interpolation code. */
 
-ccl_device_inline bool primitive_is_volume_attribute(ccl_private const ShaderData *sd,
-                                                     const AttributeDescriptor desc)
+ccl_device_forceinline bool primitive_is_volume_attribute(ccl_private const ShaderData *sd,
+                                                          const AttributeDescriptor desc) /* True only when sd->type is PRIMITIVE_VOLUME; desc is not read. */
 {
   return sd->type == PRIMITIVE_VOLUME;
 }
 
-ccl_device_inline float primitive_volume_attribute_float(KernelGlobals kg,
-                                                         ccl_private const ShaderData *sd,
-                                                         const AttributeDescriptor desc)
+ccl_device_forceinline float primitive_volume_attribute_float(KernelGlobals kg,
+                                                              ccl_private const ShaderData *sd,
+                                                              const AttributeDescriptor desc)
 {
   if (primitive_is_volume_attribute(sd, desc)) {
     return volume_attribute_value_to_float(volume_attribute_float4(kg, sd, desc));
@@ -167,9 +167,9 @@ ccl_device_inline float primitive_volume_attribute_float(KernelGlobals kg,
   }
 }
 
-ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals kg,
-                                                           ccl_private const ShaderData *sd,
-                                                           const AttributeDescriptor desc)
+ccl_device_forceinline float3 primitive_volume_attribute_float3(KernelGlobals kg,
+                                                                ccl_private const ShaderData *sd,
+                                                                const AttributeDescriptor desc)
 {
   if (primitive_is_volume_attribute(sd, desc)) {
     return volume_attribute_value_to_float3(volume_attribute_float4(kg, sd, desc));
@@ -179,9 +179,9 @@ ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals kg,
   }
 }
 
-ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals kg,
-                                                           ccl_private const ShaderData *sd,
-                                                           const AttributeDescriptor desc)
+ccl_device_forceinline float4 primitive_volume_attribute_float4(KernelGlobals kg,
+                                                                ccl_private const ShaderData *sd,
+                                                                const AttributeDescriptor desc)
 {
   if (primitive_is_volume_attribute(sd, desc)) {
     return volume_attribute_float4(kg, sd, desc);
@@ -194,7 +194,7 @@ ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals kg,
 
 /* Default UV coordinate */
 
-ccl_device_inline float3 primitive_uv(KernelGlobals kg, ccl_private const ShaderData *sd)
+ccl_device_forceinline float3 primitive_uv(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
   const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV);
 
@@ -262,8 +262,8 @@ ccl_device float3 primitive_tangent(KernelGlobals kg, ccl_private ShaderData *sd
 
 /* Motion vector for motion pass */
 
-ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
-                                                 ccl_private const ShaderData *sd)
+ccl_device_forceinline float4 primitive_motion_vector(KernelGlobals kg,
+                                                      ccl_private const ShaderData *sd)
 {
   /* center position */
   float3 center;
@@ -320,7 +320,7 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
 #endif
         if (sd->type & PRIMITIVE_TRIANGLE) {
       /* Triangle */
-      if (subd_triangle_patch(kg, sd) == ~0) {
+      if (subd_triangle_patch(kg, sd->prim) == ~0) {
         motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
         desc.offset += numverts;
         motion_post = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
diff --git a/intern/cycles/kernel/geom/shader_data.h b/intern/cycles/kernel/geom/shader_data.h
index 7a439da427a..b67d19365a3 100644
--- a/intern/cycles/kernel/geom/shader_data.h
+++ b/intern/cycles/kernel/geom/shader_data.h
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include "kernel/util/differential.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* ShaderData setup from incoming ray */
@@ -18,7 +20,7 @@ ccl_device void shader_setup_object_transforms(KernelGlobals kg,
 {
   if (sd->object_flag & SD_OBJECT_MOTION) {
     sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time);
-    sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion);
+    sd->ob_itfm_motion = transform_inverse(sd->ob_tfm_motion);
   }
 }
 #endif
@@ -40,7 +42,7 @@ ccl_device_inline void shader_setup_from_ray(KernelGlobals kg,
   sd->ray_length = isect->t;
   sd->type = isect->type;
   sd->object = isect->object;
-  sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
+  sd->object_flag = kernel_data_fetch(object_flag, sd->object);
   sd->prim = isect->prim;
   sd->lamp = LAMP_NONE;
   sd->flag = 0;
@@ -73,7 +75,7 @@ ccl_device_inline void shader_setup_from_ray(KernelGlobals kg,
     if (sd->type == PRIMITIVE_TRIANGLE) {
       /* static triangle */
       float3 Ng = triangle_normal(kg, sd);
-      sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+      sd->shader = kernel_data_fetch(tri_shader, sd->prim);
 
       /* vectors */
       sd->P = triangle_point_from_uv(kg, sd, isect->object, isect->prim, isect->u, isect->v);
@@ -106,7 +108,7 @@ ccl_device_inline void shader_setup_from_ray(KernelGlobals kg,
     }
   }
 
-  sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+  sd->flag = kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).flags;
 
   /* backfacing test */
   bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
@@ -123,9 +125,9 @@ ccl_device_inline void shader_setup_from_ray(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
   /* differentials */
-  differential_transfer_compact(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, sd->ray_length);
-  differential_incoming_compact(&sd->dI, ray->D, ray->dD);
-  differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+  sd->dP = differential_transfer_compact(ray->dP, ray->D, ray->dD, sd->ray_length);
+  sd->dI = differential_incoming_compact(ray->dD);
+  differential_dudv_compact(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
 #endif
 }
 
@@ -169,10 +171,10 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals kg,
   sd->time = time;
   sd->ray_length = t;
 
-  sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+  sd->flag = kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).flags;
   sd->object_flag = 0;
   if (sd->object != OBJECT_NONE) {
-    sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
+    sd->object_flag |= kernel_data_fetch(object_flag, sd->object);
 
 #ifdef __OBJECT_MOTION__
     shader_setup_object_transforms(kg, sd, time);
@@ -240,8 +242,8 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
   /* no ray differentials here yet */
-  sd->dP = differential3_zero();
-  sd->dI = differential3_zero();
+  sd->dP = differential_zero_compact();
+  sd->dI = differential_zero_compact();
   sd->du = differential_zero();
   sd->dv = differential_zero();
 #endif
@@ -264,21 +266,20 @@ ccl_device void shader_setup_from_displace(KernelGlobals kg,
   /* force smooth shading for displacement */
   shader |= SHADER_SMOOTH_NORMAL;
 
-  shader_setup_from_sample(
-      kg,
-      sd,
-      P,
-      Ng,
-      I,
-      shader,
-      object,
-      prim,
-      u,
-      v,
-      0.0f,
-      0.5f,
-      !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
-      LAMP_NONE);
+  shader_setup_from_sample(kg,
+                           sd,
+                           P,
+                           Ng,
+                           I,
+                           shader,
+                           object,
+                           prim,
+                           u,
+                           v,
+                           0.0f,
+                           0.5f,
+                           !(kernel_data_fetch(object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
+                           LAMP_NONE);
 }
 
 /* ShaderData setup for point on curve. */
@@ -300,18 +301,18 @@ ccl_device void shader_setup_from_curve(KernelGlobals kg,
   sd->ray_length = 0.0f;
 
   /* Shader */
-  sd->shader = kernel_tex_fetch(__curves, prim).shader_id;
-  sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+  sd->shader = kernel_data_fetch(curves, prim).shader_id;
+  sd->flag = kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).flags;
 
   /* Object */
   sd->object = object;
-  sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
+  sd->object_flag = kernel_data_fetch(object_flag, sd->object);
 #ifdef __OBJECT_MOTION__
   shader_setup_object_transforms(kg, sd, sd->time);
 #endif
 
   /* Get control points. */
-  KernelCurve kcurve = kernel_tex_fetch(__curves, prim);
+  KernelCurve kcurve = kernel_data_fetch(curves, prim);
 
   int k0 = kcurve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
   int k1 = k0 + 1;
@@ -320,10 +321,10 @@ ccl_device void shader_setup_from_curve(KernelGlobals kg,
 
   float4 P_curve[4];
 
-  P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-  P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-  P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-  P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+  P_curve[0] = kernel_data_fetch(curve_keys, ka);
+  P_curve[1] = kernel_data_fetch(curve_keys, k0);
+  P_curve[2] = kernel_data_fetch(curve_keys, k1);
+  P_curve[3] = kernel_data_fetch(curve_keys, kb);
 
   /* Interpolate position and tangent. */
   sd->P = float4_to_float3(catmull_rom_basis_derivative(P_curve, sd->u));
@@ -349,8 +350,8 @@ ccl_device void shader_setup_from_curve(KernelGlobals kg,
 
   /* No ray differentials currently. */
 #ifdef __RAY_DIFFERENTIALS__
-  sd->dP = differential3_zero();
-  sd->dI = differential3_zero();
+  sd->dP = differential_zero_compact();
+  sd->dI = differential_zero_compact();
   sd->du = differential_zero();
   sd->dv = differential_zero();
 #endif
@@ -373,7 +374,7 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals kg,
   sd->Ng = -ray_D;
   sd->I = -ray_D;
   sd->shader = kernel_data.background.surface_shader;
-  sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+  sd->flag = kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).flags;
   sd->object_flag = 0;
   sd->time = ray_time;
   sd->ray_length = 0.0f;
@@ -392,8 +393,8 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
   /* differentials */
-  sd->dP = differential3_zero(); /* TODO: ray->dP */
-  differential_incoming(&sd->dI, sd->dP);
+  sd->dP = differential_zero_compact(); /* TODO: ray->dP */
+  sd->dI = differential_zero_compact();
   sd->du = differential_zero();
   sd->dv = differential_zero();
 #endif
@@ -408,7 +409,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals kg,
 {
 
   /* vectors */
-  sd->P = ray->P;
+  sd->P = ray->P + ray->D * ray->tmin;
   sd->N = -ray->D;
   sd->Ng = -ray->D;
   sd->I = -ray->D;
@@ -434,15 +435,14 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals kg,
 
 #  ifdef __RAY_DIFFERENTIALS__
   /* differentials */
-  sd->dP = differential3_zero(); /* TODO ray->dD */
-  differential_incoming(&sd->dI, sd->dP);
+  sd->dP = differential_zero_compact(); /* TODO ray->dD */
+  sd->dI = differential_zero_compact();
   sd->du = differential_zero();
   sd->dv = differential_zero();
 #  endif
 
   /* for NDC coordinates */
   sd->ray_P = ray->P;
-  sd->ray_dP = ray->dP;
 }
 #endif /* __VOLUME__ */
 
diff --git a/intern/cycles/kernel/geom/subd_triangle.h b/intern/cycles/kernel/geom/subd_triangle.h
index 24e1e454b8c..784ba377318 100644
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -13,11 +13,11 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
                                               ccl_private const ShaderData *sd,
                                               float2 uv[3])
 {
-  uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+  uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
 
-  uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x);
-  uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y);
-  uv[2] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.z);
+  uv[0] = kernel_data_fetch(tri_patch_uv, tri_vindex.x);
+  uv[1] = kernel_data_fetch(tri_patch_uv, tri_vindex.y);
+  uv[2] = kernel_data_fetch(tri_patch_uv, tri_vindex.z);
 }
 
 /* Vertex indices of patch */
@@ -26,10 +26,10 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals kg, int patch)
 {
   uint4 indices;
 
-  indices.x = kernel_tex_fetch(__patches, patch + 0);
-  indices.y = kernel_tex_fetch(__patches, patch + 1);
-  indices.z = kernel_tex_fetch(__patches, patch + 2);
-  indices.w = kernel_tex_fetch(__patches, patch + 3);
+  indices.x = kernel_data_fetch(patches, patch + 0);
+  indices.y = kernel_data_fetch(patches, patch + 1);
+  indices.z = kernel_data_fetch(patches, patch + 2);
+  indices.w = kernel_data_fetch(patches, patch + 3);
 
   return indices;
 }
@@ -38,14 +38,14 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals kg, int patch)
 
 ccl_device_inline uint subd_triangle_patch_face(KernelGlobals kg, int patch)
 {
-  return kernel_tex_fetch(__patches, patch + 4);
+  return kernel_data_fetch(patches, patch + 4); /* Entry 4 of the patch record; presumably the face index. */
 }
 
 /* Number of corners on originating face */
 
 ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals kg, int patch)
 {
-  return kernel_tex_fetch(__patches, patch + 5) & 0xffff;
+  return kernel_data_fetch(patches, patch + 5) & 0xffff; /* Corner count is the low 16 bits of entry 5. */
 }
 
 /* Indices of the four corners that are used by the patch */
@@ -54,10 +54,10 @@ ccl_device_inline void subd_triangle_patch_corners(KernelGlobals kg, int patch,
 {
   uint4 data;
 
-  data.x = kernel_tex_fetch(__patches, patch + 4);
-  data.y = kernel_tex_fetch(__patches, patch + 5);
-  data.z = kernel_tex_fetch(__patches, patch + 6);
-  data.w = kernel_tex_fetch(__patches, patch + 7);
+  data.x = kernel_data_fetch(patches, patch + 4);
+  data.y = kernel_data_fetch(patches, patch + 5);
+  data.z = kernel_data_fetch(patches, patch + 6);
+  data.w = kernel_data_fetch(patches, patch + 7);
 
   int num_corners = data.y & 0xffff;
 
@@ -87,18 +87,18 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
                                                         ccl_private float *dx,
                                                         ccl_private float *dy)
 {
-  int patch = subd_triangle_patch(kg, sd);
+  int patch = subd_triangle_patch(kg, sd->prim);
 
 #ifdef __PATCH_EVAL__
   if (desc.flags & ATTR_SUBDIVIDED) {
     float2 uv[3];
     subd_triangle_patch_uv(kg, sd, uv);
 
-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];
 
     /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];
 
     float a, dads, dadt;
     a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -141,7 +141,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
     if (dy)
       *dy = 0.0f;
 
-    return kernel_tex_fetch(__attributes_float, desc.offset + subd_triangle_patch_face(kg, patch));
+    return kernel_data_fetch(attributes_float, desc.offset + subd_triangle_patch_face(kg, patch));
   }
   else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
     float2 uv[3];
@@ -149,10 +149,10 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
 
     uint4 v = subd_triangle_patch_indices(kg, patch);
 
-    float f0 = kernel_tex_fetch(__attributes_float, desc.offset + v.x);
-    float f1 = kernel_tex_fetch(__attributes_float, desc.offset + v.y);
-    float f2 = kernel_tex_fetch(__attributes_float, desc.offset + v.z);
-    float f3 = kernel_tex_fetch(__attributes_float, desc.offset + v.w);
+    float f0 = kernel_data_fetch(attributes_float, desc.offset + v.x);
+    float f1 = kernel_data_fetch(attributes_float, desc.offset + v.y);
+    float f2 = kernel_data_fetch(attributes_float, desc.offset + v.z);
+    float f3 = kernel_data_fetch(attributes_float, desc.offset + v.w);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -165,12 +165,12 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_CORNER) {
     float2 uv[3];
@@ -179,10 +179,10 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
     int corners[4];
     subd_triangle_patch_corners(kg, patch, corners);
 
-    float f0 = kernel_tex_fetch(__attributes_float, corners[0] + desc.offset);
-    float f1 = kernel_tex_fetch(__attributes_float, corners[1] + desc.offset);
-    float f2 = kernel_tex_fetch(__attributes_float, corners[2] + desc.offset);
-    float f3 = kernel_tex_fetch(__attributes_float, corners[3] + desc.offset);
+    float f0 = kernel_data_fetch(attributes_float, corners[0] + desc.offset);
+    float f1 = kernel_data_fetch(attributes_float, corners[1] + desc.offset);
+    float f2 = kernel_data_fetch(attributes_float, corners[2] + desc.offset);
+    float f3 = kernel_data_fetch(attributes_float, corners[3] + desc.offset);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -195,12 +195,12 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
     if (dx)
@@ -208,7 +208,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
     if (dy)
       *dy = 0.0f;
 
-    return kernel_tex_fetch(__attributes_float, desc.offset);
+    return kernel_data_fetch(attributes_float, desc.offset);
   }
   else {
     if (dx)
@@ -226,18 +226,18 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
                                                           ccl_private float2 *dx,
                                                           ccl_private float2 *dy)
 {
-  int patch = subd_triangle_patch(kg, sd);
+  int patch = subd_triangle_patch(kg, sd->prim);
 
 #ifdef __PATCH_EVAL__
   if (desc.flags & ATTR_SUBDIVIDED) {
     float2 uv[3];
     subd_triangle_patch_uv(kg, sd, uv);
 
-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];
 
     /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];
 
     float2 a, dads, dadt;
 
@@ -281,8 +281,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
     if (dy)
       *dy = make_float2(0.0f, 0.0f);
 
-    return kernel_tex_fetch(__attributes_float2,
-                            desc.offset + subd_triangle_patch_face(kg, patch));
+    return kernel_data_fetch(attributes_float2, desc.offset + subd_triangle_patch_face(kg, patch));
   }
   else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
     float2 uv[3];
@@ -290,10 +289,10 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
 
     uint4 v = subd_triangle_patch_indices(kg, patch);
 
-    float2 f0 = kernel_tex_fetch(__attributes_float2, desc.offset + v.x);
-    float2 f1 = kernel_tex_fetch(__attributes_float2, desc.offset + v.y);
-    float2 f2 = kernel_tex_fetch(__attributes_float2, desc.offset + v.z);
-    float2 f3 = kernel_tex_fetch(__attributes_float2, desc.offset + v.w);
+    float2 f0 = kernel_data_fetch(attributes_float2, desc.offset + v.x);
+    float2 f1 = kernel_data_fetch(attributes_float2, desc.offset + v.y);
+    float2 f2 = kernel_data_fetch(attributes_float2, desc.offset + v.z);
+    float2 f3 = kernel_data_fetch(attributes_float2, desc.offset + v.w);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -306,12 +305,12 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_CORNER) {
     float2 uv[3];
@@ -322,10 +321,10 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
 
     float2 f0, f1, f2, f3;
 
-    f0 = kernel_tex_fetch(__attributes_float2, corners[0] + desc.offset);
-    f1 = kernel_tex_fetch(__attributes_float2, corners[1] + desc.offset);
-    f2 = kernel_tex_fetch(__attributes_float2, corners[2] + desc.offset);
-    f3 = kernel_tex_fetch(__attributes_float2, corners[3] + desc.offset);
+    f0 = kernel_data_fetch(attributes_float2, corners[0] + desc.offset);
+    f1 = kernel_data_fetch(attributes_float2, corners[1] + desc.offset);
+    f2 = kernel_data_fetch(attributes_float2, corners[2] + desc.offset);
+    f3 = kernel_data_fetch(attributes_float2, corners[3] + desc.offset);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -338,12 +337,12 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
     if (dx)
@@ -351,7 +350,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
     if (dy)
       *dy = make_float2(0.0f, 0.0f);
 
-    return kernel_tex_fetch(__attributes_float2, desc.offset);
+    return kernel_data_fetch(attributes_float2, desc.offset);
   }
   else {
     if (dx)
@@ -369,18 +368,18 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
                                                           ccl_private float3 *dx,
                                                           ccl_private float3 *dy)
 {
-  int patch = subd_triangle_patch(kg, sd);
+  int patch = subd_triangle_patch(kg, sd->prim);
 
 #ifdef __PATCH_EVAL__
   if (desc.flags & ATTR_SUBDIVIDED) {
     float2 uv[3];
     subd_triangle_patch_uv(kg, sd, uv);
 
-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];
 
     /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];
 
     float3 a, dads, dadt;
     a = patch_eval_float3(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -423,8 +422,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
     if (dy)
       *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-    return kernel_tex_fetch(__attributes_float3,
-                            desc.offset + subd_triangle_patch_face(kg, patch));
+    return kernel_data_fetch(attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch));
   }
   else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
     float2 uv[3];
@@ -432,10 +430,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
 
     uint4 v = subd_triangle_patch_indices(kg, patch);
 
-    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
-    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
-    float3 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
-    float3 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
+    float3 f0 = kernel_data_fetch(attributes_float3, desc.offset + v.x);
+    float3 f1 = kernel_data_fetch(attributes_float3, desc.offset + v.y);
+    float3 f2 = kernel_data_fetch(attributes_float3, desc.offset + v.z);
+    float3 f3 = kernel_data_fetch(attributes_float3, desc.offset + v.w);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -448,12 +446,12 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_CORNER) {
     float2 uv[3];
@@ -464,10 +462,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
 
     float3 f0, f1, f2, f3;
 
-    f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
-    f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
-    f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
-    f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
+    f0 = kernel_data_fetch(attributes_float3, corners[0] + desc.offset);
+    f1 = kernel_data_fetch(attributes_float3, corners[1] + desc.offset);
+    f2 = kernel_data_fetch(attributes_float3, corners[2] + desc.offset);
+    f3 = kernel_data_fetch(attributes_float3, corners[3] + desc.offset);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -480,12 +478,12 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
     if (dx)
@@ -493,7 +491,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
     if (dy)
       *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_data_fetch(attributes_float3, desc.offset);
   }
   else {
     if (dx)
@@ -511,18 +509,18 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
                                                           ccl_private float4 *dx,
                                                           ccl_private float4 *dy)
 {
-  int patch = subd_triangle_patch(kg, sd);
+  int patch = subd_triangle_patch(kg, sd->prim);
 
 #ifdef __PATCH_EVAL__
   if (desc.flags & ATTR_SUBDIVIDED) {
     float2 uv[3];
     subd_triangle_patch_uv(kg, sd, uv);
 
-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];
 
     /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];
 
     float4 a, dads, dadt;
     if (desc.type == NODE_ATTR_RGBA) {
@@ -570,8 +568,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
     if (dy)
       *dy = zero_float4();
 
-    return kernel_tex_fetch(__attributes_float4,
-                            desc.offset + subd_triangle_patch_face(kg, patch));
+    return kernel_data_fetch(attributes_float4, desc.offset + subd_triangle_patch_face(kg, patch));
   }
   else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
     float2 uv[3];
@@ -579,10 +576,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
 
     uint4 v = subd_triangle_patch_indices(kg, patch);
 
-    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + v.x);
-    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + v.y);
-    float4 f2 = kernel_tex_fetch(__attributes_float4, desc.offset + v.z);
-    float4 f3 = kernel_tex_fetch(__attributes_float4, desc.offset + v.w);
+    float4 f0 = kernel_data_fetch(attributes_float4, desc.offset + v.x);
+    float4 f1 = kernel_data_fetch(attributes_float4, desc.offset + v.y);
+    float4 f2 = kernel_data_fetch(attributes_float4, desc.offset + v.z);
+    float4 f3 = kernel_data_fetch(attributes_float4, desc.offset + v.w);
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
       f1 = (f1 + f0) * 0.5f;
@@ -595,12 +592,12 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
     float2 uv[3];
@@ -613,19 +610,19 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
 
     if (desc.element == ATTR_ELEMENT_CORNER_BYTE) {
       f0 = color_srgb_to_linear_v4(
-          color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[0] + desc.offset)));
+          color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, corners[0] + desc.offset)));
       f1 = color_srgb_to_linear_v4(
-          color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[1] + desc.offset)));
+          color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, corners[1] + desc.offset)));
       f2 = color_srgb_to_linear_v4(
-          color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[2] + desc.offset)));
+          color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, corners[2] + desc.offset)));
       f3 = color_srgb_to_linear_v4(
-          color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset)));
+          color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, corners[3] + desc.offset)));
     }
     else {
-      f0 = kernel_tex_fetch(__attributes_float4, corners[0] + desc.offset);
-      f1 = kernel_tex_fetch(__attributes_float4, corners[1] + desc.offset);
-      f2 = kernel_tex_fetch(__attributes_float4, corners[2] + desc.offset);
-      f3 = kernel_tex_fetch(__attributes_float4, corners[3] + desc.offset);
+      f0 = kernel_data_fetch(attributes_float4, corners[0] + desc.offset);
+      f1 = kernel_data_fetch(attributes_float4, corners[1] + desc.offset);
+      f2 = kernel_data_fetch(attributes_float4, corners[2] + desc.offset);
+      f3 = kernel_data_fetch(attributes_float4, corners[3] + desc.offset);
     }
 
     if (subd_triangle_patch_num_corners(kg, patch) != 4) {
@@ -639,12 +636,12 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
     if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif
 
-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
   }
   else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
     if (dx)
@@ -652,7 +649,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
     if (dy)
       *dy = zero_float4();
 
-    return kernel_tex_fetch(__attributes_float4, desc.offset);
+    return kernel_data_fetch(attributes_float4, desc.offset);
   }
   else {
     if (dx)
diff --git a/intern/cycles/kernel/geom/triangle.h b/intern/cycles/kernel/geom/triangle.h
index 8ac7e67ff05..6b9450d59ef 100644
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@@ -15,10 +15,10 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderData *sd)
 {
   /* load triangle vertices */
-  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-  const float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
-  const float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
-  const float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+  const float3 v0 = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
+  const float3 v1 = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
+  const float3 v2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
 
   /* return normal */
   if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
@@ -40,15 +40,15 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
                                              ccl_private int *shader)
 {
   /* load triangle vertices */
-  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
-  float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
-  float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  float3 v0 = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
+  float3 v1 = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
+  float3 v2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
   /* compute point */
-  float t = 1.0f - u - v;
-  *P = (u * v0 + v * v1 + t * v2);
+  float w = 1.0f - u - v;
+  *P = (w * v0 + u * v1 + v * v2);
   /* get object flags */
-  int object_flag = kernel_tex_fetch(__object_flag, object);
+  int object_flag = kernel_data_fetch(object_flag, object);
   /* compute normal */
   if (object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
     *Ng = normalize(cross(v2 - v0, v1 - v0));
@@ -57,17 +57,17 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
     *Ng = normalize(cross(v1 - v0, v2 - v0));
   }
   /* shader`*/
-  *shader = kernel_tex_fetch(__tri_shader, prim);
+  *shader = kernel_data_fetch(tri_shader, prim);
 }
 
 /* Triangle vertex locations */
 
 ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3])
 {
-  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
-  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
-  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  P[0] = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
 }
 
 /* Triangle vertex locations and vertex normals */
@@ -77,13 +77,13 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg,
                                                      float3 P[3],
                                                      float3 N[3])
 {
-  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
-  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
-  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
-  N[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
-  N[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
-  N[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
+  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  P[0] = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
+  N[0] = kernel_data_fetch(tri_vnormal, tri_vindex.x);
+  N[1] = kernel_data_fetch(tri_vnormal, tri_vindex.y);
+  N[2] = kernel_data_fetch(tri_vnormal, tri_vindex.z);
 }
 
 /* Interpolate smooth vertex normal from vertices */
@@ -92,12 +92,12 @@ ccl_device_inline float3
 triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
 {
   /* load triangle vertices */
-  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
-  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
-  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
+  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  float3 n0 = kernel_data_fetch(tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_data_fetch(tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_data_fetch(tri_vnormal, tri_vindex.z);
 
-  float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1);
+  float3 N = safe_normalize((1.0f - u - v) * n0 + u * n1 + v * n2);
 
   return is_zero(N) ? Ng : N;
 }
@@ -106,10 +106,10 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
     KernelGlobals kg, ccl_private const ShaderData *sd, float3 Ng, int prim, float u, float v)
 {
   /* load triangle vertices */
-  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
-  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
-  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
+  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  float3 n0 = kernel_data_fetch(tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_data_fetch(tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_data_fetch(tri_vnormal, tri_vindex.z);
 
   /* ensure that the normals are in object space */
   if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
@@ -118,7 +118,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
     object_inverse_normal_transform(kg, sd, &n2);
   }
 
-  float3 N = (1.0f - u - v) * n2 + u * n0 + v * n1;
+  float3 N = (1.0f - u - v) * n0 + u * n1 + v * n2;
 
   return is_zero(N) ? Ng : N;
 }
@@ -131,14 +131,14 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
                                        ccl_private float3 *dPdv)
 {
   /* fetch triangle vertex coordinates */
-  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  const float3 p0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
-  const float3 p1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
-  const float3 p2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  const uint4 tri_vindex = kernel_data_fetch(tri_vindex, prim);
+  const float3 p0 = kernel_data_fetch(tri_verts, tri_vindex.w + 0);
+  const float3 p1 = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
+  const float3 p2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
 
   /* compute derivatives of P w.r.t. uv */
-  *dPdu = (p0 - p2);
-  *dPdv = (p1 - p2);
+  *dPdu = (p1 - p0);
+  *dPdv = (p2 - p0);
 }
 
 /* Reading attributes on various triangle elements */
@@ -153,26 +153,26 @@ ccl_device float triangle_attribute_float(KernelGlobals kg,
     float f0, f1, f2;
 
     if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z);
+      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      f0 = kernel_data_fetch(attributes_float, desc.offset + tri_vindex.x);
+      f1 = kernel_data_fetch(attributes_float, desc.offset + tri_vindex.y);
+      f2 = kernel_data_fetch(attributes_float, desc.offset + tri_vindex.z);
     }
     else {
       const int tri = desc.offset + sd->prim * 3;
-      f0 = kernel_tex_fetch(__attributes_float, tri + 0);
-      f1 = kernel_tex_fetch(__attributes_float, tri + 1);
-      f2 = kernel_tex_fetch(__attributes_float, tri + 2);
+      f0 = kernel_data_fetch(attributes_float, tri + 0);
+      f1 = kernel_data_fetch(attributes_float, tri + 1);
+      f2 = kernel_data_fetch(attributes_float, tri + 2);
     }
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
     if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif
 
-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
   }
   else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -185,7 +185,7 @@ ccl_device float triangle_attribute_float(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float, offset);
+      return kernel_data_fetch(attributes_float, offset);
     }
     else {
       return 0.0f;
@@ -203,26 +203,26 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals kg,
     float2 f0, f1, f2;
 
     if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float2, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float2, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float2, desc.offset + tri_vindex.z);
+      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      f0 = kernel_data_fetch(attributes_float2, desc.offset + tri_vindex.x);
+      f1 = kernel_data_fetch(attributes_float2, desc.offset + tri_vindex.y);
+      f2 = kernel_data_fetch(attributes_float2, desc.offset + tri_vindex.z);
     }
     else {
       const int tri = desc.offset + sd->prim * 3;
-      f0 = kernel_tex_fetch(__attributes_float2, tri + 0);
-      f1 = kernel_tex_fetch(__attributes_float2, tri + 1);
-      f2 = kernel_tex_fetch(__attributes_float2, tri + 2);
+      f0 = kernel_data_fetch(attributes_float2, tri + 0);
+      f1 = kernel_data_fetch(attributes_float2, tri + 1);
+      f2 = kernel_data_fetch(attributes_float2, tri + 2);
     }
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
     if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif
 
-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
   }
   else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -235,7 +235,7 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float2, offset);
+      return kernel_data_fetch(attributes_float2, offset);
     }
     else {
       return make_float2(0.0f, 0.0f);
@@ -253,26 +253,26 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
     float3 f0, f1, f2;
 
     if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      f0 = kernel_data_fetch(attributes_float3, desc.offset + tri_vindex.x);
+      f1 = kernel_data_fetch(attributes_float3, desc.offset + tri_vindex.y);
+      f2 = kernel_data_fetch(attributes_float3, desc.offset + tri_vindex.z);
     }
     else {
       const int tri = desc.offset + sd->prim * 3;
-      f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
-      f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
-      f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
+      f0 = kernel_data_fetch(attributes_float3, tri + 0);
+      f1 = kernel_data_fetch(attributes_float3, tri + 1);
+      f2 = kernel_data_fetch(attributes_float3, tri + 2);
     }
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
     if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif
 
-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
   }
   else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -285,7 +285,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_data_fetch(attributes_float3, offset);
     }
     else {
       return make_float3(0.0f, 0.0f, 0.0f);
@@ -304,36 +304,36 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
     float4 f0, f1, f2;
 
     if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
-      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.z);
+      const uint4 tri_vindex = kernel_data_fetch(tri_vindex, sd->prim);
+      f0 = kernel_data_fetch(attributes_float4, desc.offset + tri_vindex.x);
+      f1 = kernel_data_fetch(attributes_float4, desc.offset + tri_vindex.y);
+      f2 = kernel_data_fetch(attributes_float4, desc.offset + tri_vindex.z);
     }
     else {
       const int tri = desc.offset + sd->prim * 3;
       if (desc.element == ATTR_ELEMENT_CORNER) {
-        f0 = kernel_tex_fetch(__attributes_float4, tri + 0);
-        f1 = kernel_tex_fetch(__attributes_float4, tri + 1);
-        f2 = kernel_tex_fetch(__attributes_float4, tri + 2);
+        f0 = kernel_data_fetch(attributes_float4, tri + 0);
+        f1 = kernel_data_fetch(attributes_float4, tri + 1);
+        f2 = kernel_data_fetch(attributes_float4, tri + 2);
       }
       else {
         f0 = color_srgb_to_linear_v4(
-            color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 0)));
+            color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, tri + 0)));
         f1 = color_srgb_to_linear_v4(
-            color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 1)));
+            color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, tri + 1)));
         f2 = color_srgb_to_linear_v4(
-            color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, tri + 2)));
+            color_uchar4_to_float4(kernel_data_fetch(attributes_uchar4, tri + 2)));
       }
     }
 
 #ifdef __RAY_DIFFERENTIALS__
     if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
     if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif
 
-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
   }
   else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -346,7 +346,7 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
     if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
       const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float4, offset);
+      return kernel_data_fetch(attributes_float4, offset);
     }
     else {
       return zero_float4();
diff --git a/intern/cycles/kernel/geom/triangle_intersect.h b/intern/cycles/kernel/geom/triangle_intersect.h
index fe531e6868a..847ed22fddd 100644
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -17,23 +17,24 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
                                           ccl_private Intersection *isect,
                                           float3 P,
                                           float3 dir,
+                                          float tmin,
                                           float tmax,
                                           uint visibility,
                                           int object,
                                           int prim,
                                           int prim_addr)
 {
-  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
+               tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
+               tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
   float t, u, v;
-  if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
+  if (ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
     /* Visibility flag test. we do it here under the assumption
      * that most triangles are culled by node flags.
      */
-    if (kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
+    if (kernel_data_fetch(prim_visibility, prim_addr) & visibility)
 #endif
     {
       isect->object = object;
@@ -62,16 +63,17 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
                                                 int object,
                                                 int prim,
                                                 int prim_addr,
+                                                float tmin,
                                                 float tmax,
                                                 ccl_private uint *lcg_state,
                                                 int max_hits)
 {
-  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
+               tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
+               tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
   float t, u, v;
-  if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
+  if (!ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
     return false;
   }
 
@@ -139,13 +141,13 @@ ccl_device_inline float3 triangle_point_from_uv(KernelGlobals kg,
                                                 const float u,
                                                 const float v)
 {
-  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-  float w = 1.0f - u - v;
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, isect_prim).w;
+  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
+                      tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
+                      tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
 
-  float3 P = u * tri_a + v * tri_b + w * tri_c;
+  /* This appears to give slightly better precision than interpolating with w = (1 - u - v). */
+  float3 P = tri_a + u * (tri_b - tri_a) + v * (tri_c - tri_a);
 
   if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
     const Transform tfm = object_get_transform(kg, sd);
diff --git a/intern/cycles/kernel/geom/volume.h b/intern/cycles/kernel/geom/volume.h
index 22715dee5bf..885a420c97f 100644
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@@ -29,7 +29,7 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals kg,
   object_inverse_position_transform(kg, sd, &P);
 
   if (desc.offset != ATTR_STD_NOT_FOUND) {
-    Transform tfm = primitive_attribute_matrix(kg, sd, desc);
+    Transform tfm = primitive_attribute_matrix(kg, desc);
     P = transform_point(&tfm, P);
   }
 
@@ -62,7 +62,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
                                           const AttributeDescriptor desc)
 {
   if (desc.element & (ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
-    return kernel_tex_fetch(__attributes_float4, desc.offset);
+    return kernel_data_fetch(attributes_float4, desc.offset);
   }
   else if (desc.element == ATTR_ELEMENT_VOXEL) {
     /* todo: optimize this so we don't have to transform both here and in
diff --git a/intern/cycles/kernel/integrator/displacement_shader.h b/intern/cycles/kernel/integrator/displacement_shader.h
new file mode 100644
index 00000000000..839dfe244ac
--- /dev/null
+++ b/intern/cycles/kernel/integrator/displacement_shader.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+/* Functions to evaluate the displacement shader. */
+
+#pragma once
+
+#ifdef __SVM__
+#  include "kernel/svm/svm.h"
+#endif
+#ifdef __OSL__
+#  include "kernel/osl/osl.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+template<typename ConstIntegratorGenericState>
+ccl_device void displacement_shader_eval(KernelGlobals kg,
+                                         ConstIntegratorGenericState state,
+                                         ccl_private ShaderData *sd)
+{
+  sd->num_closure = 0;
+  sd->num_closure_left = 0;
+
+  /* Evaluating the displacement shader will modify sd->P. */
+#ifdef __OSL__
+  if (kg->osl) {
+    OSLShader::eval_displacement(kg, state, sd);
+  }
+  else
+#endif
+  {
+#ifdef __SVM__
+    svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
+        kg, state, sd, NULL, 0);
+#endif
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/init_from_bake.h b/intern/cycles/kernel/integrator/init_from_bake.h
index 0db4241b6e3..eca2c0b9ffb 100644
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -5,8 +5,8 @@
 
 #include "kernel/camera/camera.h"
 
-#include "kernel/film/accumulate.h"
 #include "kernel/film/adaptive_sampling.h"
+#include "kernel/film/light_passes.h"
 
 #include "kernel/integrator/path_state.h"
 
@@ -49,7 +49,8 @@ ccl_device const float2 bake_offset_towards_center(KernelGlobals kg,
   const float3 to_center = center - P;
 
   const float3 offset_P = P + normalize(to_center) *
-                                  min(len(to_center), max(max3(fabs(P)), 1.0f) * position_offset);
+                                  min(len(to_center),
+                                      max(reduce_max(fabs(P)), 1.0f) * position_offset);
 
   /* Compute barycentric coordinates at new position. */
   const float3 v1 = tri_verts[1] - tri_verts[0];
@@ -91,12 +92,12 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   path_state_init(state, tile, x, y);
 
   /* Check whether the pixel has converged and should not be sampled anymore. */
-  if (!kernel_need_sample_pixel(kg, state, render_buffer)) {
+  if (!film_need_sample_pixel(kg, state, render_buffer)) {
     return false;
   }
 
   /* Always count the sample, even if the camera sample will reject the ray. */
-  const int sample = kernel_accum_sample(
+  const int sample = film_write_sample(
       kg, state, render_buffer, scheduled_sample, tile->sample_offset);
 
   /* Setup render buffers. */
@@ -111,8 +112,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   int prim = __float_as_uint(primitive[1]);
   if (prim == -1) {
     /* Accumulate transparency for empty pixels. */
-    kernel_accum_transparent(kg, state, 0, 1.0f, buffer);
-    return false;
+    film_write_transparent(kg, state, 0, 1.0f, buffer);
+    return true;
   }
 
   prim += kernel_data.bake.tri_offset;
@@ -120,13 +121,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
   /* Random number generator. */
   const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
 
-  float filter_x, filter_y;
-  if (sample == 0) {
-    filter_x = filter_y = 0.5f;
-  }
-  else {
-    path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y);
-  }
+  const float2 rand_filter = (sample == 0) ? make_float2(0.5f, 0.5f) :
+                                             path_rng_2D(kg, rng_hash, sample, PRNG_FILTER);
 
   /* Initialize path state for path integration. */
   path_state_init_integrator(kg, state, sample, rng_hash);
@@ -149,18 +145,24 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
 
   /* Sub-pixel offset. */
   if (sample > 0) {
-    u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
-    v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
+    u = bake_clamp_mirror_repeat(u + dudx * (rand_filter.x - 0.5f) + dudy * (rand_filter.y - 0.5f),
+                                 1.0f);
+    v = bake_clamp_mirror_repeat(v + dvdx * (rand_filter.x - 0.5f) + dvdy * (rand_filter.y - 0.5f),
                                  1.0f - u);
   }
 
+  /* Convert from Blender to Cycles/Embree/OptiX barycentric convention. */
+  const float tmp = u;
+  u = v;
+  v = 1.0f - tmp - v;
+
   /* Position and normal on triangle. */
   const int object = kernel_data.bake.object_index;
   float3 P, Ng;
   int shader;
   triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
 
-  const int object_flag = kernel_tex_fetch(__object_flag, object);
+  const int object_flag = kernel_data_fetch(object_flag, object);
   if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
     Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
     P = transform_point_auto(&tfm, P);
@@ -173,14 +175,15 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
     Ray ray ccl_optional_struct_init;
     ray.P = zero_float3();
     ray.D = normalize(P);
-    ray.t = FLT_MAX;
+    ray.tmin = 0.0f;
+    ray.tmax = FLT_MAX;
     ray.time = 0.5f;
     ray.dP = differential_zero_compact();
     ray.dD = differential_zero_compact();
     integrator_state_write_ray(kg, state, &ray);
 
     /* Setup next kernel to execute. */
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
   }
   else {
     /* Surface baking. */
@@ -193,15 +196,15 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
     }
 
     const int shader_index = shader & SHADER_MASK;
-    const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
+    const int shader_flags = kernel_data_fetch(shaders, shader_index).flags;
 
     /* Fast path for position and normal passes not affected by shaders. */
     if (kernel_data.film.pass_position != PASS_UNUSED) {
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_position, P);
+      film_write_pass_float3(buffer + kernel_data.film.pass_position, P);
       return true;
     }
     else if (kernel_data.film.pass_normal != PASS_UNUSED && !(shader_flags & SD_HAS_BUMP)) {
-      kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, N);
+      film_write_pass_float3(buffer + kernel_data.film.pass_normal, N);
       return true;
     }
 
@@ -209,7 +212,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
     Ray ray ccl_optional_struct_init;
     ray.P = P + N;
     ray.D = -N;
-    ray.t = FLT_MAX;
+    ray.tmin = 0.0f;
+    ray.tmax = FLT_MAX;
     ray.time = 0.5f;
 
     /* Setup differentials. */
@@ -246,13 +250,15 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
     const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
 
     if (use_caustics) {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index);
+      integrator_path_init_sorted(
+          kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index);
     }
     else if (use_raytrace_kernel) {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
+      integrator_path_init_sorted(
+          kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
     }
     else {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index);
+      integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index);
     }
   }
 
diff --git a/intern/cycles/kernel/integrator/init_from_camera.h b/intern/cycles/kernel/integrator/init_from_camera.h
index 9fe27cdda9a..8df3e1b9fb3 100644
--- a/intern/cycles/kernel/integrator/init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -5,8 +5,8 @@
 
 #include "kernel/camera/camera.h"
 
-#include "kernel/film/accumulate.h"
 #include "kernel/film/adaptive_sampling.h"
+#include "kernel/film/light_passes.h"
 
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/shadow_catcher.h"
@@ -23,31 +23,21 @@ ccl_device_inline void integrate_camera_sample(KernelGlobals kg,
                                                ccl_private Ray *ray)
 {
   /* Filter sampling. */
-  float filter_u, filter_v;
-
-  if (sample == 0) {
-    filter_u = 0.5f;
-    filter_v = 0.5f;
-  }
-  else {
-    path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v);
-  }
+  const float2 rand_filter = (sample == 0) ? make_float2(0.5f, 0.5f) :
+                                             path_rng_2D(kg, rng_hash, sample, PRNG_FILTER);
 
   /* Depth of field sampling. */
-  float lens_u = 0.0f, lens_v = 0.0f;
-  if (kernel_data.cam.aperturesize > 0.0f) {
-    path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v);
-  }
+  const float2 rand_lens = (kernel_data.cam.aperturesize > 0.0f) ?
+                               path_rng_2D(kg, rng_hash, sample, PRNG_LENS) :
+                               zero_float2();
 
   /* Motion blur time sampling. */
-  float time = 0.0f;
-#ifdef __CAMERA_MOTION__
-  if (kernel_data.cam.shuttertime != -1.0f)
-    time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME);
-#endif
+  const float rand_time = (kernel_data.cam.shuttertime != -1.0f) ?
+                              path_rng_1D(kg, rng_hash, sample, PRNG_TIME) :
+                              0.0f;
 
   /* Generate camera ray. */
-  camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+  camera_sample(kg, x, y, rand_filter.x, rand_filter.y, rand_lens.x, rand_lens.y, rand_time, ray);
 }
 
 /* Return false to indicate that this pixel is finished.
@@ -67,7 +57,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
   path_state_init(state, tile, x, y);
 
   /* Check whether the pixel has converged and should not be sampled anymore. */
-  if (!kernel_need_sample_pixel(kg, state, render_buffer)) {
+  if (!film_need_sample_pixel(kg, state, render_buffer)) {
     return false;
   }
 
@@ -76,7 +66,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
    * This logic allows to both count actual number of samples per pixel, and to add samples to this
    * pixel after it was converged and samples were added somewhere else (in which case the
    * `scheduled_sample` will be different from actual number of samples in this pixel). */
-  const int sample = kernel_accum_sample(
+  const int sample = film_write_sample(
       kg, state, render_buffer, scheduled_sample, tile->sample_offset);
 
   /* Initialize random number seed for path. */
@@ -86,7 +76,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
     /* Generate camera ray. */
     Ray ray;
     integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
-    if (ray.t == 0.0f) {
+    if (ray.tmax == 0.0f) {
       return true;
     }
 
@@ -100,10 +90,10 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
   /* Continue with intersect_closest kernel, optionally initializing volume
    * stack before that if the camera may be inside a volume. */
   if (kernel_data.cam.is_inside_volume) {
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
   }
   else {
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
   }
 
   return true;
diff --git a/intern/cycles/kernel/integrator/intersect_closest.h b/intern/cycles/kernel/integrator/intersect_closest.h
index 2dfac44b414..c7c3d74fa21 100644
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -5,13 +5,13 @@
 
 #include "kernel/camera/projection.h"
 
+#include "kernel/film/light_passes.h"
+
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/shadow_catcher.h"
 
 #include "kernel/light/light.h"
 
-#include "kernel/util/differential.h"
-
 #include "kernel/geom/geom.h"
 
 #include "kernel/bvh/bvh.h"
@@ -87,7 +87,7 @@ ccl_device_forceinline void integrator_split_shadow_catcher(
     return;
   }
 
-  kernel_write_shadow_catcher_bounce_data(kg, state, render_buffer);
+  film_write_shadow_catcher_bounce_data(kg, state, render_buffer);
 
   /* Mark state as having done a shadow catcher split so that it stops contributing to
    * the shadow catcher matte pass, but keeps contributing to the combined pass. */
@@ -109,37 +109,38 @@ ccl_device_forceinline void integrator_split_shadow_catcher(
     /* If using background pass, schedule background shading kernel so that we have a background
      * to alpha-over on. The background kernel will then continue the path afterwards. */
     INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
     return;
   }
 
   if (!integrator_state_volume_stack_is_empty(kg, state)) {
     /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
      * objects from it, and then continue shading volume and shadow catcher surface after. */
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
     return;
   }
 
   /* Continue with shading shadow catcher surface. */
   const int shader = intersection_get_shader(kg, isect);
-  const int flags = kernel_tex_fetch(__shaders, shader).flags;
+  const int flags = kernel_data_fetch(shaders, shader).flags;
   const bool use_caustics = kernel_data.integrator.use_caustics &&
                             (object_flags & SD_OBJECT_CAUSTICS);
   const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
 
   if (use_caustics) {
-    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+    integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
   }
   else if (use_raytrace_kernel) {
-    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+    integrator_path_init_sorted(
+        kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
   }
   else {
-    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+    integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
   }
 }
 
 /* Schedule next kernel to be executed after updating volume stack for shadow catcher. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume(
     KernelGlobals kg, IntegratorState state)
 {
@@ -149,27 +150,28 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
   integrator_state_read_isect(kg, state, &isect);
 
   const int shader = intersection_get_shader(kg, &isect);
-  const int flags = kernel_tex_fetch(__shaders, shader).flags;
+  const int flags = kernel_data_fetch(shaders, shader).flags;
   const int object_flags = intersection_get_object_flags(kg, &isect);
   const bool use_caustics = kernel_data.integrator.use_caustics &&
                             (object_flags & SD_OBJECT_CAUSTICS);
   const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
 
   if (use_caustics) {
-    INTEGRATOR_PATH_NEXT_SORTED(
-        current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+    integrator_path_next_sorted(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
   }
   else if (use_raytrace_kernel) {
-    INTEGRATOR_PATH_NEXT_SORTED(
-        current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+    integrator_path_next_sorted(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
   }
   else {
-    INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+    integrator_path_next_sorted(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
   }
 }
 
 /* Schedule next kernel to be executed after executing background shader for shadow catcher. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background(
     KernelGlobals kg, IntegratorState state)
 {
@@ -177,7 +179,8 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
   if (!integrator_state_volume_stack_is_empty(kg, state)) {
     /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
      * objects from it, and then continue shading volume and shadow catcher surface after. */
-    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    integrator_path_next(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
     return;
   }
 
@@ -190,7 +193,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
  *
  * Note that current_kernel is a template value since making this a variable
  * leads to poor performance with CUDA atomics. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel(
     KernelGlobals kg,
     IntegratorState state,
@@ -203,13 +206,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
   if (!integrator_state_volume_stack_is_empty(kg, state)) {
     const bool hit_surface = hit && !(isect->type & PRIMITIVE_LAMP);
     const int shader = (hit_surface) ? intersection_get_shader(kg, isect) : SHADER_NONE;
-    const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
+    const int flags = (hit_surface) ? kernel_data_fetch(shaders, shader).flags : 0;
 
     if (!integrator_intersect_terminate(kg, state, flags)) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
     }
     else {
-      INTEGRATOR_PATH_TERMINATE(current_kernel);
+      integrator_path_terminate(kg, state, current_kernel);
     }
     return;
   }
@@ -218,12 +221,12 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
   if (hit) {
     /* Hit a surface, continue with light or surface kernel. */
     if (isect->type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
     }
     else {
       /* Hit a surface, continue with surface kernel unless terminated. */
       const int shader = intersection_get_shader(kg, isect);
-      const int flags = kernel_tex_fetch(__shaders, shader).flags;
+      const int flags = kernel_data_fetch(shaders, shader).flags;
 
       if (!integrator_intersect_terminate(kg, state, flags)) {
         const int object_flags = intersection_get_object_flags(kg, isect);
@@ -231,16 +234,16 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
                                   (object_flags & SD_OBJECT_CAUSTICS);
         const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
         if (use_caustics) {
-          INTEGRATOR_PATH_NEXT_SORTED(
-              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+          integrator_path_next_sorted(
+              kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
         }
         else if (use_raytrace_kernel) {
-          INTEGRATOR_PATH_NEXT_SORTED(
-              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+          integrator_path_next_sorted(
+              kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
         }
         else {
-          INTEGRATOR_PATH_NEXT_SORTED(
-              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+          integrator_path_next_sorted(
+              kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
         }
 
 #ifdef __SHADOW_CATCHER__
@@ -249,13 +252,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
 #endif
       }
       else {
-        INTEGRATOR_PATH_TERMINATE(current_kernel);
+        integrator_path_terminate(kg, state, current_kernel);
       }
     }
   }
   else {
     /* Nothing hit, continue with background kernel. */
-    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
   }
 }
 
@@ -263,7 +266,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
  *
  * The logic here matches integrator_intersect_next_kernel, except that
  * volume shading and termination testing have already been done. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
     KernelGlobals kg,
     IntegratorState state,
@@ -273,29 +276,29 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
   if (isect->prim != PRIM_NONE) {
     /* Hit a surface, continue with light or surface kernel. */
     if (isect->type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
       return;
     }
     else {
       /* Hit a surface, continue with surface kernel unless terminated. */
       const int shader = intersection_get_shader(kg, isect);
-      const int flags = kernel_tex_fetch(__shaders, shader).flags;
+      const int flags = kernel_data_fetch(shaders, shader).flags;
       const int object_flags = intersection_get_object_flags(kg, isect);
       const bool use_caustics = kernel_data.integrator.use_caustics &&
                                 (object_flags & SD_OBJECT_CAUSTICS);
       const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
 
       if (use_caustics) {
-        INTEGRATOR_PATH_NEXT_SORTED(
-            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+        integrator_path_next_sorted(
+            kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
       }
       else if (use_raytrace_kernel) {
-        INTEGRATOR_PATH_NEXT_SORTED(
-            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+        integrator_path_next_sorted(
+            kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
       }
       else {
-        INTEGRATOR_PATH_NEXT_SORTED(
-            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+        integrator_path_next_sorted(
+            kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
       }
 
 #ifdef __SHADOW_CATCHER__
@@ -307,7 +310,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
   }
   else {
     /* Nothing hit, continue with background kernel. */
-    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
     return;
   }
 }
@@ -321,7 +324,7 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
   /* Read ray from integrator state into local memory. */
   Ray ray ccl_optional_struct_init;
   integrator_state_read_ray(kg, state, &ray);
-  kernel_assert(ray.t != 0.0f);
+  kernel_assert(ray.tmax != 0.0f);
 
   const uint visibility = path_state_ray_visibility(state);
   const int last_isect_prim = INTEGRATOR_STATE(state, isect, prim);
@@ -329,12 +332,12 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
 
   /* Trick to use short AO rays to approximate indirect light at the end of the path. */
   if (path_state_ao_bounce(kg, state)) {
-    ray.t = kernel_data.integrator.ao_bounces_distance;
+    ray.tmax = kernel_data.integrator.ao_bounces_distance;
 
     if (last_isect_object != OBJECT_NONE) {
-      const float object_ao_distance = kernel_tex_fetch(__objects, last_isect_object).ao_distance;
+      const float object_ao_distance = kernel_data_fetch(objects, last_isect_object).ao_distance;
       if (object_ao_distance != 0.0f) {
-        ray.t = object_ao_distance;
+        ray.tmax = object_ao_distance;
       }
     }
   }
@@ -366,7 +369,7 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
     bool from_caustic_caster = false;
     bool from_caustic_receiver = false;
     if (!(path_flag & PATH_RAY_CAMERA) && last_isect_object != OBJECT_NONE) {
-      const int object_flags = kernel_tex_fetch(__object_flag, last_isect_object);
+      const int object_flags = kernel_data_fetch(object_flag, last_isect_object);
       from_caustic_receiver = (object_flags & SD_OBJECT_CAUSTICS_RECEIVER);
       from_caustic_caster = (object_flags & SD_OBJECT_CAUSTICS_CASTER);
     }
diff --git a/intern/cycles/kernel/integrator/intersect_shadow.h b/intern/cycles/kernel/integrator/intersect_shadow.h
index 3e746998225..25ff3d5b23f 100644
--- a/intern/cycles/kernel/integrator/intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/intersect_shadow.h
@@ -51,7 +51,7 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k
 }
 
 #ifdef __TRANSPARENT_SHADOWS__
-#  if defined(__KERNEL_CPU__)
+#  ifndef __KERNEL_GPU__
 ccl_device int shadow_intersections_compare(const void *a, const void *b)
 {
   const Intersection *isect_a = (const Intersection *)a;
@@ -162,7 +162,7 @@ ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowSt
 
   if (opaque_hit) {
     /* Hit an opaque surface, shadow path ends here. */
-    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+    integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
     return;
   }
   else {
@@ -171,7 +171,9 @@ ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowSt
      *
      * TODO: could also write to render buffer directly if no transparent shadows?
      * Could save a kernel execution for the common case. */
-    INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+    integrator_shadow_path_next(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
                                 DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
     return;
   }
diff --git a/intern/cycles/kernel/integrator/intersect_subsurface.h b/intern/cycles/kernel/integrator/intersect_subsurface.h
index 0a2c4ad680d..f439d6905a0 100644
--- a/intern/cycles/kernel/integrator/intersect_subsurface.h
+++ b/intern/cycles/kernel/integrator/intersect_subsurface.h
@@ -17,7 +17,7 @@ ccl_device void integrator_intersect_subsurface(KernelGlobals kg, IntegratorStat
   }
 #endif
 
-  INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+  integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/intersect_volume_stack.h b/intern/cycles/kernel/integrator/intersect_volume_stack.h
index 49ef01dc870..c2490581e4d 100644
--- a/intern/cycles/kernel/integrator/intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -5,7 +5,6 @@
 
 #include "kernel/bvh/bvh.h"
 #include "kernel/geom/geom.h"
-#include "kernel/integrator/shader_eval.h"
 #include "kernel/integrator/volume_stack.h"
 
 CCL_NAMESPACE_BEGIN
@@ -24,7 +23,8 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
 
   Ray volume_ray ccl_optional_struct_init;
   volume_ray.P = from_P;
-  volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
+  volume_ray.D = normalize_len(to_P - from_P, &volume_ray.tmax);
+  volume_ray.tmin = 0.0f;
   volume_ray.self.object = INTEGRATOR_STATE(state, isect, object);
   volume_ray.self.prim = INTEGRATOR_STATE(state, isect, prim);
   volume_ray.self.light_object = OBJECT_NONE;
@@ -37,8 +37,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
 
 #ifdef __VOLUME_RECORD_ALL__
   Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
-  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
+  uint num_hits = scene_intersect_volume(kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
   if (num_hits > 0) {
     Intersection *isect = hits;
 
@@ -58,12 +57,9 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
     volume_stack_enter_exit(kg, state, stack_sd);
 
     /* Move ray forward. */
-    volume_ray.P = stack_sd->P;
+    volume_ray.tmin = intersection_t_offset(isect.t);
     volume_ray.self.object = isect.object;
     volume_ray.self.prim = isect.prim;
-    if (volume_ray.t != FLT_MAX) {
-      volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t);
-    }
     ++step;
   }
 #endif
@@ -82,7 +78,8 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
   /* Trace ray in random direction. Any direction works, Z up is a guess to get the
    * fewest hits. */
   volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
-  volume_ray.t = FLT_MAX;
+  volume_ray.tmin = 0.0f;
+  volume_ray.tmax = FLT_MAX;
   volume_ray.self.object = OBJECT_NONE;
   volume_ray.self.prim = PRIM_NONE;
   volume_ray.self.light_object = OBJECT_NONE;
@@ -109,8 +106,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
 
 #ifdef __VOLUME_RECORD_ALL__
   Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
-  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
+  uint num_hits = scene_intersect_volume(kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
   if (num_hits > 0) {
     int enclosed_volumes[MAX_VOLUME_STACK_SIZE];
     Intersection *isect = hits;
@@ -199,7 +195,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
     }
 
     /* Move ray forward. */
-    volume_ray.P = stack_sd->P;
+    volume_ray.tmin = intersection_t_offset(isect.t);
     volume_ray.self.object = isect.object;
     volume_ray.self.prim = isect.prim;
     ++step;
@@ -222,7 +218,9 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
   }
   else {
     /* Volume stack init for camera rays, continue with intersection of camera ray. */
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+    integrator_path_next(kg,
+                         state,
+                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
                          DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
   }
 }
diff --git a/intern/cycles/kernel/integrator/mnee.h b/intern/cycles/kernel/integrator/mnee.h
index ad83f82d091..a0ad7afe591 100644
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -115,7 +115,7 @@ ccl_device_forceinline void mnee_update_light_sample(KernelGlobals kg,
 {
   /* correct light sample position/direction and pdf
    * NOTE: preserve pdf in area measure */
-  const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, ls->lamp);
+  const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ls->lamp);
 
   if (ls->type == LIGHT_POINT || ls->type == LIGHT_SPOT) {
     ls->D = normalize_len(ls->P - P, &ls->t);
@@ -137,8 +137,14 @@ ccl_device_forceinline void mnee_update_light_sample(KernelGlobals kg,
     }
   }
   else if (ls->type == LIGHT_AREA) {
+    float invarea = fabsf(klight->area.invarea);
     ls->D = normalize_len(ls->P - P, &ls->t);
-    ls->pdf = fabsf(klight->area.invarea);
+    ls->pdf = invarea;
+    if (klight->area.tan_spread > 0.f) {
+      ls->eval_fac = 0.25f * invarea;
+      ls->eval_fac *= light_spread_attenuation(
+          ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread);
+    }
   }
 
   ls->pdf *= kernel_data.integrator.pdf_lights;
@@ -154,12 +160,12 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
                                                        ccl_private const Intersection *isect,
                                                        ccl_private ShaderData *sd_vtx)
 {
-  sd_vtx->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
+  sd_vtx->object = (isect->object == OBJECT_NONE) ? kernel_data_fetch(prim_object, isect->prim) :
                                                     isect->object;
 
   sd_vtx->type = isect->type;
   sd_vtx->flag = 0;
-  sd_vtx->object_flag = kernel_tex_fetch(__object_flag, sd_vtx->object);
+  sd_vtx->object_flag = kernel_data_fetch(object_flag, sd_vtx->object);
 
   /* Matrices and time. */
   shader_setup_object_transforms(kg, sd_vtx, ray->time);
@@ -171,7 +177,7 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
   sd_vtx->u = isect->u;
   sd_vtx->v = isect->v;
 
-  sd_vtx->shader = kernel_tex_fetch(__tri_shader, sd_vtx->prim);
+  sd_vtx->shader = kernel_data_fetch(tri_shader, sd_vtx->prim);
 
   float3 verts[3];
   float3 normals[3];
@@ -180,7 +186,7 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
     triangle_vertices_and_normals(kg, sd_vtx->prim, verts, normals);
 
     /* Compute refined position (same code as in triangle_point_from_uv). */
-    sd_vtx->P = isect->u * verts[0] + isect->v * verts[1] + (1.f - isect->u - isect->v) * verts[2];
+    sd_vtx->P = (1.f - isect->u - isect->v) * verts[0] + isect->u * verts[1] + isect->v * verts[2];
     if (!(sd_vtx->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
       const Transform tfm = object_get_transform(kg, sd_vtx);
       sd_vtx->P = transform_point(&tfm, sd_vtx->P);
@@ -207,8 +213,8 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
   }
 
   /* Tangent space (position derivatives) WRT barycentric (u, v). */
-  float3 dp_du = verts[0] - verts[2];
-  float3 dp_dv = verts[1] - verts[2];
+  float3 dp_du = verts[1] - verts[0];
+  float3 dp_dv = verts[2] - verts[0];
 
   /* Geometric normal. */
   vtx->ng = normalize(cross(dp_du, dp_dv));
@@ -217,16 +223,16 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
 
   /* Shading normals: Interpolate normals between vertices. */
   float n_len;
-  vtx->n = normalize_len(normals[0] * sd_vtx->u + normals[1] * sd_vtx->v +
-                             normals[2] * (1.0f - sd_vtx->u - sd_vtx->v),
+  vtx->n = normalize_len(normals[0] * (1.0f - sd_vtx->u - sd_vtx->v) + normals[1] * sd_vtx->u +
+                             normals[2] * sd_vtx->v,
                          &n_len);
 
   /* Shading normal derivatives WRT barycentric (u, v)
-   * we calculate the derivative of n = |u*n0 + v*n1 + (1-u-v)*n2| using:
+   * we calculate the derivative of n = |(1-u-v)*n0 + u*n1 + v*n2| using:
    * d/du [f(u)/|f(u)|] = [d/du f(u)]/|f(u)| - f(u)/|f(u)|^3 <f(u), d/du f(u)>. */
   const float inv_n_len = 1.f / n_len;
-  float3 dn_du = inv_n_len * (normals[0] - normals[2]);
-  float3 dn_dv = inv_n_len * (normals[1] - normals[2]);
+  float3 dn_du = inv_n_len * (normals[1] - normals[0]);
+  float3 dn_dv = inv_n_len * (normals[2] - normals[0]);
   dn_du -= vtx->n * dot(vtx->n, dn_du);
   dn_dv -= vtx->n * dot(vtx->n, dn_dv);
 
@@ -386,7 +392,7 @@ ccl_device_forceinline bool mnee_compute_constraint_derivatives(
 /* Invert (block) constraint derivative matrix and solve linear system so we can map dh back to dx:
  *  dh / dx = A
  *  dx = inverse(A) x dh
- *  to use for specular specular manifold walk
+ *  to use for specular manifold walk
  * (See for example http://faculty.washington.edu/finlayso/ebook/algebraic/advanced/LUtri.htm
  *  for block tridiagonal matrix based linear system solve) */
 ccl_device_forceinline bool mnee_solve_matrix_h_to_x(int vertex_count,
@@ -436,6 +442,7 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg,
   projection_ray.self.light_prim = PRIM_NONE;
   projection_ray.dP = differential_make_compact(sd->dP);
   projection_ray.dD = differential_zero_compact();
+  projection_ray.tmin = 0.0f;
   projection_ray.time = sd->time;
   Intersection projection_isect;
 
@@ -499,8 +506,8 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg,
         projection_ray.self.prim = pv.prim;
         projection_ray.P = pv.p;
       }
-      projection_ray.D = normalize_len(tentative_p - projection_ray.P, &projection_ray.t);
-      projection_ray.t *= MNEE_PROJECTION_DISTANCE_MULTIPLIER;
+      projection_ray.D = normalize_len(tentative_p - projection_ray.P, &projection_ray.tmax);
+      projection_ray.tmax *= MNEE_PROJECTION_DISTANCE_MULTIPLIER;
 
       bool projection_success = false;
       for (int isect_count = 0; isect_count < MNEE_MAX_INTERSECTION_COUNT; isect_count++) {
@@ -509,7 +516,7 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg,
           break;
 
         int hit_object = (projection_isect.object == OBJECT_NONE) ?
-                             kernel_tex_fetch(__prim_object, projection_isect.prim) :
+                             kernel_data_fetch(prim_object, projection_isect.prim) :
                              projection_isect.object;
 
         if (hit_object == mv.object) {
@@ -519,8 +526,7 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg,
 
         projection_ray.self.object = projection_isect.object;
         projection_ray.self.prim = projection_isect.prim;
-        projection_ray.P += projection_isect.t * projection_ray.D;
-        projection_ray.t -= projection_isect.t;
+        projection_ray.tmin = intersection_t_offset(projection_isect.t);
       }
       if (!projection_success) {
         reduce_stepsize = true;
@@ -628,9 +634,9 @@ mnee_sample_bsdf_dh(ClosureType type, float alpha_x, float alpha_y, float sample
  * We assume here that the pdf (in half-vector measure) is the same as
- * the one calculation when sampling the microfacet normals from the
+ * the one calculated when sampling the microfacet normals from the
  * specular chain above: this allows us to simplify the bsdf weight */
-ccl_device_forceinline float3 mnee_eval_bsdf_contribution(ccl_private ShaderClosure *closure,
-                                                          float3 wi,
-                                                          float3 wo)
+ccl_device_forceinline Spectrum mnee_eval_bsdf_contribution(ccl_private ShaderClosure *closure,
+                                                            float3 wi,
+                                                            float3 wo)
 {
   ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)closure;
 
@@ -801,7 +807,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
   float3 wo = normalize_len(vertices[0].p - sd->P, &wo_len);
 
   /* Initialize throughput and evaluate receiver bsdf * |n.wo|. */
-  shader_bsdf_eval(kg, sd, wo, false, throughput, ls->shader);
+  surface_shader_bsdf_eval(kg, sd, wo, false, throughput, ls->shader);
 
-  /* Update light sample with new position / direct.ion
+  /* Update light sample with new position / direction
    * and keep pdf in vertex area measure */
@@ -829,8 +835,8 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
                                                              1;
   INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + vertex_count;
 
-  float3 light_eval = light_sample_shader_eval(kg, state, sd_mnee, ls, sd->time);
-  bsdf_eval_mul3(throughput, light_eval / ls->pdf);
+  Spectrum light_eval = light_sample_shader_eval(kg, state, sd_mnee, ls, sd->time);
+  bsdf_eval_mul(throughput, light_eval / ls->pdf);
 
   /* Generalized geometry term. */
 
@@ -852,6 +858,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
   Ray probe_ray;
   probe_ray.self.light_object = ls->object;
   probe_ray.self.light_prim = ls->prim;
+  probe_ray.tmin = 0.0f;
   probe_ray.dP = differential_make_compact(sd->dP);
   probe_ray.dD = differential_zero_compact();
   probe_ray.time = sd->time;
@@ -867,13 +874,13 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
     ccl_private const ManifoldVertex &v = vertices[vi];
 
     /* Check visibility. */
-    probe_ray.D = normalize_len(v.p - probe_ray.P, &probe_ray.t);
+    probe_ray.D = normalize_len(v.p - probe_ray.P, &probe_ray.tmax);
     if (scene_intersect(kg, &probe_ray, PATH_RAY_TRANSMIT, &probe_isect)) {
       int hit_object = (probe_isect.object == OBJECT_NONE) ?
-                           kernel_tex_fetch(__prim_object, probe_isect.prim) :
+                           kernel_data_fetch(prim_object, probe_isect.prim) :
                            probe_isect.object;
       /* Test whether the ray hit the appropriate object at its intended location. */
-      if (hit_object != v.object || fabsf(probe_ray.t - probe_isect.t) > MNEE_MIN_DISTANCE)
+      if (hit_object != v.object || fabsf(probe_ray.tmax - probe_isect.t) > MNEE_MIN_DISTANCE)
         return false;
     }
     probe_ray.self.object = v.object;
@@ -906,7 +913,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
     INTEGRATOR_STATE_WRITE(state, path, bounce) = bounce + 1 + vi;
 
     /* Evaluate shader nodes at solution vi. */
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+    surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
         kg, state, sd_mnee, NULL, PATH_RAY_DIFFUSE, true);
 
     /* Set light looking dir. */
@@ -917,8 +924,8 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
     /* Evaluate product term inside eq.6 at solution interface. vi
      * divided by corresponding sampled pdf:
      * fr(vi)_do / pdf_dh(vi) x |do/dh| x |n.wo / n.h| */
-    float3 bsdf_contribution = mnee_eval_bsdf_contribution(v.bsdf, wi, wo);
-    bsdf_eval_mul3(throughput, bsdf_contribution);
+    Spectrum bsdf_contribution = mnee_eval_bsdf_contribution(v.bsdf, wi, wo);
+    bsdf_eval_mul(throughput, bsdf_contribution);
   }
 
   /* Restore original state path bounce info. */
@@ -952,15 +959,16 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
   probe_ray.self.light_object = ls->object;
   probe_ray.self.light_prim = ls->prim;
   probe_ray.P = sd->P;
+  probe_ray.tmin = 0.0f;
   if (ls->t == FLT_MAX) {
     /* Distant / env light. */
     probe_ray.D = ls->D;
-    probe_ray.t = ls->t;
+    probe_ray.tmax = ls->t;
   }
   else {
     /* Other lights, avoid self-intersection. */
     probe_ray.D = ls->P - probe_ray.P;
-    probe_ray.D = normalize_len(probe_ray.D, &probe_ray.t);
+    probe_ray.D = normalize_len(probe_ray.D, &probe_ray.tmax);
   }
   probe_ray.dP = differential_make_compact(sd->dP);
   probe_ray.dD = differential_zero_compact();
@@ -998,7 +1006,7 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
         return 0;
 
       /* Last bool argument is the MNEE flag (for TINY_MAX_CLOSURE cap in kernel_shader.h). */
-      shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+      surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
           kg, state, sd_mnee, NULL, PATH_RAY_DIFFUSE, true);
 
       /* Get and sample refraction bsdf */
@@ -1025,10 +1033,12 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
           float2 h = zero_float2();
           if (microfacet_bsdf->alpha_x > 0.f && microfacet_bsdf->alpha_y > 0.f) {
             /* Sample transmissive microfacet bsdf. */
-            float bsdf_u, bsdf_v;
-            path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-            h = mnee_sample_bsdf_dh(
-                bsdf->type, microfacet_bsdf->alpha_x, microfacet_bsdf->alpha_y, bsdf_u, bsdf_v);
+            const float2 bsdf_uv = path_state_rng_2D(kg, rng_state, PRNG_SURFACE_BSDF);
+            h = mnee_sample_bsdf_dh(bsdf->type,
+                                    microfacet_bsdf->alpha_x,
+                                    microfacet_bsdf->alpha_y,
+                                    bsdf_uv.x,
+                                    bsdf_uv.y);
           }
 
           /* Setup differential geometry on vertex. */
@@ -1042,9 +1052,7 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
 
     probe_ray.self.object = probe_isect.object;
     probe_ray.self.prim = probe_isect.prim;
-    probe_ray.P += probe_isect.t * probe_ray.D;
-    if (ls->t != FLT_MAX)
-      probe_ray.t -= probe_isect.t;
+    probe_ray.tmin = intersection_t_offset(probe_isect.t);
   };
 
   /* Mark the manifold walk invalid to keep mollification on by default. */
diff --git a/intern/cycles/kernel/integrator/path_state.h b/intern/cycles/kernel/integrator/path_state.h
index ec93ac6d46f..54560905397 100644
--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -13,7 +13,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init_queues(IntegratorState state)
 {
   INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
   INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
   INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
 #endif
@@ -48,14 +48,13 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, path, volume_bounce) = 0;
   INTEGRATOR_STATE_WRITE(state, path, volume_bounds_bounce) = 0;
   INTEGRATOR_STATE_WRITE(state, path, rng_hash) = rng_hash;
-  INTEGRATOR_STATE_WRITE(state, path, rng_offset) = PRNG_BASE_NUM;
+  INTEGRATOR_STATE_WRITE(state, path, rng_offset) = PRNG_BOUNCE_NUM;
   INTEGRATOR_STATE_WRITE(state, path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
                                               PATH_RAY_TRANSPARENT_BACKGROUND;
   INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = 0.0f;
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
   INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = FLT_MAX;
   INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f;
-  INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
+  INTEGRATOR_STATE_WRITE(state, path, throughput) = one_spectrum();
 
 #ifdef __MNEE__
   INTEGRATOR_STATE_WRITE(state, path, mnee) = 0;
@@ -75,7 +74,7 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
 #ifdef __DENOISING_FEATURES__
   if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) {
     INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_DENOISING_FEATURES;
-    INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) = one_float3();
+    INTEGRATOR_STATE_WRITE(state, path, denoising_feature_throughput) = one_spectrum();
   }
 #endif
 }
@@ -250,7 +249,7 @@ ccl_device_inline float path_state_continuation_probability(KernelGlobals kg,
 
   /* Probabilistic termination: use sqrt() to roughly match typical view
    * transform and do path termination a bit later on average. */
-  return min(sqrtf(max3(fabs(INTEGRATOR_STATE(state, path, throughput)))), 1.0f);
+  return min(sqrtf(reduce_max(fabs(INTEGRATOR_STATE(state, path, throughput)))), 1.0f);
 }
 
 ccl_device_inline bool path_state_ao_bounce(KernelGlobals kg, ConstIntegratorState state)
@@ -299,38 +298,25 @@ ccl_device_inline void shadow_path_state_rng_load(ConstIntegratorShadowState sta
 
 ccl_device_inline float path_state_rng_1D(KernelGlobals kg,
                                           ccl_private const RNGState *rng_state,
-                                          int dimension)
+                                          const int dimension)
 {
   return path_rng_1D(
       kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals kg,
-                                         ccl_private const RNGState *rng_state,
-                                         int dimension,
-                                         ccl_private float *fx,
-                                         ccl_private float *fy)
+ccl_device_inline float2 path_state_rng_2D(KernelGlobals kg,
+                                           ccl_private const RNGState *rng_state,
+                                           const int dimension)
 {
-  path_rng_2D(
-      kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy);
-}
-
-ccl_device_inline float path_state_rng_1D_hash(KernelGlobals kg,
-                                               ccl_private const RNGState *rng_state,
-                                               uint hash)
-{
-  /* Use a hash instead of dimension, this is not great but avoids adding
-   * more dimensions to each bounce which reduces quality of dimensions we
-   * are already using. */
-  return path_rng_1D(
-      kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset);
+  return path_rng_2D(
+      kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
 }
 
 ccl_device_inline float path_branched_rng_1D(KernelGlobals kg,
                                              ccl_private const RNGState *rng_state,
-                                             int branch,
-                                             int num_branches,
-                                             int dimension)
+                                             const int branch,
+                                             const int num_branches,
+                                             const int dimension)
 {
   return path_rng_1D(kg,
                      rng_state->rng_hash,
@@ -338,20 +324,16 @@ ccl_device_inline float path_branched_rng_1D(KernelGlobals kg,
                      rng_state->rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals kg,
-                                            ccl_private const RNGState *rng_state,
-                                            int branch,
-                                            int num_branches,
-                                            int dimension,
-                                            ccl_private float *fx,
-                                            ccl_private float *fy)
+ccl_device_inline float2 path_branched_rng_2D(KernelGlobals kg,
+                                              ccl_private const RNGState *rng_state,
+                                              const int branch,
+                                              const int num_branches,
+                                              const int dimension)
 {
-  path_rng_2D(kg,
-              rng_state->rng_hash,
-              rng_state->sample * num_branches + branch,
-              rng_state->rng_offset + dimension,
-              fx,
-              fy);
+  return path_rng_2D(kg,
+                     rng_state->rng_hash,
+                     rng_state->sample * num_branches + branch,
+                     rng_state->rng_offset + dimension);
 }
 
 /* Utility functions to get light termination value,
diff --git a/intern/cycles/kernel/integrator/shade_background.h b/intern/cycles/kernel/integrator/shade_background.h
index 72ecf67e8a0..30ce0999258 100644
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -3,18 +3,19 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/film/light_passes.h"
+
+#include "kernel/integrator/surface_shader.h"
+
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
-                                                    IntegratorState state,
-                                                    ccl_global float *ccl_restrict render_buffer)
+ccl_device Spectrum integrator_eval_background_shader(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      ccl_global float *ccl_restrict render_buffer)
 {
-#ifdef __BACKGROUND__
   const int shader = kernel_data.background.surface_shader;
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
 
@@ -26,56 +27,35 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
         ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
         ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
         ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
-      return zero_float3();
+      return zero_spectrum();
   }
 
   /* Use fast constant background color if available. */
-  float3 L = zero_float3();
-  if (!shader_constant_emission_eval(kg, shader, &L)) {
-    /* Evaluate background shader. */
-
-    /* TODO: does aliasing like this break automatic SoA in CUDA?
-     * Should we instead store closures separate from ShaderData? */
-    ShaderDataTinyStorage emission_sd_storage;
-    ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-
-    PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
-    shader_setup_from_background(kg,
-                                 emission_sd,
-                                 INTEGRATOR_STATE(state, ray, P),
-                                 INTEGRATOR_STATE(state, ray, D),
-                                 INTEGRATOR_STATE(state, ray, time));
-
-    PROFILING_SHADER(emission_sd->object, emission_sd->shader);
-    PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND>(
-        kg, state, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
-
-    L = shader_background_eval(emission_sd);
+  Spectrum L = zero_spectrum();
+  if (surface_shader_constant_emission(kg, shader, &L)) {
+    return L;
   }
 
-  /* Background MIS weights. */
-#  ifdef __BACKGROUND_MIS__
-  /* Check if background light exists or if we should skip pdf. */
-  if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_MIS_SKIP) &&
-      kernel_data.background.use_mis) {
-    const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
-    const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
-    const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
-
-    /* multiple importance sampling, get background light pdf for ray
-     * direction, and compute weight with respect to BSDF pdf */
-    const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
-    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
-    L *= mis_weight;
-  }
-#  endif
+  /* Evaluate background shader. */
 
-  return L;
-#else
-  return make_float3(0.8f, 0.8f, 0.8f);
-#endif
+  /* TODO: does aliasing like this break automatic SoA in CUDA?
+   * Should we instead store closures separate from ShaderData? */
+  ShaderDataTinyStorage emission_sd_storage;
+  ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+
+  PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
+  shader_setup_from_background(kg,
+                               emission_sd,
+                               INTEGRATOR_STATE(state, ray, P),
+                               INTEGRATOR_STATE(state, ray, D),
+                               INTEGRATOR_STATE(state, ray, time));
+
+  PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+  PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+  surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_BACKGROUND>(
+      kg, state, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
+
+  return surface_shader_background(emission_sd);
 }
 
 ccl_device_inline void integrate_background(KernelGlobals kg,
@@ -107,7 +87,7 @@ ccl_device_inline void integrate_background(KernelGlobals kg,
       for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
         /* This path should have been resolved with mnee, it will
          * generate a firefly for small lights since it is improbable. */
-        const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+        const ccl_global KernelLight *klight = &kernel_data_fetch(lights, lamp);
         if (klight->type == LIGHT_BACKGROUND && klight->use_caustics) {
           eval_background = false;
           break;
@@ -118,17 +98,37 @@ ccl_device_inline void integrate_background(KernelGlobals kg,
 #endif /* __MNEE__ */
 
   /* Evaluate background shader. */
-  float3 L = (eval_background) ? integrator_eval_background_shader(kg, state, render_buffer) :
-                                 zero_float3();
+  Spectrum L = zero_spectrum();
+
+  if (eval_background) {
+    L = integrator_eval_background_shader(kg, state, render_buffer);
+
+    /* When using the ao bounces approximation, adjust background
+     * shader intensity with ao factor. */
+    if (path_state_ao_bounce(kg, state)) {
+      L *= kernel_data.integrator.ao_bounces_factor;
+    }
+
+    /* Background MIS weights. */
+    float mis_weight = 1.0f;
+    /* Check if background light exists or if we should skip pdf. */
+    if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_MIS_SKIP) &&
+        kernel_data.background.use_mis) {
+      const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
+      const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
+      const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
+
+      /* Multiple importance sampling: get background light pdf for ray
+       * direction, and compute weight with respect to BSDF pdf. */
+      const float pdf = background_light_pdf(kg, ray_P, ray_D);
+      mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
+    }
 
-  /* When using the ao bounces approximation, adjust background
-   * shader intensity with ao factor. */
-  if (path_state_ao_bounce(kg, state)) {
-    L *= kernel_data.integrator.ao_bounces_factor;
+    L *= mis_weight;
   }
 
   /* Write to render buffer. */
-  kernel_accum_background(kg, state, L, transparent, is_transparent_background_ray, render_buffer);
+  film_write_background(kg, state, L, transparent, is_transparent_background_ray, render_buffer);
 }
 
 ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
@@ -160,7 +160,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
       if (INTEGRATOR_STATE(state, path, mnee) & PATH_MNEE_CULL_LIGHT_CONNECTION) {
         /* This path should have been resolved with mnee, it will
          * generate a firefly for small lights since it is improbable. */
-        const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+        const ccl_global KernelLight *klight = &kernel_data_fetch(lights, lamp);
         if (klight->use_caustics)
           return;
       }
@@ -170,24 +170,23 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
       /* TODO: does aliasing like this break automatic SoA in CUDA? */
       ShaderDataTinyStorage emission_sd_storage;
       ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-      float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
+      Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
       if (is_zero(light_eval)) {
         return;
       }
 
       /* MIS weighting. */
+      float mis_weight = 1.0f;
       if (!(path_flag & PATH_RAY_MIS_SKIP)) {
         /* multiple importance sampling, get regular light pdf,
          * and compute weight with respect to BSDF pdf */
         const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-        const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
-        light_eval *= mis_weight;
+        mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
       }
 
       /* Write to render buffer. */
-      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-      kernel_accum_emission(
-          kg, state, throughput * light_eval, render_buffer, kernel_data.background.lightgroup);
+      film_write_surface_emission(
+          kg, state, light_eval, mis_weight, render_buffer, kernel_data.background.lightgroup);
     }
   }
 }
@@ -213,7 +212,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
   }
 #endif
 
-  INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+  integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shade_light.h b/intern/cycles/kernel/integrator/shade_light.h
index be926c78439..f2d65eddfbb 100644
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -3,8 +3,8 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/film/light_passes.h"
+#include "kernel/integrator/surface_shader.h"
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
 
@@ -22,19 +22,8 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
   const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
   const float ray_time = INTEGRATOR_STATE(state, ray, time);
 
-  /* Advance ray beyond light. */
-  /* TODO: can we make this more numerically robust to avoid reintersecting the
-   * same light in some cases? Ray should not intersect surface anymore as the
-   * object and prim ids will prevent self intersection. */
-  const float3 new_ray_P = ray_P + ray_D * isect.t;
-  INTEGRATOR_STATE_WRITE(state, ray, P) = new_ray_P;
-  INTEGRATOR_STATE_WRITE(state, ray, t) -= isect.t;
-
-  /* Set position to where the BSDF was sampled, for correct MIS PDF. */
-  const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
-  ray_P -= ray_D * mis_ray_t;
-  isect.t += mis_ray_t;
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = isect.t;
+  /* Advance ray to new start distance. */
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(isect.t);
 
   LightSample ls ccl_optional_struct_init;
   const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls);
@@ -62,23 +51,22 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
   /* TODO: does aliasing like this break automatic SoA in CUDA? */
   ShaderDataTinyStorage emission_sd_storage;
   ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-  float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
+  Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
   if (is_zero(light_eval)) {
     return;
   }
 
   /* MIS weighting. */
+  float mis_weight = 1.0f;
   if (!(path_flag & PATH_RAY_MIS_SKIP)) {
     /* multiple importance sampling, get regular light pdf,
      * and compute weight with respect to BSDF pdf */
     const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
-    light_eval *= mis_weight;
+    mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
   }
 
   /* Write to render buffer. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_accum_emission(kg, state, throughput * light_eval, render_buffer, ls.group);
+  film_write_surface_emission(kg, state, light_eval, mis_weight, render_buffer, ls.group);
 }
 
 ccl_device void integrator_shade_light(KernelGlobals kg,
@@ -99,11 +87,13 @@ ccl_device void integrator_shade_light(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(state, path, transparent_bounce) = transparent_bounce;
 
   if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
-    INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+    integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
     return;
   }
   else {
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+    integrator_path_next(kg,
+                         state,
+                         DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
                          DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
     return;
   }
diff --git a/intern/cycles/kernel/integrator/shade_shadow.h b/intern/cycles/kernel/integrator/shade_shadow.h
index 2b929b7b62e..ba18aed6ff0 100644
--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "kernel/integrator/shade_volume.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/surface_shader.h"
 #include "kernel/integrator/volume_stack.h"
 
 CCL_NAMESPACE_BEGIN
@@ -15,9 +15,9 @@ ccl_device_inline bool shadow_intersections_has_remaining(const uint num_hits)
 }
 
 #ifdef __TRANSPARENT_SHADOWS__
-ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
-                                                              IntegratorShadowState state,
-                                                              const int hit)
+ccl_device_inline Spectrum integrate_transparent_surface_shadow(KernelGlobals kg,
+                                                                IntegratorShadowState state,
+                                                                const int hit)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);
 
@@ -40,7 +40,7 @@ ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
 
   /* Evaluate shader. */
   if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+    surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
         kg, state, shadow_sd, NULL, PATH_RAY_SHADOW);
   }
 
@@ -50,7 +50,7 @@ ccl_device_inline float3 integrate_transparent_surface_shadow(KernelGlobals kg,
 #  endif
 
   /* Compute transparency from closures. */
-  return shader_bsdf_transparency(kg, shadow_sd);
+  return surface_shader_transparency(kg, shadow_sd);
 }
 
 #  ifdef __VOLUME__
@@ -58,7 +58,7 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
                                                            IntegratorShadowState state,
                                                            const int hit,
                                                            const int num_recorded_hits,
-                                                           ccl_private float3 *ccl_restrict
+                                                           ccl_private Spectrum *ccl_restrict
                                                                throughput)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME);
@@ -75,13 +75,9 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
   ray.self.light_object = OBJECT_NONE;
   ray.self.light_prim = PRIM_NONE;
   /* Modify ray position and length to match current segment. */
-  const float start_t = (hit == 0) ? 0.0f :
-                                     INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit - 1, t);
-  const float end_t = (hit < num_recorded_hits) ?
-                          INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit, t) :
-                          ray.t;
-  ray.P += start_t * ray.D;
-  ray.t = end_t - start_t;
+  ray.tmin = (hit == 0) ? ray.tmin : INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit - 1, t);
+  ray.tmax = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit, t) :
+                                         ray.tmax;
 
   shader_setup_from_volume(kg, shadow_sd, &ray);
 
@@ -104,7 +100,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
     if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) {
 #  ifdef __VOLUME__
       if (!integrator_state_shadow_volume_stack_is_empty(kg, state)) {
-        float3 throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
+        Spectrum throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
         integrate_transparent_volume_shadow(kg, state, hit, num_recorded_hits, &throughput);
         if (is_zero(throughput)) {
           return true;
@@ -117,8 +113,8 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
 
     /* Surface shaders. */
     if (hit < num_recorded_hits) {
-      const float3 shadow = integrate_transparent_surface_shadow(kg, state, hit);
-      const float3 throughput = INTEGRATOR_STATE(state, shadow_path, throughput) * shadow;
+      const Spectrum shadow = integrate_transparent_surface_shadow(kg, state, hit);
+      const Spectrum throughput = INTEGRATOR_STATE(state, shadow_path, throughput) * shadow;
       if (is_zero(throughput)) {
         return true;
       }
@@ -137,10 +133,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
     /* There are more hits that we could not recorded due to memory usage,
      * adjust ray to intersect again from the last hit. */
     const float last_hit_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, num_recorded_hits - 1, t);
-    const float3 ray_P = INTEGRATOR_STATE(state, shadow_ray, P);
-    const float3 ray_D = INTEGRATOR_STATE(state, shadow_ray, D);
-    INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray_P + last_hit_t * ray_D;
-    INTEGRATOR_STATE_WRITE(state, shadow_ray, t) -= last_hit_t;
+    INTEGRATOR_STATE_WRITE(state, shadow_ray, tmin) = intersection_t_offset(last_hit_t);
   }
 
   return false;
@@ -158,20 +151,22 @@ ccl_device void integrator_shade_shadow(KernelGlobals kg,
   /* Evaluate transparent shadows. */
   const bool opaque = integrate_transparent_shadow(kg, state, num_hits);
   if (opaque) {
-    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+    integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
     return;
   }
 #endif
 
   if (shadow_intersections_has_remaining(num_hits)) {
     /* More intersections to find, continue shadow ray. */
-    INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+    integrator_shadow_path_next(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
                                 DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
     return;
   }
   else {
-    kernel_accum_light(kg, state, render_buffer);
-    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+    film_write_direct_light(kg, state, render_buffer);
+    integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
     return;
   }
 }
diff --git a/intern/cycles/kernel/integrator/shade_surface.h b/intern/cycles/kernel/integrator/shade_surface.h
index ce1398859b7..c19f56a9b70 100644
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -3,14 +3,15 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/film/passes.h"
+#include "kernel/film/data_passes.h"
+#include "kernel/film/denoising_passes.h"
+#include "kernel/film/light_passes.h"
 
 #include "kernel/integrator/mnee.h"
 
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
 #include "kernel/integrator/subsurface.h"
+#include "kernel/integrator/surface_shader.h"
 #include "kernel/integrator/volume_stack.h"
 
 #include "kernel/light/light.h"
@@ -31,7 +32,52 @@ ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
   shader_setup_from_ray(kg, sd, &ray, &isect);
 }
 
-#ifdef __HOLDOUT__
+ccl_device_forceinline float3 integrate_surface_ray_offset(KernelGlobals kg,
+                                                           const ccl_private ShaderData *sd,
+                                                           const float3 ray_P,
+                                                           const float3 ray_D)
+{
+  /* No ray offset needed for other primitive types. */
+  if (!(sd->type & PRIMITIVE_TRIANGLE)) {
+    return ray_P;
+  }
+
+  /* Returns ray_P unchanged, or ray_offset(ray_P, sd->Ng) when the test below fails.
+   *
+   * Self intersection tests already account for the case where a ray hits the
+   * same primitive. However precision issues can still cause neighboring
+   * triangles to be hit. Here we test if the ray-triangle intersection with
+   * the same primitive would miss, implying that a neighboring triangle would
+   * be hit instead. This relies on triangle intersection being watertight, and on
+   * the inverse object transform matching the one used by ray intersection exactly.
+   *
+   * Potential improvements:
+   * - It appears this happens when either barycentric coordinates are small,
+   *   or dot(sd->Ng, ray_D) is small. Detect such cases and skip test?
+   * - Instead of ray offset, can we tweak P to lie within the triangle?
+   */
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, sd->prim).w;
+  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
+                      tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
+                      tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+
+  float3 local_ray_P = ray_P;
+  float3 local_ray_D = ray_D;
+
+  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+    const Transform itfm = object_get_inverse_transform(kg, sd);
+    local_ray_P = transform_point(&itfm, local_ray_P);
+    local_ray_D = transform_direction(&itfm, local_ray_D);
+  }
+
+  if (ray_triangle_intersect_self(local_ray_P, local_ray_D, tri_a, tri_b, tri_c)) {
+    return ray_P;
+  }
+  else {
+    return ray_offset(ray_P, sd->Ng);
+  }
+}
+
 ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg,
                                                       ConstIntegratorState state,
                                                       ccl_private ShaderData *sd,
@@ -42,22 +88,18 @@ ccl_device_forceinline bool integrate_surface_holdout(KernelGlobals kg,
 
   if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
       (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
-    const float3 holdout_weight = shader_holdout_apply(kg, sd);
-    if (kernel_data.background.transparent) {
-      const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-      const float transparent = average(holdout_weight * throughput);
-      kernel_accum_holdout(kg, state, path_flag, transparent, render_buffer);
-    }
-    if (isequal_float3(holdout_weight, one_float3())) {
+    const Spectrum holdout_weight = surface_shader_apply_holdout(kg, sd);
+    const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+    const float transparent = average(holdout_weight * throughput);
+    film_write_holdout(kg, state, path_flag, transparent, render_buffer);
+    if (isequal(holdout_weight, one_spectrum())) {
       return false;
     }
   }
 
   return true;
 }
-#endif /* __HOLDOUT__ */
 
-#ifdef __EMISSION__
 ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
                                                        ConstIntegratorState state,
                                                        ccl_private const ShaderData *sd,
@@ -67,32 +109,29 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
 
   /* Evaluate emissive closure. */
-  float3 L = shader_emissive_eval(sd);
+  Spectrum L = surface_shader_emission(sd);
+  float mis_weight = 1.0f;
 
-#  ifdef __HAIR__
+#ifdef __HAIR__
   if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
       (sd->type & PRIMITIVE_TRIANGLE))
-#  else
+#else
   if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
-#  endif
+#endif
   {
     const float bsdf_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float t = sd->ray_length + INTEGRATOR_STATE(state, path, mis_ray_t);
+    const float t = sd->ray_length;
 
     /* Multiple importance sampling, get triangle light pdf,
      * and compute weight with respect to BSDF pdf. */
     float pdf = triangle_light_pdf(kg, sd, t);
-    float mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
-    L *= mis_weight;
+    mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
   }
 
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_accum_emission(
-      kg, state, throughput * L, render_buffer, object_lightgroup(kg, sd->object));
+  film_write_surface_emission(
+      kg, state, L, mis_weight, render_buffer, object_lightgroup(kg, sd->object));
 }
-#endif /* __EMISSION__ */
 
-#ifdef __EMISSION__
 /* Path tracing: sample point on light and evaluate light shader, then
  * queue shadow ray to be traced. */
 template<uint node_feature_mask>
@@ -111,11 +150,10 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   {
     const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
     const uint bounce = INTEGRATOR_STATE(state, path, bounce);
-    float light_u, light_v;
-    path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+    const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);
 
     if (!light_distribution_sample_from_position(
-            kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) {
+            kg, rand_light.x, rand_light.y, sd->time, sd->P, bounce, path_flag, &ls)) {
       return;
     }
   }
@@ -133,15 +171,15 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
 
   Ray ray ccl_optional_struct_init;
   BsdfEval bsdf_eval ccl_optional_struct_init;
-  const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D);
+  const bool is_transmission = surface_shader_is_transmission(sd, ls.D);
 
-#  ifdef __MNEE__
+#ifdef __MNEE__
   int mnee_vertex_count = 0;
   IF_KERNEL_FEATURE(MNEE)
   {
     if (ls.lamp != LAMP_NONE) {
       /* Is this a caustic light? */
-      const bool use_caustics = kernel_tex_fetch(__lights, ls.lamp).use_caustics;
+      const bool use_caustics = kernel_data_fetch(lights, ls.lamp).use_caustics;
       if (use_caustics) {
         /* Are we on a caustic caster? */
         if (is_transmission && (sd->object_flag & SD_OBJECT_CAUSTICS_CASTER))
@@ -161,16 +199,17 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
     light_sample_to_surface_shadow_ray(kg, emission_sd, &ls, &ray);
   }
   else
-#  endif /* __MNEE__ */
+#endif /* __MNEE__ */
   {
-    const float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, sd->time);
+    const Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, sd->time);
     if (is_zero(light_eval)) {
       return;
     }
 
     /* Evaluate BSDF. */
-    const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader);
-    bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);
+    const float bsdf_pdf = surface_shader_bsdf_eval(
+        kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader);
+    bsdf_eval_mul(&bsdf_eval, light_eval / ls.pdf);
 
     if (ls.shader & SHADER_USE_MIS) {
       const float mis_weight = light_sample_mis_weight_nee(kg, ls.pdf, bsdf_pdf);
@@ -190,16 +229,20 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   const bool is_light = light_sample_is_light(&ls);
 
   /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(
-      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
+  IntegratorShadowState shadow_state = integrator_shadow_path_init(
+      kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false);
 
   /* Copy volume stack and enter/exit volume. */
   integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
 
   if (is_transmission) {
-#  ifdef __VOLUME__
+#ifdef __VOLUME__
     shadow_volume_stack_enter_exit(kg, shadow_state, sd);
-#  endif
+#endif
+  }
+
+  if (ray.self.object != OBJECT_NONE) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
   }
 
   /* Write shadow ray and associated state to global memory. */
@@ -213,11 +256,12 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   /* Copy state from main path to shadow path. */
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput) *
+                              bsdf_eval_sum(&bsdf_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    packed_float3 pass_diffuse_weight;
-    packed_float3 pass_glossy_weight;
+    PackedSpectrum pass_diffuse_weight;
+    PackedSpectrum pass_glossy_weight;
 
     if (shadow_flag & PATH_RAY_ANY_PASS) {
       /* Indirect bounce, use weights from earlier surface or volume bounce. */
@@ -227,8 +271,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
     else {
       /* Direct light, use BSDFs at this bounce. */
       shadow_flag |= PATH_RAY_SURFACE_PASS;
-      pass_diffuse_weight = packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval));
-      pass_glossy_weight = packed_float3(bsdf_eval_pass_glossy_weight(&bsdf_eval));
+      pass_diffuse_weight = PackedSpectrum(bsdf_eval_pass_diffuse_weight(&bsdf_eval));
+      pass_glossy_weight = PackedSpectrum(bsdf_eval_pass_glossy_weight(&bsdf_eval));
     }
 
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
@@ -250,7 +294,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, glossy_bounce) = INTEGRATOR_STATE(
       state, path, glossy_bounce);
 
-#  ifdef __MNEE__
+#ifdef __MNEE__
   if (mnee_vertex_count > 0) {
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) =
         INTEGRATOR_STATE(state, path, transmission_bounce) + mnee_vertex_count - 1;
@@ -262,7 +306,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
                            bounce) = INTEGRATOR_STATE(state, path, bounce) + mnee_vertex_count;
   }
   else
-#  endif
+#endif
   {
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, transmission_bounce) = INTEGRATOR_STATE(
         state, path, transmission_bounce);
@@ -284,7 +328,6 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
                                                    ls.group + 1 :
                                                    kernel_data.background.lightgroup + 1;
 }
-#endif
 
 /* Path tracing: bounce off or through surface with new direction. */
 ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
@@ -298,9 +341,8 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
     return LABEL_NONE;
   }
 
-  float bsdf_u, bsdf_v;
-  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-  ccl_private const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u);
+  float2 rand_bsdf = path_state_rng_2D(kg, rng_state, PRNG_SURFACE_BSDF);
+  ccl_private const ShaderClosure *sc = surface_shader_bsdf_bssrdf_pick(sd, &rand_bsdf);
 
 #ifdef __SUBSURFACE__
   /* BSSRDF closure, we schedule subsurface intersection kernel. */
@@ -313,29 +355,33 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
   float bsdf_pdf;
   BsdfEval bsdf_eval ccl_optional_struct_init;
   float3 bsdf_omega_in ccl_optional_struct_init;
-  differential3 bsdf_domega_in ccl_optional_struct_init;
   int label;
 
-  label = shader_bsdf_sample_closure(
-      kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+  label = surface_shader_bsdf_sample_closure(
+      kg, sd, sc, rand_bsdf, &bsdf_eval, &bsdf_omega_in, &bsdf_pdf);
 
   if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
     return LABEL_NONE;
   }
 
-  /* Setup ray. Note that clipping works through transparent bounces. */
-  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
-  INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
-  INTEGRATOR_STATE_WRITE(state, ray, t) = (label & LABEL_TRANSPARENT) ?
-                                              INTEGRATOR_STATE(state, ray, t) - sd->ray_length :
-                                              FLT_MAX;
+  if (label & LABEL_TRANSPARENT) {
+    /* Only need to modify start distance for transparent. */
+    INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(sd->ray_length);
+  }
+  else {
+    /* Setup ray with changed origin and direction. */
+    const float3 D = normalize(bsdf_omega_in);
+    INTEGRATOR_STATE_WRITE(state, ray, P) = integrate_surface_ray_offset(kg, sd, sd->P, D);
+    INTEGRATOR_STATE_WRITE(state, ray, D) = D;
+    INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
+    INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-  INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
-  INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(bsdf_domega_in);
+    INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
 #endif
+  }
 
   /* Update throughput. */
-  float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+  Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
   throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
   INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput;
 
@@ -349,12 +395,8 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
   }
 
   /* Update path state */
-  if (label & LABEL_TRANSPARENT) {
-    INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) += sd->ray_length;
-  }
-  else {
+  if (!(label & LABEL_TRANSPARENT)) {
     INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = bsdf_pdf;
-    INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
     INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
         bsdf_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
   }
@@ -371,17 +413,8 @@ ccl_device_forceinline int integrate_surface_volume_only_bounce(IntegratorState
     return LABEL_NONE;
   }
 
-  /* Setup ray position, direction stays unchanged. */
-  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
-
-  /* Clipping works through transparent. */
-  INTEGRATOR_STATE_WRITE(state, ray, t) -= sd->ray_length;
-
-#  ifdef __RAY_DIFFERENTIALS__
-  INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
-#  endif
-
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) += sd->ray_length;
+  /* Only modify start distance. */
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(sd->ray_length);
 
   return LABEL_TRANSMIT | LABEL_TRANSPARENT;
 }
@@ -416,23 +449,26 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
     return;
   }
 
-  float bsdf_u, bsdf_v;
-  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+  const float2 rand_bsdf = path_state_rng_2D(kg, rng_state, PRNG_SURFACE_BSDF);
 
   float3 ao_N;
-  const float3 ao_weight = shader_bsdf_ao(
+  const Spectrum ao_weight = surface_shader_ao(
       kg, sd, kernel_data.integrator.ao_additive_factor, &ao_N);
 
   float3 ao_D;
   float ao_pdf;
-  sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+  sample_cos_hemisphere(ao_N, rand_bsdf.x, rand_bsdf.y, &ao_D, &ao_pdf);
 
   bool skip_self = true;
 
   Ray ray ccl_optional_struct_init;
   ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self);
   ray.D = ao_D;
-  ray.t = kernel_data.integrator.ao_bounces_distance;
+  if (skip_self) {
+    ray.P = integrate_surface_ray_offset(kg, sd, ray.P, ray.D);
+  }
+  ray.tmin = 0.0f;
+  ray.tmax = kernel_data.integrator.ao_bounces_distance;
   ray.time = sd->time;
   ray.self.object = (skip_self) ? sd->object : OBJECT_NONE;
   ray.self.prim = (skip_self) ? sd->prim : PRIM_NONE;
@@ -442,7 +478,8 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
   ray.dD = differential_zero_compact();
 
   /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao);
+  IntegratorShadowState shadow_state = integrator_shadow_path_init(
+      kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, true);
 
   /* Copy volume stack and enter/exit volume. */
   integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@@ -458,7 +495,8 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
   const uint16_t bounce = INTEGRATOR_STATE(state, path, bounce);
   const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag) | PATH_RAY_SHADOW_FOR_AO;
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * shader_bsdf_alpha(kg, sd);
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput) *
+                              surface_shader_alpha(kg, sd);
 
   INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, render_pixel_index) = INTEGRATOR_STATE(
       state, path, render_pixel_index);
@@ -507,7 +545,7 @@ ccl_device bool integrate_surface(KernelGlobals kg,
     {
       /* Evaluate shader. */
       PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
-      shader_eval_surface<node_feature_mask>(kg, state, &sd, render_buffer, path_flag);
+      surface_shader_eval<node_feature_mask>(kg, state, &sd, render_buffer, path_flag);
 
       /* Initialize additional RNG for BSDFs. */
       if (sd.flag & SD_BSDF_NEEDS_LCG) {
@@ -529,21 +567,17 @@ ccl_device bool integrate_surface(KernelGlobals kg,
 #endif
     {
       /* Filter closures. */
-      shader_prepare_surface_closures(kg, state, &sd, path_flag);
+      surface_shader_prepare_closures(kg, state, &sd, path_flag);
 
-#ifdef __HOLDOUT__
       /* Evaluate holdout. */
       if (!integrate_surface_holdout(kg, state, &sd, render_buffer)) {
         return false;
       }
-#endif
 
-#ifdef __EMISSION__
       /* Write emission. */
       if (sd.flag & SD_EMISSION) {
         integrate_surface_emission(kg, state, &sd, render_buffer);
       }
-#endif
 
       /* Perform path termination. Most paths have already been terminated in
        * the intersect_closest kernel, this is just for emission and for dividing
@@ -557,11 +591,11 @@ ccl_device bool integrate_surface(KernelGlobals kg,
       /* Write render passes. */
 #ifdef __PASSES__
       PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES);
-      kernel_write_data_passes(kg, state, &sd, render_buffer);
+      film_write_data_passes(kg, state, &sd, render_buffer);
 #endif
 
 #ifdef __DENOISING_FEATURES__
-      kernel_write_denoising_features_surface(kg, state, &sd, render_buffer);
+      film_write_denoising_features_surface(kg, state, &sd, render_buffer);
 #endif
     }
 
@@ -604,22 +638,23 @@ ccl_device bool integrate_surface(KernelGlobals kg,
 }
 
 template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE,
-         int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
+         DeviceKernel current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
 ccl_device_forceinline void integrator_shade_surface(KernelGlobals kg,
                                                      IntegratorState state,
                                                      ccl_global float *ccl_restrict render_buffer)
 {
   if (integrate_surface<node_feature_mask>(kg, state, render_buffer)) {
     if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SUBSURFACE) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+      integrator_path_next(
+          kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
     }
     else {
-      kernel_assert(INTEGRATOR_STATE(state, ray, t) != 0.0f);
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+      kernel_assert(INTEGRATOR_STATE(state, ray, tmax) != 0.0f);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
     }
   }
   else {
-    INTEGRATOR_PATH_TERMINATE(current_kernel);
+    integrator_path_terminate(kg, state, current_kernel);
   }
 }
 
diff --git a/intern/cycles/kernel/integrator/shade_volume.h b/intern/cycles/kernel/integrator/shade_volume.h
index 4a5015946aa..aaef92729d6 100644
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -3,12 +3,13 @@
 
 #pragma once
 
-#include "kernel/film/accumulate.h"
-#include "kernel/film/passes.h"
+#include "kernel/film/data_passes.h"
+#include "kernel/film/denoising_passes.h"
+#include "kernel/film/light_passes.h"
 
 #include "kernel/integrator/intersect_closest.h"
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/volume_shader.h"
 #include "kernel/integrator/volume_stack.h"
 
 #include "kernel/light/light.h"
@@ -29,13 +30,13 @@ typedef enum VolumeIntegrateEvent {
 typedef struct VolumeIntegrateResult {
   /* Throughput and offset for direct light scattering. */
   bool direct_scatter;
-  float3 direct_throughput;
+  Spectrum direct_throughput;
   float direct_t;
   ShaderVolumePhases direct_phases;
 
   /* Throughput and offset for indirect light scattering. */
   bool indirect_scatter;
-  float3 indirect_throughput;
+  Spectrum indirect_throughput;
   float indirect_t;
   ShaderVolumePhases indirect_phases;
 } VolumeIntegrateResult;
@@ -52,19 +53,19 @@ typedef struct VolumeIntegrateResult {
  * sigma_t = sigma_a + sigma_s */
 
 typedef struct VolumeShaderCoefficients {
-  float3 sigma_t;
-  float3 sigma_s;
-  float3 emission;
+  Spectrum sigma_t;
+  Spectrum sigma_s;
+  Spectrum emission;
 } VolumeShaderCoefficients;
 
 /* Evaluate shader to get extinction coefficient at P. */
 ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
                                                    IntegratorShadowState state,
                                                    ccl_private ShaderData *ccl_restrict sd,
-                                                   ccl_private float3 *ccl_restrict extinction)
+                                                   ccl_private Spectrum *ccl_restrict extinction)
 {
   VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
-  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);
+  volume_shader_eval<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);
 
   if (!(sd->flag & SD_EXTINCTION)) {
     return false;
@@ -83,15 +84,16 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
 {
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
   VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
-  shader_eval_volume<false>(kg, state, sd, path_flag, volume_read_lambda_pass);
+  volume_shader_eval<false>(kg, state, sd, path_flag, volume_read_lambda_pass);
 
   if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
     return false;
   }
 
-  coeff->sigma_s = zero_float3();
-  coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
-  coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
+  coeff->sigma_s = zero_spectrum();
+  coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction :
+                                                zero_spectrum();
+  coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_spectrum();
 
   if (sd->flag & SD_SCATTER) {
     for (int i = 0; i < sd->num_closure; i++) {
@@ -114,7 +116,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
 ccl_device_forceinline void volume_step_init(KernelGlobals kg,
                                              ccl_private const RNGState *rng_state,
                                              const float object_step_size,
-                                             float t,
+                                             const float tmin,
+                                             const float tmax,
                                              ccl_private float *step_size,
                                              ccl_private float *step_shade_offset,
                                              ccl_private float *steps_offset,
@@ -122,7 +125,7 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
 {
   if (object_step_size == FLT_MAX) {
     /* Homogeneous volume. */
-    *step_size = t;
+    *step_size = tmax - tmin;
     *step_shade_offset = 0.0f;
     *steps_offset = 1.0f;
     *max_steps = 1;
@@ -130,6 +133,7 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
   else {
     /* Heterogeneous volume. */
     *max_steps = kernel_data.integrator.volume_max_steps;
+    const float t = tmax - tmin;
     float step = min(object_step_size, t);
 
     /* compute exact steps in advance for malloc */
@@ -141,11 +145,11 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
 
     /* Perform shading at this offset within a step, to integrate over
      * over the entire step segment. */
-    *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4);
+    *step_shade_offset = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_SHADE_OFFSET);
 
     /* Shift starting point of all segment by this random amount to avoid
      * banding artifacts from the volume bounding shape. */
-    *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3);
+    *steps_offset = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_OFFSET);
   }
 }
 
@@ -160,12 +164,12 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
 ccl_device void volume_shadow_homogeneous(KernelGlobals kg, IntegratorState state,
                                           ccl_private Ray *ccl_restrict ray,
                                           ccl_private ShaderData *ccl_restrict sd,
-                                          ccl_global float3 *ccl_restrict throughput)
+                                          ccl_global Spectrum *ccl_restrict throughput)
 {
-  float3 sigma_t = zero_float3();
+  Spectrum sigma_t = zero_spectrum();
 
   if (shadow_volume_shader_sample(kg, state, sd, &sigma_t)) {
-    *throughput *= volume_color_transmittance(sigma_t, ray->t);
+    *throughput *= volume_color_transmittance(sigma_t, ray->tmax - ray->tmin);
   }
 }
 #  endif
@@ -176,14 +180,14 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
                                             IntegratorShadowState state,
                                             ccl_private Ray *ccl_restrict ray,
                                             ccl_private ShaderData *ccl_restrict sd,
-                                            ccl_private float3 *ccl_restrict throughput,
+                                            ccl_private Spectrum *ccl_restrict throughput,
                                             const float object_step_size)
 {
   /* Load random number state. */
   RNGState rng_state;
   shadow_path_state_rng_load(state, &rng_state);
 
-  float3 tp = *throughput;
+  Spectrum tp = *throughput;
 
   /* Prepare for stepping.
    * For shadows we do not offset all segments, since the starting point is
@@ -194,7 +198,8 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
   volume_step_init(kg,
                    &rng_state,
                    object_step_size,
-                   ray->t,
+                   ray->tmin,
+                   ray->tmax,
                    &step_size,
                    &step_shade_offset,
                    &unused,
@@ -202,17 +207,17 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
   const float steps_offset = 1.0f;
 
   /* compute extinction at the start */
-  float t = 0.0f;
+  float t = ray->tmin;
 
-  float3 sum = zero_float3();
+  Spectrum sum = zero_spectrum();
 
   for (int i = 0; i < max_steps; i++) {
     /* advance to new position */
-    float new_t = min(ray->t, (i + steps_offset) * step_size);
+    float new_t = min(ray->tmax, ray->tmin + (i + steps_offset) * step_size);
     float dt = new_t - t;
 
     float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
-    float3 sigma_t = zero_float3();
+    Spectrum sigma_t = zero_spectrum();
 
     /* compute attenuation over segment */
     sd->P = new_P;
@@ -222,20 +227,19 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
        * check then. */
       sum += (-sigma_t * dt);
       if ((i & 0x07) == 0) { /* TODO: Other interval? */
-        tp = *throughput * exp3(sum);
+        tp = *throughput * exp(sum);
 
         /* stop if nearly all light is blocked */
-        if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
-            tp.z < VOLUME_THROUGHPUT_EPSILON)
+        if (reduce_max(tp) < VOLUME_THROUGHPUT_EPSILON)
           break;
       }
     }
 
     /* stop if at the end of the volume */
     t = new_t;
-    if (t == ray->t) {
+    if (t == ray->tmax) {
       /* Update throughput in case we haven't done it above */
-      tp = *throughput * exp3(sum);
+      tp = *throughput * exp(sum);
       break;
     }
   }
@@ -257,15 +261,16 @@ ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict r
                                            const float xi,
                                            ccl_private float *pdf)
 {
-  const float t = ray->t;
+  const float tmin = ray->tmin;
+  const float tmax = ray->tmax;
   const float delta = dot((light_P - ray->P), ray->D);
   const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
   if (UNLIKELY(D == 0.0f)) {
     *pdf = 0.0f;
     return 0.0f;
   }
-  const float theta_a = -atan2f(delta, D);
-  const float theta_b = atan2f(t - delta, D);
+  const float theta_a = atan2f(tmin - delta, D);
+  const float theta_b = atan2f(tmax - delta, D);
   const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
   if (UNLIKELY(theta_b == theta_a)) {
     *pdf = 0.0f;
@@ -273,7 +278,7 @@ ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict r
   }
   *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
 
-  return min(t, delta + t_); /* min is only for float precision errors */
+  return clamp(delta + t_, tmin, tmax); /* clamp is only for float precision errors */
 }
 
 ccl_device float volume_equiangular_pdf(ccl_private const Ray *ccl_restrict ray,
@@ -286,11 +291,12 @@ ccl_device float volume_equiangular_pdf(ccl_private const Ray *ccl_restrict ray,
     return 0.0f;
   }
 
-  const float t = ray->t;
+  const float tmin = ray->tmin;
+  const float tmax = ray->tmax;
   const float t_ = sample_t - delta;
 
-  const float theta_a = -atan2f(delta, D);
-  const float theta_b = atan2f(t - delta, D);
+  const float theta_a = atan2f(tmin - delta, D);
+  const float theta_b = atan2f(tmax - delta, D);
   if (UNLIKELY(theta_b == theta_a)) {
     return 0.0f;
   }
@@ -310,11 +316,12 @@ ccl_device float volume_equiangular_cdf(ccl_private const Ray *ccl_restrict ray,
     return 0.0f;
   }
 
-  const float t = ray->t;
+  const float tmin = ray->tmin;
+  const float tmax = ray->tmax;
   const float t_ = sample_t - delta;
 
-  const float theta_a = -atan2f(delta, D);
-  const float theta_b = atan2f(t - delta, D);
+  const float theta_a = atan2f(tmin - delta, D);
+  const float theta_b = atan2f(tmax - delta, D);
   if (UNLIKELY(theta_b == theta_a)) {
     return 0.0f;
   }
@@ -328,22 +335,22 @@ ccl_device float volume_equiangular_cdf(ccl_private const Ray *ccl_restrict ray,
 /* Distance sampling */
 
 ccl_device float volume_distance_sample(float max_t,
-                                        float3 sigma_t,
+                                        Spectrum sigma_t,
                                         int channel,
                                         float xi,
-                                        ccl_private float3 *transmittance,
-                                        ccl_private float3 *pdf)
+                                        ccl_private Spectrum *transmittance,
+                                        ccl_private Spectrum *pdf)
 {
   /* xi is [0, 1[ so log(0) should never happen, division by zero is
    * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
   float sample_sigma_t = volume_channel_get(sigma_t, channel);
-  float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+  Spectrum full_transmittance = volume_color_transmittance(sigma_t, max_t);
   float sample_transmittance = volume_channel_get(full_transmittance, channel);
 
   float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
 
   *transmittance = volume_color_transmittance(sigma_t, sample_t);
-  *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
+  *pdf = safe_divide_color(sigma_t * *transmittance, one_spectrum() - full_transmittance);
 
   /* todo: optimization: when taken together with hit/miss decision,
    * the full_transmittance cancels out drops out and xi does not
@@ -352,33 +359,36 @@ ccl_device float volume_distance_sample(float max_t,
   return sample_t;
 }
 
-ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
+ccl_device Spectrum volume_distance_pdf(float max_t, Spectrum sigma_t, float sample_t)
 {
-  float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
-  float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
+  Spectrum full_transmittance = volume_color_transmittance(sigma_t, max_t);
+  Spectrum transmittance = volume_color_transmittance(sigma_t, sample_t);
 
-  return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
+  return safe_divide_color(sigma_t * transmittance, one_spectrum() - full_transmittance);
 }
 
 /* Emission */
 
-ccl_device float3 volume_emission_integrate(ccl_private VolumeShaderCoefficients *coeff,
-                                            int closure_flag,
-                                            float3 transmittance,
-                                            float t)
+ccl_device Spectrum volume_emission_integrate(ccl_private VolumeShaderCoefficients *coeff,
+                                              int closure_flag,
+                                              Spectrum transmittance,
+                                              float t)
 {
   /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
    * this goes to E * t as sigma_t goes to zero
    *
    * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
-  float3 emission = coeff->emission;
+  Spectrum emission = coeff->emission;
 
   if (closure_flag & SD_EXTINCTION) {
-    float3 sigma_t = coeff->sigma_t;
+    Spectrum sigma_t = coeff->sigma_t;
 
-    emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
-    emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
-    emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
+    FOREACH_SPECTRUM_CHANNEL (i) {
+      GET_SPECTRUM_CHANNEL(emission, i) *= (GET_SPECTRUM_CHANNEL(sigma_t, i) > 0.0f) ?
+                                               (1.0f - GET_SPECTRUM_CHANNEL(transmittance, i)) /
+                                                   GET_SPECTRUM_CHANNEL(sigma_t, i) :
+                                               t;
+    }
   }
   else
     emission *= t;
@@ -390,8 +400,8 @@ ccl_device float3 volume_emission_integrate(ccl_private VolumeShaderCoefficients
 
 typedef struct VolumeIntegrateState {
   /* Volume segment extents. */
-  float start_t;
-  float end_t;
+  float tmin;
+  float tmax;
 
   /* If volume is absorption-only up to this point, and no probabilistic
    * scattering or termination has been used yet. */
@@ -413,27 +423,27 @@ ccl_device_forceinline void volume_integrate_step_scattering(
     ccl_private const Ray *ray,
     const float3 equiangular_light_P,
     ccl_private const VolumeShaderCoefficients &ccl_restrict coeff,
-    const float3 transmittance,
+    const Spectrum transmittance,
     ccl_private VolumeIntegrateState &ccl_restrict vstate,
     ccl_private VolumeIntegrateResult &ccl_restrict result)
 {
   /* Pick random color channel, we use the Veach one-sample
    * model with balance heuristic for the channels. */
-  const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
-  float3 channel_pdf;
+  const Spectrum albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+  Spectrum channel_pdf;
   const int channel = volume_sample_channel(
       albedo, result.indirect_throughput, vstate.rphase, &channel_pdf);
 
   /* Equiangular sampling for direct lighting. */
   if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
-    if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t &&
+    if (result.direct_t >= vstate.tmin && result.direct_t <= vstate.tmax &&
         vstate.equiangular_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
-      const float new_dt = result.direct_t - vstate.start_t;
-      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+      const float new_dt = result.direct_t - vstate.tmin;
+      const Spectrum new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
 
       result.direct_scatter = true;
       result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf;
-      shader_copy_volume_phases(&result.direct_phases, sd);
+      volume_shader_copy_phases(&result.direct_phases, sd);
 
       /* Multiple importance sampling. */
       if (vstate.use_mis) {
@@ -458,10 +468,10 @@ ccl_device_forceinline void volume_integrate_step_scattering(
       /* compute sampling distance */
       const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel);
       const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t;
-      const float new_t = vstate.start_t + new_dt;
+      const float new_t = vstate.tmin + new_dt;
 
       /* transmittance and pdf */
-      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+      const Spectrum new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
       const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);
 
       if (vstate.distance_pdf * distance_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
@@ -469,7 +479,7 @@ ccl_device_forceinline void volume_integrate_step_scattering(
         result.indirect_scatter = true;
         result.indirect_t = new_t;
         result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
-        shader_copy_volume_phases(&result.indirect_phases, sd);
+        volume_shader_copy_phases(&result.indirect_phases, sd);
 
         if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
           /* If using distance sampling for direct light, just copy parameters
@@ -477,7 +487,7 @@ ccl_device_forceinline void volume_integrate_step_scattering(
           result.direct_scatter = true;
           result.direct_t = result.indirect_t;
           result.direct_throughput = result.indirect_throughput;
-          shader_copy_volume_phases(&result.direct_phases, sd);
+          volume_shader_copy_phases(&result.direct_phases, sd);
 
           /* Multiple importance sampling. */
           if (vstate.use_mis) {
@@ -528,7 +538,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
   volume_step_init(kg,
                    rng_state,
                    object_step_size,
-                   ray->t,
+                   ray->tmin,
+                   ray->tmax,
                    &step_size,
                    &step_shade_offset,
                    &steps_offset,
@@ -536,11 +547,11 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 
   /* Initialize volume integration state. */
   VolumeIntegrateState vstate ccl_optional_struct_init;
-  vstate.start_t = 0.0f;
-  vstate.end_t = 0.0f;
+  vstate.tmin = ray->tmin;
+  vstate.tmax = ray->tmin;
   vstate.absorption_only = true;
-  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE);
-  vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL);
+  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_SCATTER_DISTANCE);
+  vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_VOLUME_PHASE_CHANNEL);
 
   /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */
   vstate.direct_sample_method = direct_sample_method;
@@ -559,7 +570,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
   vstate.distance_pdf = 1.0f;
 
   /* Initialize volume integration result. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
   result.direct_throughput = throughput;
   result.indirect_throughput = throughput;
 
@@ -572,14 +583,14 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 #  ifdef __DENOISING_FEATURES__
   const bool write_denoising_features = (INTEGRATOR_STATE(state, path, flag) &
                                          PATH_RAY_DENOISING_FEATURES);
-  float3 accum_albedo = zero_float3();
+  Spectrum accum_albedo = zero_spectrum();
 #  endif
-  float3 accum_emission = zero_float3();
+  Spectrum accum_emission = zero_spectrum();
 
   for (int i = 0; i < max_steps; i++) {
     /* Advance to new position */
-    vstate.end_t = min(ray->t, (i + steps_offset) * step_size);
-    const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset;
+    vstate.tmax = min(ray->tmax, ray->tmin + (i + steps_offset) * step_size);
+    const float shade_t = vstate.tmin + (vstate.tmax - vstate.tmin) * step_shade_offset;
     sd->P = ray->P + ray->D * shade_t;
 
     /* compute segment */
@@ -588,17 +599,17 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
       const int closure_flag = sd->flag;
 
       /* Evaluate transmittance over segment. */
-      const float dt = (vstate.end_t - vstate.start_t);
-      const float3 transmittance = (closure_flag & SD_EXTINCTION) ?
-                                       volume_color_transmittance(coeff.sigma_t, dt) :
-                                       one_float3();
+      const float dt = (vstate.tmax - vstate.tmin);
+      const Spectrum transmittance = (closure_flag & SD_EXTINCTION) ?
+                                         volume_color_transmittance(coeff.sigma_t, dt) :
+                                         one_spectrum();
 
       /* Emission. */
       if (closure_flag & SD_EMISSION) {
         /* Only write emission before indirect light scatter position, since we terminate
          * stepping at that point if we have already found a direct light scatter position. */
         if (!result.indirect_scatter) {
-          const float3 emission = volume_emission_integrate(
+          const Spectrum emission = volume_emission_integrate(
               &coeff, closure_flag, transmittance, dt);
           accum_emission += result.indirect_throughput * emission;
         }
@@ -609,8 +620,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 #  ifdef __DENOISING_FEATURES__
           /* Accumulate albedo for denoising features. */
           if (write_denoising_features && (closure_flag & SD_SCATTER)) {
-            const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
-            accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance);
+            const Spectrum albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+            accum_albedo += result.indirect_throughput * albedo * (one_spectrum() - transmittance);
           }
 #  endif
 
@@ -626,13 +637,13 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 
         /* Stop if nearly all light blocked. */
         if (!result.indirect_scatter) {
-          if (max3(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) {
-            result.indirect_throughput = zero_float3();
+          if (reduce_max(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+            result.indirect_throughput = zero_spectrum();
             break;
           }
         }
         else if (!result.direct_scatter) {
-          if (max3(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+          if (reduce_max(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) {
             break;
           }
         }
@@ -645,28 +656,27 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
     }
 
     /* Stop if at the end of the volume. */
-    vstate.start_t = vstate.end_t;
-    if (vstate.start_t == ray->t) {
+    vstate.tmin = vstate.tmax;
+    if (vstate.tmin == ray->tmax) {
       break;
     }
   }
 
   /* Write accumulated emission. */
   if (!is_zero(accum_emission)) {
-    kernel_accum_emission(
+    film_write_volume_emission(
         kg, state, accum_emission, render_buffer, object_lightgroup(kg, sd->object));
   }
 
 #  ifdef __DENOISING_FEATURES__
   /* Write denoising features. */
   if (write_denoising_features) {
-    kernel_write_denoising_features_volume(
+    film_write_denoising_features_volume(
         kg, state, accum_albedo, result.indirect_scatter, render_buffer);
   }
 #  endif /* __DENOISING_FEATURES__ */
 }
 
-#  ifdef __EMISSION__
 /* Path tracing: sample point on light and evaluate light shader, then
  * queue shadow ray to be traced. */
 ccl_device_forceinline bool integrate_volume_sample_light(
@@ -684,11 +694,10 @@ ccl_device_forceinline bool integrate_volume_sample_light(
   /* Sample position on a light. */
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
   const uint bounce = INTEGRATOR_STATE(state, path, bounce);
-  float light_u, light_v;
-  path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+  const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);
 
   if (!light_distribution_sample_from_volume_segment(
-          kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls)) {
+          kg, rand_light.x, rand_light.y, sd->time, sd->P, bounce, path_flag, ls)) {
     return false;
   }
 
@@ -708,7 +717,7 @@ ccl_device_forceinline void integrate_volume_direct_light(
     ccl_private const RNGState *ccl_restrict rng_state,
     const float3 P,
     ccl_private const ShaderVolumePhases *ccl_restrict phases,
-    ccl_private const float3 throughput,
+    ccl_private const Spectrum throughput,
     ccl_private LightSample *ccl_restrict ls)
 {
   PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT);
@@ -725,11 +734,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
   {
     const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
     const uint bounce = INTEGRATOR_STATE(state, path, bounce);
-    float light_u, light_v;
-    path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+    const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);
 
     if (!light_distribution_sample_from_position(
-            kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) {
+            kg, rand_light.x, rand_light.y, sd->time, P, bounce, path_flag, ls)) {
       return;
     }
   }
@@ -746,21 +754,21 @@ ccl_device_forceinline void integrate_volume_direct_light(
    * non-constant light sources. */
   ShaderDataTinyStorage emission_sd_storage;
   ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-  const float3 light_eval = light_sample_shader_eval(kg, state, emission_sd, ls, sd->time);
+  const Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, ls, sd->time);
   if (is_zero(light_eval)) {
     return;
   }
 
   /* Evaluate BSDF. */
   BsdfEval phase_eval ccl_optional_struct_init;
-  const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
+  const float phase_pdf = volume_shader_phase_eval(kg, sd, phases, ls->D, &phase_eval);
 
   if (ls->shader & SHADER_USE_MIS) {
     float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf);
     bsdf_eval_mul(&phase_eval, mis_weight);
   }
 
-  bsdf_eval_mul3(&phase_eval, light_eval / ls->pdf);
+  bsdf_eval_mul(&phase_eval, light_eval / ls->pdf);
 
   /* Path termination. */
   const float terminate = path_state_rng_light_termination(kg, rng_state);
@@ -774,8 +782,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
   const bool is_light = light_sample_is_light(ls);
 
   /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(
-      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
+  IntegratorShadowState shadow_state = integrator_shadow_path_init(
+      kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false);
 
   /* Write shadow ray and associated state to global memory. */
   integrator_state_write_shadow_ray(kg, shadow_state, &ray);
@@ -789,11 +797,11 @@ ccl_device_forceinline void integrate_volume_direct_light(
   const uint16_t transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
   uint32_t shadow_flag = INTEGRATOR_STATE(state, path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
-  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
+  const Spectrum throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    packed_float3 pass_diffuse_weight;
-    packed_float3 pass_glossy_weight;
+    PackedSpectrum pass_diffuse_weight;
+    PackedSpectrum pass_glossy_weight;
 
     if (shadow_flag & PATH_RAY_ANY_PASS) {
       /* Indirect bounce, use weights from earlier surface or volume bounce. */
@@ -803,8 +811,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
     else {
       /* Direct light, no diffuse/glossy distinction needed for volumes. */
       shadow_flag |= PATH_RAY_VOLUME_PASS;
-      pass_diffuse_weight = packed_float3(one_float3());
-      pass_glossy_weight = packed_float3(zero_float3());
+      pass_diffuse_weight = one_spectrum();
+      pass_glossy_weight = zero_spectrum();
     }
 
     INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
@@ -842,7 +850,6 @@ ccl_device_forceinline void integrate_volume_direct_light(
 
   integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
 }
-#  endif
 
 /* Path tracing: scatter in new direction using phase function */
 ccl_device_forceinline bool integrate_volume_phase_scatter(
@@ -854,24 +861,15 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
 {
   PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT);
 
-  float phase_u, phase_v;
-  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v);
+  const float2 rand_phase = path_state_rng_2D(kg, rng_state, PRNG_VOLUME_PHASE);
 
   /* Phase closure, sample direction. */
   float phase_pdf;
   BsdfEval phase_eval ccl_optional_struct_init;
   float3 phase_omega_in ccl_optional_struct_init;
-  differential3 phase_domega_in ccl_optional_struct_init;
-
-  const int label = shader_volume_phase_sample(kg,
-                                               sd,
-                                               phases,
-                                               phase_u,
-                                               phase_v,
-                                               &phase_eval,
-                                               &phase_omega_in,
-                                               &phase_domega_in,
-                                               &phase_pdf);
+
+  const int label = volume_shader_phase_sample(
+      kg, sd, phases, rand_phase, &phase_eval, &phase_omega_in, &phase_pdf);
 
   if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
     return false;
@@ -880,28 +878,27 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
   /* Setup ray. */
   INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
   INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(phase_omega_in);
-  INTEGRATOR_STATE_WRITE(state, ray, t) = FLT_MAX;
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #  ifdef __RAY_DIFFERENTIALS__
   INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
-  INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(phase_domega_in);
 #  endif
   // Save memory by storing last hit prim and object in isect
   INTEGRATOR_STATE_WRITE(state, isect, prim) = sd->prim;
   INTEGRATOR_STATE_WRITE(state, isect, object) = sd->object;
 
   /* Update throughput. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf;
+  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
+  const Spectrum throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf;
   INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput_phase;
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
-    INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
+    INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_spectrum();
+    INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_spectrum();
   }
 
   /* Update path state */
   INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = phase_pdf;
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
   INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
       phase_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
 
@@ -1021,7 +1018,7 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
   integrator_state_read_isect(kg, state, &isect);
 
   /* Set ray length to current segment. */
-  ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;
+  ray.tmax = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;
 
   /* Clean volume stack for background rays. */
   if (isect.prim == PRIM_NONE) {
@@ -1032,13 +1029,15 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
 
   if (event == VOLUME_PATH_SCATTERED) {
     /* Queue intersect_closest kernel. */
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+    integrator_path_next(kg,
+                         state,
+                         DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
                          DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
     return;
   }
   else if (event == VOLUME_PATH_MISSED) {
     /* End path. */
-    INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+    integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
     return;
   }
   else {
diff --git a/intern/cycles/kernel/integrator/shader_eval.h b/intern/cycles/kernel/integrator/shader_eval.h
deleted file mode 100644
index 4da92929366..00000000000
--- a/intern/cycles/kernel/integrator/shader_eval.h
+++ /dev/null
@@ -1,952 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-/* Functions to evaluate shaders and use the resulting shader closures. */
-
-#pragma once
-
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/bsdf.h"
-#include "kernel/closure/bsdf_util.h"
-#include "kernel/closure/emissive.h"
-
-#include "kernel/film/accumulate.h"
-
-#include "kernel/svm/svm.h"
-
-#ifdef __OSL__
-#  include "kernel/osl/shader.h"
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* Merging */
-
-#if defined(__VOLUME__)
-ccl_device_inline void shader_merge_volume_closures(ccl_private ShaderData *sd)
-{
-  /* Merge identical closures to save closure space with stacked volumes. */
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private ShaderClosure *sci = &sd->closure[i];
-
-    if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
-      continue;
-    }
-
-    for (int j = i + 1; j < sd->num_closure; j++) {
-      ccl_private ShaderClosure *scj = &sd->closure[j];
-      if (sci->type != scj->type) {
-        continue;
-      }
-
-      ccl_private const HenyeyGreensteinVolume *hgi = (ccl_private const HenyeyGreensteinVolume *)
-          sci;
-      ccl_private const HenyeyGreensteinVolume *hgj = (ccl_private const HenyeyGreensteinVolume *)
-          scj;
-      if (!(hgi->g == hgj->g)) {
-        continue;
-      }
-
-      sci->weight += scj->weight;
-      sci->sample_weight += scj->sample_weight;
-
-      int size = sd->num_closure - (j + 1);
-      if (size > 0) {
-        for (int k = 0; k < size; k++) {
-          scj[k] = scj[k + 1];
-        }
-      }
-
-      sd->num_closure--;
-      kernel_assert(sd->num_closure >= 0);
-      j--;
-    }
-  }
-}
-
-ccl_device_inline void shader_copy_volume_phases(ccl_private ShaderVolumePhases *ccl_restrict
-                                                     phases,
-                                                 ccl_private const ShaderData *ccl_restrict sd)
-{
-  phases->num_closure = 0;
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *from_sc = &sd->closure[i];
-    ccl_private const HenyeyGreensteinVolume *from_hg =
-        (ccl_private const HenyeyGreensteinVolume *)from_sc;
-
-    if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
-      ccl_private ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
-
-      to_sc->weight = from_sc->weight;
-      to_sc->sample_weight = from_sc->sample_weight;
-      to_sc->g = from_hg->g;
-      phases->num_closure++;
-      if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
-        break;
-      }
-    }
-  }
-}
-#endif /* __VOLUME__ */
-
-ccl_device_inline void shader_prepare_surface_closures(KernelGlobals kg,
-                                                       ConstIntegratorState state,
-                                                       ccl_private ShaderData *sd,
-                                                       const uint32_t path_flag)
-{
-  /* Filter out closures. */
-  if (kernel_data.integrator.filter_closures) {
-    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_EMISSION) {
-      sd->closure_emission_background = zero_float3();
-    }
-
-    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIRECT_LIGHT) {
-      sd->flag &= ~SD_BSDF_HAS_EVAL;
-    }
-
-    if (path_flag & PATH_RAY_CAMERA) {
-      for (int i = 0; i < sd->num_closure; i++) {
-        ccl_private ShaderClosure *sc = &sd->closure[i];
-
-        if ((CLOSURE_IS_BSDF_DIFFUSE(sc->type) &&
-             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE)) ||
-            (CLOSURE_IS_BSDF_GLOSSY(sc->type) &&
-             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY)) ||
-            (CLOSURE_IS_BSDF_TRANSMISSION(sc->type) &&
-             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION))) {
-          sc->type = CLOSURE_NONE_ID;
-          sc->sample_weight = 0.0f;
-        }
-        else if ((CLOSURE_IS_BSDF_TRANSPARENT(sc->type) &&
-                  (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSPARENT))) {
-          sc->type = CLOSURE_HOLDOUT_ID;
-          sc->sample_weight = 0.0f;
-          sd->flag |= SD_HOLDOUT;
-        }
-      }
-    }
-  }
-
-  /* Defensive sampling.
-   *
-   * We can likely also do defensive sampling at deeper bounces, particularly
-   * for cases like a perfect mirror but possibly also others. This will need
-   * a good heuristic. */
-  if (INTEGRATOR_STATE(state, path, bounce) + INTEGRATOR_STATE(state, path, transparent_bounce) ==
-          0 &&
-      sd->num_closure > 1) {
-    float sum = 0.0f;
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private ShaderClosure *sc = &sd->closure[i];
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sum += sc->sample_weight;
-      }
-    }
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private ShaderClosure *sc = &sd->closure[i];
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
-      }
-    }
-  }
-
-  /* Filter glossy.
-   *
-   * Blurring of bsdf after bounces, for rays that have a small likelihood
-   * of following this particular path (diffuse, rough glossy) */
-  if (kernel_data.integrator.filter_glossy != FLT_MAX
-#ifdef __MNEE__
-      && !(INTEGRATOR_STATE(state, path, mnee) & PATH_MNEE_VALID)
-#endif
-  ) {
-    float blur_pdf = kernel_data.integrator.filter_glossy *
-                     INTEGRATOR_STATE(state, path, min_ray_pdf);
-
-    if (blur_pdf < 1.0f) {
-      float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
-
-      for (int i = 0; i < sd->num_closure; i++) {
-        ccl_private ShaderClosure *sc = &sd->closure[i];
-        if (CLOSURE_IS_BSDF(sc->type)) {
-          bsdf_blur(kg, sc, blur_roughness);
-        }
-      }
-    }
-  }
-}
-
-/* BSDF */
-
-ccl_device_inline bool shader_bsdf_is_transmission(ccl_private const ShaderData *sd,
-                                                   const float3 omega_in)
-{
-  return dot(sd->N, omega_in) < 0.0f;
-}
-
-ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags)
-{
-  if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
-    return false;
-  }
-  if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
-    if (CLOSURE_IS_BSDF_DIFFUSE(type)) {
-      return true;
-    }
-  }
-  if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
-    if (CLOSURE_IS_BSDF_GLOSSY(type)) {
-      return true;
-    }
-  }
-  if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
-    if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-ccl_device_inline float _shader_bsdf_multi_eval(KernelGlobals kg,
-                                                ccl_private ShaderData *sd,
-                                                const float3 omega_in,
-                                                const bool is_transmission,
-                                                ccl_private const ShaderClosure *skip_sc,
-                                                ccl_private BsdfEval *result_eval,
-                                                float sum_pdf,
-                                                float sum_sample_weight,
-                                                const uint light_shader_flags)
-{
-  /* This is the veach one-sample model with balance heuristic,
-   * some PDF factors drop out when using balance heuristic weighting. */
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (sc == skip_sc) {
-      continue;
-    }
-
-    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-      if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) {
-        float bsdf_pdf = 0.0f;
-        float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
-
-        if (bsdf_pdf != 0.0f) {
-          bsdf_eval_accum(result_eval, sc->type, eval * sc->weight);
-          sum_pdf += bsdf_pdf * sc->sample_weight;
-        }
-      }
-
-      sum_sample_weight += sc->sample_weight;
-    }
-  }
-
-  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_inline
-#endif
-    float
-    shader_bsdf_eval(KernelGlobals kg,
-                     ccl_private ShaderData *sd,
-                     const float3 omega_in,
-                     const bool is_transmission,
-                     ccl_private BsdfEval *bsdf_eval,
-                     const uint light_shader_flags)
-{
-  bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, zero_float3());
-
-  return _shader_bsdf_multi_eval(
-      kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
-}
-
-/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
-ccl_device_inline ccl_private const ShaderClosure *shader_bsdf_bssrdf_pick(
-    ccl_private const ShaderData *ccl_restrict sd, ccl_private float *randu)
-{
-  int sampled = 0;
-
-  if (sd->num_closure > 1) {
-    /* Pick a BSDF or based on sample weights. */
-    float sum = 0.0f;
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sum += sc->sample_weight;
-      }
-    }
-
-    float r = (*randu) * sum;
-    float partial_sum = 0.0f;
-
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        float next_sum = partial_sum + sc->sample_weight;
-
-        if (r < next_sum) {
-          sampled = i;
-
-          /* Rescale to reuse for direction sample, to better preserve stratification. */
-          *randu = (r - partial_sum) / sc->sample_weight;
-          break;
-        }
-
-        partial_sum = next_sum;
-      }
-    }
-  }
-
-  return &sd->closure[sampled];
-}
-
-/* Return weight for picked BSSRDF. */
-ccl_device_inline float3
-shader_bssrdf_sample_weight(ccl_private const ShaderData *ccl_restrict sd,
-                            ccl_private const ShaderClosure *ccl_restrict bssrdf_sc)
-{
-  float3 weight = bssrdf_sc->weight;
-
-  if (sd->num_closure > 1) {
-    float sum = 0.0f;
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
-        sum += sc->sample_weight;
-      }
-    }
-    weight *= sum / bssrdf_sc->sample_weight;
-  }
-
-  return weight;
-}
-
-/* Sample direction for picked BSDF, and return evaluation and pdf for all
- * BSDFs combined using MIS. */
-ccl_device int shader_bsdf_sample_closure(KernelGlobals kg,
-                                          ccl_private ShaderData *sd,
-                                          ccl_private const ShaderClosure *sc,
-                                          float randu,
-                                          float randv,
-                                          ccl_private BsdfEval *bsdf_eval,
-                                          ccl_private float3 *omega_in,
-                                          ccl_private differential3 *domega_in,
-                                          ccl_private float *pdf)
-{
-  /* BSSRDF should already have been handled elsewhere. */
-  kernel_assert(CLOSURE_IS_BSDF(sc->type));
-
-  int label;
-  float3 eval = zero_float3();
-
-  *pdf = 0.0f;
-  label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
-  if (*pdf != 0.0f) {
-    bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);
-
-    if (sd->num_closure > 1) {
-      const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
-      float sweight = sc->sample_weight;
-      *pdf = _shader_bsdf_multi_eval(
-          kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
-    }
-  }
-
-  return label;
-}
-
-ccl_device float shader_bsdf_average_roughness(ccl_private const ShaderData *sd)
-{
-  float roughness = 0.0f;
-  float sum_weight = 0.0f;
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF(sc->type)) {
-      /* sqrt once to undo the squaring from multiplying roughness on the
-       * two axes, and once for the squared roughness convention. */
-      float weight = fabsf(average(sc->weight));
-      roughness += weight * sqrtf(safe_sqrtf(bsdf_get_roughness_squared(sc)));
-      sum_weight += weight;
-    }
-  }
-
-  return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
-}
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  if (sd->flag & SD_HAS_ONLY_VOLUME) {
-    return one_float3();
-  }
-  else if (sd->flag & SD_TRANSPARENT) {
-    return sd->closure_transparent_extinction;
-  }
-  else {
-    return zero_float3();
-  }
-}
-
-ccl_device void shader_bsdf_disable_transparency(KernelGlobals kg, ccl_private ShaderData *sd)
-{
-  if (sd->flag & SD_TRANSPARENT) {
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private ShaderClosure *sc = &sd->closure[i];
-
-      if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
-        sc->sample_weight = 0.0f;
-        sc->weight = zero_float3();
-      }
-    }
-
-    sd->flag &= ~SD_TRANSPARENT;
-  }
-}
-
-ccl_device float3 shader_bsdf_alpha(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd);
-
-  alpha = max(alpha, zero_float3());
-  alpha = min(alpha, one_float3());
-
-  return alpha;
-}
-
-ccl_device float3 shader_bsdf_diffuse(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 eval = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type))
-      eval += sc->weight;
-  }
-
-  return eval;
-}
-
-ccl_device float3 shader_bsdf_glossy(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 eval = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
-      eval += sc->weight;
-  }
-
-  return eval;
-}
-
-ccl_device float3 shader_bsdf_transmission(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 eval = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
-      eval += sc->weight;
-  }
-
-  return eval;
-}
-
-ccl_device float3 shader_bsdf_average_normal(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  float3 N = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
-      N += sc->N * fabsf(average(sc->weight));
-  }
-
-  return (is_zero(N)) ? sd->N : normalize(N);
-}
-
-ccl_device float3 shader_bsdf_ao(KernelGlobals kg,
-                                 ccl_private const ShaderData *sd,
-                                 const float ao_factor,
-                                 ccl_private float3 *N_)
-{
-  float3 eval = zero_float3();
-  float3 N = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
-      ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
-      eval += sc->weight * ao_factor;
-      N += bsdf->N * fabsf(average(sc->weight));
-    }
-  }
-
-  *N_ = (is_zero(N)) ? sd->N : normalize(N);
-  return eval;
-}
-
-#ifdef __SUBSURFACE__
-ccl_device float3 shader_bssrdf_normal(ccl_private const ShaderData *sd)
-{
-  float3 N = zero_float3();
-
-  for (int i = 0; i < sd->num_closure; i++) {
-    ccl_private const ShaderClosure *sc = &sd->closure[i];
-
-    if (CLOSURE_IS_BSSRDF(sc->type)) {
-      ccl_private const Bssrdf *bssrdf = (ccl_private const Bssrdf *)sc;
-      float avg_weight = fabsf(average(sc->weight));
-
-      N += bssrdf->N * avg_weight;
-    }
-  }
-
-  return (is_zero(N)) ? sd->N : normalize(N);
-}
-#endif /* __SUBSURFACE__ */
-
-/* Constant emission optimization */
-
-ccl_device bool shader_constant_emission_eval(KernelGlobals kg,
-                                              int shader,
-                                              ccl_private float3 *eval)
-{
-  int shader_index = shader & SHADER_MASK;
-  int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags;
-
-  if (shader_flag & SD_HAS_CONSTANT_EMISSION) {
-    *eval = make_float3(kernel_tex_fetch(__shaders, shader_index).constant_emission[0],
-                        kernel_tex_fetch(__shaders, shader_index).constant_emission[1],
-                        kernel_tex_fetch(__shaders, shader_index).constant_emission[2]);
-
-    return true;
-  }
-
-  return false;
-}
-
-/* Background */
-
-ccl_device float3 shader_background_eval(ccl_private const ShaderData *sd)
-{
-  if (sd->flag & SD_EMISSION) {
-    return sd->closure_emission_background;
-  }
-  else {
-    return zero_float3();
-  }
-}
-
-/* Emission */
-
-ccl_device float3 shader_emissive_eval(ccl_private const ShaderData *sd)
-{
-  if (sd->flag & SD_EMISSION) {
-    return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
-  }
-  else {
-    return zero_float3();
-  }
-}
-
-/* Holdout */
-
-ccl_device float3 shader_holdout_apply(KernelGlobals kg, ccl_private ShaderData *sd)
-{
-  float3 weight = zero_float3();
-
-  /* For objects marked as holdout, preserve transparency and remove all other
-   * closures, replacing them with a holdout weight. */
-  if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-    if ((sd->flag & SD_TRANSPARENT) && !(sd->flag & SD_HAS_ONLY_VOLUME)) {
-      weight = one_float3() - sd->closure_transparent_extinction;
-
-      for (int i = 0; i < sd->num_closure; i++) {
-        ccl_private ShaderClosure *sc = &sd->closure[i];
-        if (!CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
-          sc->type = NBUILTIN_CLOSURES;
-        }
-      }
-
-      sd->flag &= ~(SD_CLOSURE_FLAGS - (SD_TRANSPARENT | SD_BSDF));
-    }
-    else {
-      weight = one_float3();
-    }
-  }
-  else {
-    for (int i = 0; i < sd->num_closure; i++) {
-      ccl_private const ShaderClosure *sc = &sd->closure[i];
-      if (CLOSURE_IS_HOLDOUT(sc->type)) {
-        weight += sc->weight;
-      }
-    }
-  }
-
-  return weight;
-}
-
-/* Surface Evaluation */
-
-template<uint node_feature_mask, typename ConstIntegratorGenericState>
-ccl_device void shader_eval_surface(KernelGlobals kg,
-                                    ConstIntegratorGenericState state,
-                                    ccl_private ShaderData *ccl_restrict sd,
-                                    ccl_global float *ccl_restrict buffer,
-                                    uint32_t path_flag,
-                                    bool use_caustics_storage = false)
-{
-  /* If path is being terminated, we are tracing a shadow ray or evaluating
-   * emission, then we don't need to store closures. The emission and shadow
-   * shader data also do not have a closure array to save GPU memory. */
-  int max_closures;
-  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
-    max_closures = 0;
-  }
-  else {
-    max_closures = use_caustics_storage ? CAUSTICS_MAX_CLOSURE : kernel_data.max_closures;
-  }
-
-  sd->num_closure = 0;
-  sd->num_closure_left = max_closures;
-
-#ifdef __OSL__
-  if (kg->osl) {
-    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
-      OSLShader::eval_background(kg, state, sd, path_flag);
-    }
-    else {
-      OSLShader::eval_surface(kg, state, sd, path_flag);
-    }
-  }
-  else
-#endif
-  {
-#ifdef __SVM__
-    svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(kg, state, sd, buffer, path_flag);
-#else
-    if (sd->object == OBJECT_NONE) {
-      sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
-      sd->flag |= SD_EMISSION;
-    }
-    else {
-      ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
-          sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f));
-      if (bsdf != NULL) {
-        bsdf->N = sd->N;
-        sd->flag |= bsdf_diffuse_setup(bsdf);
-      }
-    }
-#endif
-  }
-}
-
-/* Volume */
-
-#ifdef __VOLUME__
-
-ccl_device_inline float _shader_volume_phase_multi_eval(
-    ccl_private const ShaderData *sd,
-    ccl_private const ShaderVolumePhases *phases,
-    const float3 omega_in,
-    int skip_phase,
-    ccl_private BsdfEval *result_eval,
-    float sum_pdf,
-    float sum_sample_weight)
-{
-  for (int i = 0; i < phases->num_closure; i++) {
-    if (i == skip_phase)
-      continue;
-
-    ccl_private const ShaderVolumeClosure *svc = &phases->closure[i];
-    float phase_pdf = 0.0f;
-    float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
-
-    if (phase_pdf != 0.0f) {
-      bsdf_eval_accum(result_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
-      sum_pdf += phase_pdf * svc->sample_weight;
-    }
-
-    sum_sample_weight += svc->sample_weight;
-  }
-
-  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-ccl_device float shader_volume_phase_eval(KernelGlobals kg,
-                                          ccl_private const ShaderData *sd,
-                                          ccl_private const ShaderVolumePhases *phases,
-                                          const float3 omega_in,
-                                          ccl_private BsdfEval *phase_eval)
-{
-  bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_float3());
-
-  return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
-}
-
-ccl_device int shader_volume_phase_sample(KernelGlobals kg,
-                                          ccl_private const ShaderData *sd,
-                                          ccl_private const ShaderVolumePhases *phases,
-                                          float randu,
-                                          float randv,
-                                          ccl_private BsdfEval *phase_eval,
-                                          ccl_private float3 *omega_in,
-                                          ccl_private differential3 *domega_in,
-                                          ccl_private float *pdf)
-{
-  int sampled = 0;
-
-  if (phases->num_closure > 1) {
-    /* pick a phase closure based on sample weights */
-    float sum = 0.0f;
-
-    for (sampled = 0; sampled < phases->num_closure; sampled++) {
-      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
-      sum += svc->sample_weight;
-    }
-
-    float r = randu * sum;
-    float partial_sum = 0.0f;
-
-    for (sampled = 0; sampled < phases->num_closure; sampled++) {
-      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
-      float next_sum = partial_sum + svc->sample_weight;
-
-      if (r <= next_sum) {
-        /* Rescale to reuse for BSDF direction sample. */
-        randu = (r - partial_sum) / svc->sample_weight;
-        break;
-      }
-
-      partial_sum = next_sum;
-    }
-
-    if (sampled == phases->num_closure) {
-      *pdf = 0.0f;
-      return LABEL_NONE;
-    }
-  }
-
-  /* todo: this isn't quite correct, we don't weight anisotropy properly
-   * depending on color channels, even if this is perhaps not a common case */
-  ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
-  int label;
-  float3 eval = zero_float3();
-
-  *pdf = 0.0f;
-  label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
-
-  if (*pdf != 0.0f) {
-    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
-  }
-
-  return label;
-}
-
-ccl_device int shader_phase_sample_closure(KernelGlobals kg,
-                                           ccl_private const ShaderData *sd,
-                                           ccl_private const ShaderVolumeClosure *sc,
-                                           float randu,
-                                           float randv,
-                                           ccl_private BsdfEval *phase_eval,
-                                           ccl_private float3 *omega_in,
-                                           ccl_private differential3 *domega_in,
-                                           ccl_private float *pdf)
-{
-  int label;
-  float3 eval = zero_float3();
-
-  *pdf = 0.0f;
-  label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
-  if (*pdf != 0.0f)
-    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
-
-  return label;
-}
-
-/* Volume Evaluation */
-
-template<const bool shadow, typename StackReadOp, typename ConstIntegratorGenericState>
-ccl_device_inline void shader_eval_volume(KernelGlobals kg,
-                                          ConstIntegratorGenericState state,
-                                          ccl_private ShaderData *ccl_restrict sd,
-                                          const uint32_t path_flag,
-                                          StackReadOp stack_read)
-{
-  /* If path is being terminated, we are tracing a shadow ray or evaluating
-   * emission, then we don't need to store closures. The emission and shadow
-   * shader data also do not have a closure array to save GPU memory. */
-  int max_closures;
-  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
-    max_closures = 0;
-  }
-  else {
-    max_closures = kernel_data.max_closures;
-  }
-
-  /* reset closures once at the start, we will be accumulating the closures
-   * for all volumes in the stack into a single array of closures */
-  sd->num_closure = 0;
-  sd->num_closure_left = max_closures;
-  sd->flag = 0;
-  sd->object_flag = 0;
-
-  for (int i = 0;; i++) {
-    const VolumeStack entry = stack_read(i);
-    if (entry.shader == SHADER_NONE) {
-      break;
-    }
-
-    /* Setup shader-data from stack. it's mostly setup already in
-     * shader_setup_from_volume, this switching should be quick. */
-    sd->object = entry.object;
-    sd->lamp = LAMP_NONE;
-    sd->shader = entry.shader;
-
-    sd->flag &= ~SD_SHADER_FLAGS;
-    sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-    sd->object_flag &= ~SD_OBJECT_FLAGS;
-
-    if (sd->object != OBJECT_NONE) {
-      sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
-
-#  ifdef __OBJECT_MOTION__
-      /* todo: this is inefficient for motion blur, we should be
-       * caching matrices instead of recomputing them each step */
-      shader_setup_object_transforms(kg, sd, sd->time);
-
-      if ((sd->object_flag & SD_OBJECT_HAS_VOLUME_MOTION) != 0) {
-        AttributeDescriptor v_desc = find_attribute(kg, sd, ATTR_STD_VOLUME_VELOCITY);
-        kernel_assert(v_desc.offset != ATTR_STD_NOT_FOUND);
-
-        const float3 P = sd->P;
-        const float velocity_scale = kernel_tex_fetch(__objects, sd->object).velocity_scale;
-        const float time_offset = kernel_data.cam.motion_position == MOTION_POSITION_CENTER ?
-                                      0.5f :
-                                      0.0f;
-        const float time = kernel_data.cam.motion_position == MOTION_POSITION_END ?
-                               (1.0f - kernel_data.cam.shuttertime) + sd->time :
-                               sd->time;
-
-        /* Use a 1st order semi-lagrangian advection scheme to estimate what volume quantity
-         * existed, or will exist, at the given time:
-         *
-         * `phi(x, T) = phi(x - (T - t) * u(x, T), t)`
-         *
-         * where
-         *
-         * x : position
-         * T : super-sampled time (or ray time)
-         * t : current time of the simulation (in rendering we assume this is center frame with
-         * relative time = 0)
-         * phi : the volume quantity
-         * u : the velocity field
-         *
-         * But first we need to determine the velocity field `u(x, T)`, which we can estimate also
-         * using semi-lagrangian advection.
-         *
-         * `u(x, T) = u(x - (T - t) * u(x, T), t)`
-         *
-         * This is the typical way to model self-advection in fluid dynamics, however, we do not
-         * account for other forces affecting the velocity during simulation (pressure, buoyancy,
-         * etc.): this gives a linear interpolation when fluid are mostly "curvy". For better
-         * results, a higher order interpolation scheme can be used (at the cost of more lookups),
-         * or an interpolation of the velocity fields for the previous and next frames could also
-         * be used to estimate `u(x, T)` (which will cost more memory and lookups).
-         *
-         * References:
-         * "Eulerian Motion Blur", Kim and Ko, 2007
-         * "Production Volume Rendering", Wreninge et al., 2012
-         */
-
-        /* Find velocity. */
-        float3 velocity = primitive_volume_attribute_float3(kg, sd, v_desc);
-        object_dir_transform(kg, sd, &velocity);
-
-        /* Find advected P. */
-        sd->P = P - (time - time_offset) * velocity_scale * velocity;
-
-        /* Find advected velocity. */
-        velocity = primitive_volume_attribute_float3(kg, sd, v_desc);
-        object_dir_transform(kg, sd, &velocity);
-
-        /* Find advected P. */
-        sd->P = P - (time - time_offset) * velocity_scale * velocity;
-      }
-#  endif
-    }
-
-    /* evaluate shader */
-#  ifdef __SVM__
-#    ifdef __OSL__
-    if (kg->osl) {
-      OSLShader::eval_volume(kg, state, sd, path_flag);
-    }
-    else
-#    endif
-    {
-      svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
-          kg, state, sd, NULL, path_flag);
-    }
-#  endif
-
-    /* Merge closures to avoid exceeding number of closures limit. */
-    if (!shadow) {
-      if (i > 0) {
-        shader_merge_volume_closures(sd);
-      }
-    }
-  }
-}
-
-#endif /* __VOLUME__ */
-
-/* Displacement Evaluation */
-
-template<typename ConstIntegratorGenericState>
-ccl_device void shader_eval_displacement(KernelGlobals kg,
-                                         ConstIntegratorGenericState state,
-                                         ccl_private ShaderData *sd)
-{
-  sd->num_closure = 0;
-  sd->num_closure_left = 0;
-
-  /* this will modify sd->P */
-#ifdef __SVM__
-#  ifdef __OSL__
-  if (kg->osl)
-    OSLShader::eval_displacement(kg, state, sd);
-  else
-#  endif
-  {
-    svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
-        kg, state, sd, NULL, 0);
-  }
-#endif
-}
-
-/* Cryptomatte */
-
-ccl_device float shader_cryptomatte_id(KernelGlobals kg, int shader)
-{
-  return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shadow_catcher.h b/intern/cycles/kernel/integrator/shadow_catcher.h
index 42d44580f80..a620853faea 100644
--- a/intern/cycles/kernel/integrator/shadow_catcher.h
+++ b/intern/cycles/kernel/integrator/shadow_catcher.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include "kernel/film/write_passes.h"
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/state_util.h"
 
@@ -50,7 +49,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals
 ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
                                                             ConstIntegratorState state)
 {
-  if (INTEGRATOR_PATH_IS_TERMINATED) {
+  if (integrator_path_is_terminated(state)) {
     return false;
   }
 
@@ -76,28 +75,6 @@ ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(const uint32_t
   return path_flag & PATH_RAY_SHADOW_CATCHER_PASS;
 }
 
-/* Write shadow catcher passes on a bounce from the shadow catcher object. */
-ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
-    KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
-{
-  kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
-  kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-
-  const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
-  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
-                                        kernel_data.film.pass_stride;
-  ccl_global float *buffer = render_buffer + render_buffer_offset;
-
-  /* Count sample for the shadow catcher object. */
-  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
-
-  /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
-   * transparency to the matte. */
-  const float3 throughput = INTEGRATOR_STATE(state, path, throughput);
-  kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
-                          average(throughput));
-}
-
 #endif /* __SHADOW_CATCHER__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/shadow_state_template.h b/intern/cycles/kernel/integrator/shadow_state_template.h
index eaee65ada40..3b490ecffdd 100644
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -27,15 +27,15 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T
 /* enum PathRayFlag */
 KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput for shadow pass. */
 KERNEL_STRUCT_MEMBER(shadow_path,
-                     packed_float3,
+                     PackedSpectrum,
                      unshadowed_throughput,
                      KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Number of intersections found by ray-tracing. */
 KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
 /* Light group. */
@@ -47,7 +47,8 @@ KERNEL_STRUCT_END(shadow_path)
 KERNEL_STRUCT_BEGIN(shadow_ray)
 KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, int, object, KERNEL_FEATURE_PATH_TRACING)
diff --git a/intern/cycles/kernel/integrator/state.h b/intern/cycles/kernel/integrator/state.h
index d6fef27f344..d1907bd6e16 100644
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -127,6 +127,9 @@ typedef struct IntegratorStateGPU {
 
   /* Index of main path which will be used by a next shadow catcher split.  */
   ccl_global int *next_main_path_index;
+
+  /* Divisor used to partition active indices by locality when sorting by material.  */
+  uint sort_partition_divisor;
 } IntegratorStateGPU;
 
 /* Abstraction
@@ -137,7 +140,7 @@ typedef struct IntegratorStateGPU {
  * happen from a kernel which operates on a "main" path. Attempt to use shadow catcher accessors
  * from a kernel which operates on a shadow catcher state will cause bad memory access. */
 
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 
 /* Scalar access on CPU. */
 
@@ -156,7 +159,7 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
 #  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
     ((state)->nested_struct[array_index].member)
 
-#else /* __KERNEL_CPU__ */
+#else /* !__KERNEL_GPU__ */
 
 /* Array access on GPU with Structure-of-Arrays. */
 
@@ -177,6 +180,6 @@ typedef int ConstIntegratorShadowState;
 #  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
     INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member)
 
-#endif /* __KERNEL_CPU__ */
+#endif /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/state_flow.h b/intern/cycles/kernel/integrator/state_flow.h
index fed74d49434..4b03c665e17 100644
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -10,125 +10,196 @@ CCL_NAMESPACE_BEGIN
 
 /* Control Flow
  *
- * Utilities for control flow between kernels. The implementation may differ per device
- * or even be handled on the host side. To abstract such differences, experiment with
- * different implementations and for debugging, this is abstracted using macros.
+ * Utilities for control flow between kernels. The implementation is different between CPU and
+ * GPU devices. For the latter, part of the logic is handled on the host side with wavefronts.
  *
  * There is a main path for regular path tracing camera for path tracing. Shadows for next
  * event estimation branch off from this into their own path, that may be computed in
- * parallel while the main path continues.
+ * parallel while the main path continues. Additionally, shading kernels are sorted using
+ * a key for coherence.
  *
  * Each kernel on the main path must call one of these functions. These may not be called
  * multiple times from the same kernel.
  *
- * INTEGRATOR_PATH_INIT(next_kernel)
- * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
- * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ * integrator_path_init(kg, state, next_kernel)
+ * integrator_path_next(kg, state, current_kernel, next_kernel)
+ * integrator_path_terminate(kg, state, current_kernel)
  *
  * For the shadow path similar functions are used, and again each shadow kernel must call
  * one of them, and only once.
  */
 
-#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(state, path, queued_kernel) == 0)
-#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED \
-  (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0)
+ccl_device_forceinline bool integrator_path_is_terminated(ConstIntegratorState state)
+{
+  return INTEGRATOR_STATE(state, path, queued_kernel) == 0;
+}
+
+ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegratorShadowState state)
+{
+  return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0;
+}
 
 #ifdef __KERNEL_GPU__
 
-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
-        &kernel_integrator_state.next_shadow_path_index[0], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
-
-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_sub_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32(
+      &kernel_integrator_state.next_shadow_path_index[0], 1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
+/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
+#  define INTEGRATOR_SORT_KEY(key, state) \
+    ((key) + kernel_data.max_shaders * ((state) / kernel_integrator_state.sort_partition_divisor))
+
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}
 
 #else
 
-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-    }
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-      (void)current_kernel; \
-    }
-
-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = &state->shadow_type; \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+}
+
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+  (void)current_kernel;
+}
 
 #endif
 
diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h
index e7e6db037b0..f4e280e4cb2 100644
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -37,22 +37,21 @@ KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* enum PathRayMNEE */
 KERNEL_STRUCT_MEMBER(path, uint8_t, mnee, KERNEL_FEATURE_PATH_TRACING)
 /* Multiple importance sampling
- * The PDF of BSDF sampling at the last scatter point, and distance to the
- * last scatter point minus the last ray segment. This distance lets us
- * compute the complete distance through transparent surfaces and volumes. */
+ * The PDF of BSDF sampling at the last scatter point, which is at ray distance
+ * zero. Note that transparency and volume attenuation increase the ray tmin
+ * but keep P unmodified so that this works. */
 KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING)
 /* Filter glossy. */
 KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 /* Continuation probability for path termination. */
 KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Denoising. */
-KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+KERNEL_STRUCT_MEMBER(path, PackedSpectrum, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
 /* Shader sorting. */
 /* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
 KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
@@ -63,7 +62,8 @@ KERNEL_STRUCT_END(path)
 KERNEL_STRUCT_BEGIN(ray)
 KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
@@ -84,8 +84,8 @@ KERNEL_STRUCT_END(isect)
 /*************** Subsurface closure state for subsurface kernel ***************/
 
 KERNEL_STRUCT_BEGIN(subsurface)
-KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, PackedSpectrum, radius, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_END(subsurface)
diff --git a/intern/cycles/kernel/integrator/state_util.h b/intern/cycles/kernel/integrator/state_util.h
index 280db2d1aac..168122d3a78 100644
--- a/intern/cycles/kernel/integrator/state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -17,7 +17,8 @@ ccl_device_forceinline void integrator_state_write_ray(KernelGlobals kg,
 {
   INTEGRATOR_STATE_WRITE(state, ray, P) = ray->P;
   INTEGRATOR_STATE_WRITE(state, ray, D) = ray->D;
-  INTEGRATOR_STATE_WRITE(state, ray, t) = ray->t;
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = ray->tmin;
+  INTEGRATOR_STATE_WRITE(state, ray, tmax) = ray->tmax;
   INTEGRATOR_STATE_WRITE(state, ray, time) = ray->time;
   INTEGRATOR_STATE_WRITE(state, ray, dP) = ray->dP;
   INTEGRATOR_STATE_WRITE(state, ray, dD) = ray->dD;
@@ -29,7 +30,8 @@ ccl_device_forceinline void integrator_state_read_ray(KernelGlobals kg,
 {
   ray->P = INTEGRATOR_STATE(state, ray, P);
   ray->D = INTEGRATOR_STATE(state, ray, D);
-  ray->t = INTEGRATOR_STATE(state, ray, t);
+  ray->tmin = INTEGRATOR_STATE(state, ray, tmin);
+  ray->tmax = INTEGRATOR_STATE(state, ray, tmax);
   ray->time = INTEGRATOR_STATE(state, ray, time);
   ray->dP = INTEGRATOR_STATE(state, ray, dP);
   ray->dD = INTEGRATOR_STATE(state, ray, dD);
@@ -42,7 +44,8 @@ ccl_device_forceinline void integrator_state_write_shadow_ray(
 {
   INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray->P;
   INTEGRATOR_STATE_WRITE(state, shadow_ray, D) = ray->D;
-  INTEGRATOR_STATE_WRITE(state, shadow_ray, t) = ray->t;
+  INTEGRATOR_STATE_WRITE(state, shadow_ray, tmin) = ray->tmin;
+  INTEGRATOR_STATE_WRITE(state, shadow_ray, tmax) = ray->tmax;
   INTEGRATOR_STATE_WRITE(state, shadow_ray, time) = ray->time;
   INTEGRATOR_STATE_WRITE(state, shadow_ray, dP) = ray->dP;
 }
@@ -53,7 +56,8 @@ ccl_device_forceinline void integrator_state_read_shadow_ray(KernelGlobals kg,
 {
   ray->P = INTEGRATOR_STATE(state, shadow_ray, P);
   ray->D = INTEGRATOR_STATE(state, shadow_ray, D);
-  ray->t = INTEGRATOR_STATE(state, shadow_ray, t);
+  ray->tmin = INTEGRATOR_STATE(state, shadow_ray, tmin);
+  ray->tmax = INTEGRATOR_STATE(state, shadow_ray, tmax);
   ray->time = INTEGRATOR_STATE(state, shadow_ray, time);
   ray->dP = INTEGRATOR_STATE(state, shadow_ray, dP);
   ray->dD = differential_zero_compact();
@@ -334,7 +338,7 @@ ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGl
   return to_state;
 }
 
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 ccl_device_inline int integrator_state_bounce(ConstIntegratorState state, const int)
 {
   return INTEGRATOR_STATE(state, path, bounce);
diff --git a/intern/cycles/kernel/integrator/subsurface.h b/intern/cycles/kernel/integrator/subsurface.h
index b449f807290..15c2cb1c708 100644
--- a/intern/cycles/kernel/integrator/subsurface.h
+++ b/intern/cycles/kernel/integrator/subsurface.h
@@ -15,9 +15,9 @@
 
 #include "kernel/integrator/intersect_volume_stack.h"
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
 #include "kernel/integrator/subsurface_disk.h"
 #include "kernel/integrator/subsurface_random_walk.h"
+#include "kernel/integrator/surface_shader.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,7 +38,8 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
   /* Setup ray into surface. */
   INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
   INTEGRATOR_STATE_WRITE(state, ray, D) = bssrdf->N;
-  INTEGRATOR_STATE_WRITE(state, ray, t) = FLT_MAX;
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
   INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
   INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_zero_compact();
 
@@ -50,12 +51,10 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
                                                                  PATH_RAY_SUBSURFACE_RANDOM_WALK);
 
   /* Compute weight, optionally including Fresnel from entry point. */
-  float3 weight = shader_bssrdf_sample_weight(sd, sc);
-#  ifdef __PRINCIPLED__
+  Spectrum weight = surface_shader_bssrdf_sample_weight(sd, sc);
   if (bssrdf->roughness != FLT_MAX) {
     path_flag |= PATH_RAY_SUBSURFACE_USE_FRESNEL;
   }
-#  endif
 
   if (sd->flag & SD_BACKFACING) {
     path_flag |= PATH_RAY_SUBSURFACE_BACKFACING;
@@ -69,8 +68,8 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
 
   if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
     if (INTEGRATOR_STATE(state, path, bounce) == 0) {
-      INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_float3();
-      INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_float3();
+      INTEGRATOR_STATE_WRITE(state, path, pass_diffuse_weight) = one_spectrum();
+      INTEGRATOR_STATE_WRITE(state, path, pass_glossy_weight) = zero_spectrum();
     }
   }
 
@@ -90,7 +89,7 @@ ccl_device void subsurface_shader_data_setup(KernelGlobals kg,
   /* Get bump mapped normal from shader evaluation at exit point. */
   float3 N = sd->N;
   if (sd->flag & SD_HAS_BSSRDF_BUMP) {
-    N = shader_bssrdf_normal(sd);
+    N = surface_shader_bssrdf_normal(sd);
   }
 
   /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. */
@@ -98,9 +97,8 @@ ccl_device void subsurface_shader_data_setup(KernelGlobals kg,
   sd->num_closure = 0;
   sd->num_closure_left = kernel_data.max_closures;
 
-  const float3 weight = one_float3();
+  const Spectrum weight = one_spectrum();
 
-#  ifdef __PRINCIPLED__
   if (path_flag & PATH_RAY_SUBSURFACE_USE_FRESNEL) {
     ccl_private PrincipledDiffuseBsdf *bsdf = (ccl_private PrincipledDiffuseBsdf *)bsdf_alloc(
         sd, sizeof(PrincipledDiffuseBsdf), weight);
@@ -111,9 +109,7 @@ ccl_device void subsurface_shader_data_setup(KernelGlobals kg,
       sd->flag |= bsdf_principled_diffuse_setup(bsdf, PRINCIPLED_DIFFUSE_LAMBERT_EXIT);
     }
   }
-  else
-#  endif /* __PRINCIPLED__ */
-  {
+  else {
     ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
         sd, sizeof(DiffuseBsdf), weight);
 
@@ -147,7 +143,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
   /* Update volume stack if needed. */
   if (kernel_data.integrator.use_volumes) {
     const int object = ss_isect.hits[0].object;
-    const int object_flag = kernel_tex_fetch(__object_flag, object);
+    const int object_flag = kernel_data_fetch(object_flag, object);
 
     if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) {
       float3 P = INTEGRATOR_STATE(state, ray, P);
@@ -160,7 +156,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
   /* Pretend ray is coming from the outside towards the exit point. This ensures
    * correct front/back facing normals.
    * TODO: find a more elegant solution? */
-  ray.P += ray.D * ray.t * 2.0f;
+  ray.P += ray.D * ray.tmax * 2.0f;
   ray.D = -ray.D;
 
   integrator_state_write_isect(kg, state, &ss_isect.hits[0]);
@@ -170,24 +166,30 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
   INTEGRATOR_STATE_WRITE(state, path, rng_offset) += PRNG_BOUNCE_NUM;
 
   const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
-  const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+  const int shader_flags = kernel_data_fetch(shaders, shader).flags;
   const int object_flags = intersection_get_object_flags(kg, &ss_isect.hits[0]);
   const bool use_caustics = kernel_data.integrator.use_caustics &&
                             (object_flags & SD_OBJECT_CAUSTICS);
   const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);
 
   if (use_caustics) {
-    INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+    integrator_path_next_sorted(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE,
                                 shader);
   }
   else if (use_raytrace_kernel) {
-    INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+    integrator_path_next_sorted(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                 shader);
   }
   else {
-    INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+    integrator_path_next_sorted(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                 DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
                                 shader);
   }
diff --git a/intern/cycles/kernel/integrator/subsurface_disk.h b/intern/cycles/kernel/integrator/subsurface_disk.h
index 34330671748..a44b6a74d7b 100644
--- a/intern/cycles/kernel/integrator/subsurface_disk.h
+++ b/intern/cycles/kernel/integrator/subsurface_disk.h
@@ -9,11 +9,11 @@ CCL_NAMESPACE_BEGIN
  * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
  */
 
-ccl_device_inline float3 subsurface_disk_eval(const float3 radius, float disk_r, float r)
+ccl_device_inline Spectrum subsurface_disk_eval(const Spectrum radius, float disk_r, float r)
 {
-  const float3 eval = bssrdf_eval(radius, r);
+  const Spectrum eval = bssrdf_eval(radius, r);
   const float pdf = bssrdf_pdf(radius, disk_r);
-  return (pdf > 0.0f) ? eval / pdf : zero_float3();
+  return (pdf > 0.0f) ? eval / pdf : zero_spectrum();
 }
 
 /* Subsurface scattering step, from a point on the surface to other
@@ -25,8 +25,7 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
                                        ccl_private LocalIntersection &ss_isect)
 
 {
-  float disk_u, disk_v;
-  path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &disk_u, &disk_v);
+  float2 rand_disk = path_state_rng_2D(kg, &rng_state, PRNG_SUBSURFACE_DISK);
 
   /* Read shading point info from integrator state. */
   const float3 P = INTEGRATOR_STATE(state, ray, P);
@@ -37,7 +36,7 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
 
   /* Read subsurface scattering parameters. */
-  const float3 radius = INTEGRATOR_STATE(state, subsurface, radius);
+  const Spectrum radius = INTEGRATOR_STATE(state, subsurface, radius);
 
   /* Pick random axis in local frame and point on disk. */
   float3 disk_N, disk_T, disk_B;
@@ -46,20 +45,20 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   disk_N = Ng;
   make_orthonormals(disk_N, &disk_T, &disk_B);
 
-  if (disk_v < 0.5f) {
+  if (rand_disk.y < 0.5f) {
     pick_pdf_N = 0.5f;
     pick_pdf_T = 0.25f;
     pick_pdf_B = 0.25f;
-    disk_v *= 2.0f;
+    rand_disk.y *= 2.0f;
   }
-  else if (disk_v < 0.75f) {
+  else if (rand_disk.y < 0.75f) {
     float3 tmp = disk_N;
     disk_N = disk_T;
     disk_T = tmp;
     pick_pdf_N = 0.25f;
     pick_pdf_T = 0.5f;
     pick_pdf_B = 0.25f;
-    disk_v = (disk_v - 0.5f) * 4.0f;
+    rand_disk.y = (rand_disk.y - 0.5f) * 4.0f;
   }
   else {
     float3 tmp = disk_N;
@@ -68,21 +67,22 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
     pick_pdf_N = 0.25f;
     pick_pdf_T = 0.25f;
     pick_pdf_B = 0.5f;
-    disk_v = (disk_v - 0.75f) * 4.0f;
+    rand_disk.y = (rand_disk.y - 0.75f) * 4.0f;
   }
 
   /* Sample point on disk. */
-  float phi = M_2PI_F * disk_v;
+  float phi = M_2PI_F * rand_disk.y;
   float disk_height, disk_r;
 
-  bssrdf_sample(radius, disk_u, &disk_r, &disk_height);
+  bssrdf_sample(radius, rand_disk.x, &disk_r, &disk_height);
 
   float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
 
   /* Create ray. */
   ray.P = P + disk_N * disk_height + disk_P;
   ray.D = -disk_N;
-  ray.t = 2.0f * disk_height;
+  ray.tmin = 0.0f;
+  ray.tmax = 2.0f * disk_height;
   ray.dP = ray_dP;
   ray.dD = differential_zero_compact();
   ray.time = time;
@@ -107,13 +107,13 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
    * traversal algorithm. */
   sort_intersections_and_normals(ss_isect.hits, ss_isect.Ng, num_eval_hits);
 
-  float3 weights[BSSRDF_MAX_HITS]; /* TODO: zero? */
+  Spectrum weights[BSSRDF_MAX_HITS]; /* TODO: zero? */
   float sum_weights = 0.0f;
 
   for (int hit = 0; hit < num_eval_hits; hit++) {
     /* Get geometric normal. */
     const int object = ss_isect.hits[hit].object;
-    const int object_flag = kernel_tex_fetch(__object_flag, object);
+    const int object_flag = kernel_data_fetch(object_flag, object);
     float3 hit_Ng = ss_isect.Ng[hit];
     if (path_flag & PATH_RAY_SUBSURFACE_BACKFACING) {
       hit_Ng = -hit_Ng;
@@ -125,17 +125,8 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
     if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
       /* Transform normal to world space. */
       Transform itfm;
-      Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
+      object_fetch_transform_motion_test(kg, object, time, &itfm);
       hit_Ng = normalize(transform_direction_transposed(&itfm, hit_Ng));
-
-      /* Transform t to world space, except for OptiX and MetalRT where it already is. */
-#ifdef __KERNEL_GPU_RAYTRACING__
-      (void)tfm;
-#else
-      float3 D = transform_direction(&itfm, ray.D);
-      D = normalize(D) * ss_isect.hits[hit].t;
-      ss_isect.hits[hit].t = len(transform_direction(&tfm, D));
-#endif
     }
 
     /* Quickly retrieve P and Ng without setting up ShaderData. */
@@ -158,7 +149,7 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
     const float r = len(hit_P - P);
 
     /* Evaluate profiles. */
-    const float3 weight = subsurface_disk_eval(radius, disk_r, r) * w;
+    const Spectrum weight = subsurface_disk_eval(radius, disk_r, r) * w;
 
     /* Store result. */
     ss_isect.Ng[hit] = hit_Ng;
@@ -171,11 +162,12 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
   }
 
   /* Use importance resampling, sampling one of the hits proportional to weight. */
-  const float r = lcg_step_float(&lcg_state) * sum_weights;
+  const float rand_resample = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_DISK_RESAMPLE);
+  const float r = rand_resample * sum_weights;
   float partial_sum = 0.0f;
 
   for (int hit = 0; hit < num_eval_hits; hit++) {
-    const float3 weight = weights[hit];
+    const Spectrum weight = weights[hit];
     const float sample_weight = average(fabs(weight));
     float next_sum = partial_sum + sample_weight;
 
@@ -188,7 +180,8 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
 
       ray.P = ray.P + ray.D * ss_isect.hits[hit].t;
       ray.D = ss_isect.Ng[hit];
-      ray.t = 1.0f;
+      ray.tmin = 0.0f;
+      ray.tmax = 1.0f;
       return true;
     }
 
diff --git a/intern/cycles/kernel/integrator/subsurface_random_walk.h b/intern/cycles/kernel/integrator/subsurface_random_walk.h
index b6cd4aae195..a6a59e286c9 100644
--- a/intern/cycles/kernel/integrator/subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -65,19 +65,20 @@ ccl_device void subsurface_random_walk_remap(const float albedo,
   *sigma_t = sigma_t_prime / (1.0f - g);
 }
 
-ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
-                                                    const float3 radius,
+ccl_device void subsurface_random_walk_coefficients(const Spectrum albedo,
+                                                    const Spectrum radius,
                                                     const float anisotropy,
-                                                    ccl_private float3 *sigma_t,
-                                                    ccl_private float3 *alpha,
-                                                    ccl_private float3 *throughput)
+                                                    ccl_private Spectrum *sigma_t,
+                                                    ccl_private Spectrum *alpha,
+                                                    ccl_private Spectrum *throughput)
 {
-  float sigma_t_x, sigma_t_y, sigma_t_z;
-  float alpha_x, alpha_y, alpha_z;
-
-  subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x);
-  subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y);
-  subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z);
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    subsurface_random_walk_remap(GET_SPECTRUM_CHANNEL(albedo, i),
+                                 GET_SPECTRUM_CHANNEL(radius, i),
+                                 anisotropy,
+                                 &GET_SPECTRUM_CHANNEL(*sigma_t, i),
+                                 &GET_SPECTRUM_CHANNEL(*alpha, i));
+  }
 
   /* Throughput already contains closure weight at this point, which includes the
    * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo
@@ -88,21 +89,12 @@ ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
    * infinite phase functions. To avoid a sharp discontinuity as we go from
    * such values to 0.0, increase alpha and reduce the throughput to compensate. */
   const float min_alpha = 0.2f;
-  if (alpha_x < min_alpha) {
-    (*throughput).x *= alpha_x / min_alpha;
-    alpha_x = min_alpha;
-  }
-  if (alpha_y < min_alpha) {
-    (*throughput).y *= alpha_y / min_alpha;
-    alpha_y = min_alpha;
-  }
-  if (alpha_z < min_alpha) {
-    (*throughput).z *= alpha_z / min_alpha;
-    alpha_z = min_alpha;
+  FOREACH_SPECTRUM_CHANNEL (i) {
+    if (GET_SPECTRUM_CHANNEL(*alpha, i) < min_alpha) {
+      GET_SPECTRUM_CHANNEL(*throughput, i) *= GET_SPECTRUM_CHANNEL(*alpha, i) / min_alpha;
+      GET_SPECTRUM_CHANNEL(*alpha, i) = min_alpha;
+    }
   }
-
-  *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
-  *alpha = make_float3(alpha_x, alpha_y, alpha_z);
 }
 
 /* References for Dwivedi sampling:
@@ -151,12 +143,12 @@ ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, f
   return dir.x * T + dir.y * B + dir.z * D;
 }
 
-ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
-                                                         float t,
-                                                         bool hit,
-                                                         ccl_private float3 *transmittance)
+ccl_device_forceinline Spectrum subsurface_random_walk_pdf(Spectrum sigma_t,
+                                                           float t,
+                                                           bool hit,
+                                                           ccl_private Spectrum *transmittance)
 {
-  float3 T = volume_color_transmittance(sigma_t, t);
+  Spectrum T = volume_color_transmittance(sigma_t, t);
   if (transmittance) {
     *transmittance = T;
   }
@@ -173,8 +165,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
                                               ccl_private Ray &ray,
                                               ccl_private LocalIntersection &ss_isect)
 {
-  float bssrdf_u, bssrdf_v;
-  path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+  const float2 rand_bsdf = path_state_rng_2D(kg, &rng_state, PRNG_SUBSURFACE_BSDF);
 
   const float3 P = INTEGRATOR_STATE(state, ray, P);
   const float3 N = INTEGRATOR_STATE(state, ray, D);
@@ -187,7 +178,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   /* Sample diffuse surface scatter into the object. */
   float3 D;
   float pdf;
-  sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf);
+  sample_cos_hemisphere(-N, rand_bsdf.x, rand_bsdf.y, &D, &pdf);
   if (dot(-Ng, D) <= 0.0f) {
     return false;
   }
@@ -195,7 +186,8 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   /* Setup ray. */
   ray.P = P;
   ray.D = D;
-  ray.t = FLT_MAX;
+  ray.tmin = 0.0f;
+  ray.tmax = FLT_MAX;
   ray.time = time;
   ray.dP = ray_dP;
   ray.dD = differential_zero_compact();
@@ -204,22 +196,16 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   ray.self.light_object = OBJECT_NONE;
   ray.self.light_prim = PRIM_NONE;
 
-#ifndef __KERNEL_GPU_RAYTRACING__
-  /* Compute or fetch object transforms. */
-  Transform ob_itfm ccl_optional_struct_init;
-  Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm);
-#endif
-
   /* Convert subsurface to volume coefficients.
    * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
-  const float3 albedo = INTEGRATOR_STATE(state, subsurface, albedo);
-  const float3 radius = INTEGRATOR_STATE(state, subsurface, radius);
+  const Spectrum albedo = INTEGRATOR_STATE(state, subsurface, albedo);
+  const Spectrum radius = INTEGRATOR_STATE(state, subsurface, radius);
   const float anisotropy = INTEGRATOR_STATE(state, subsurface, anisotropy);
 
-  float3 sigma_t, alpha;
-  float3 throughput = INTEGRATOR_STATE_WRITE(state, path, throughput);
+  Spectrum sigma_t, alpha;
+  Spectrum throughput = INTEGRATOR_STATE_WRITE(state, path, throughput);
   subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput);
-  float3 sigma_s = sigma_t * alpha;
+  Spectrum sigma_s = sigma_t * alpha;
 
   /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
    * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
@@ -229,7 +215,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
    * Since the strength of the guided sampling increases as alpha gets lower, using a value that
    * is too low results in fireflies while one that's too high just gives a bit more noise.
    * Therefore, the code here uses the highest of the three albedos to be safe. */
-  const float diffusion_length = diffusion_length_dwivedi(max3(alpha));
+  const float diffusion_length = diffusion_length_dwivedi(reduce_max(alpha));
 
   if (diffusion_length == 1.0f) {
     /* With specific values of alpha the length might become 1, which in asymptotic makes phase to
@@ -242,7 +228,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f));
 
   /* Modify state for RNGs, decorrelated from other paths. */
-  rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
+  rng_state.rng_hash = hash_hp_seeded_uint(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
 
   /* Random walk until we hit the surface again. */
   bool hit = false;
@@ -254,10 +240,10 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
   const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f));
 
 #ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
-  float3 sigma_s_star = sigma_s * (1.0f - anisotropy);
-  float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star;
-  float3 sigma_t_org = sigma_t;
-  float3 sigma_s_org = sigma_s;
+  Spectrum sigma_s_star = sigma_s * (1.0f - anisotropy);
+  Spectrum sigma_t_star = sigma_t - sigma_s + sigma_s_star;
+  Spectrum sigma_t_org = sigma_t;
+  Spectrum sigma_s_org = sigma_s;
   const float anisotropy_org = anisotropy;
   const float guided_fraction_org = guided_fraction;
 #endif
@@ -269,7 +255,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
 #ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
     // shadow with local variables according to depth
     float anisotropy, guided_fraction;
-    float3 sigma_s, sigma_t;
+    Spectrum sigma_s, sigma_t;
     if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) {
       anisotropy = anisotropy_org;
       guided_fraction = guided_fraction_org;
@@ -285,11 +271,11 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
 #endif
 
     /* Sample color channel, use MIS with balance heuristic. */
-    float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL);
-    float3 channel_pdf;
+    float rphase = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_PHASE_CHANNEL);
+    Spectrum channel_pdf;
     int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
     float sample_sigma_t = volume_channel_get(sigma_t, channel);
-    float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE);
+    float randt = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_SCATTER_DISTANCE);
 
     /* We need the result of the ray-cast to compute the full guided PDF, so just remember the
      * relevant terms to avoid recomputing them later. */
@@ -302,7 +288,8 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
     /* For the initial ray, we already know the direction, so just do classic distance sampling. */
     if (bounce > 0) {
       /* Decide whether we should use guided or classic sampling. */
-      bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction);
+      bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_GUIDE_STRATEGY) <
+                     guided_fraction);
 
       /* Determine if we want to sample away from the incoming interface.
        * This only happens if we found a nearby opposite interface, and the probability for it
@@ -316,27 +303,28 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
         float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance);
         backward_fraction = 1.0f /
                             (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length));
-        guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction;
+        guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_SUBSURFACE_GUIDE_DIRECTION) <
+                         backward_fraction;
       }
 
       /* Sample scattering direction. */
-      float scatter_u, scatter_v;
-      path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v);
+      const float2 rand_scatter = path_state_rng_2D(kg, &rng_state, PRNG_SUBSURFACE_BSDF);
       float cos_theta;
       float hg_pdf;
       if (guided) {
-        cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
+        cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, rand_scatter.x);
         /* The backwards guiding distribution is just mirrored along `sd->N`, so swapping the
          * sign here is enough to sample from that instead. */
         if (guide_backward) {
           cos_theta = -cos_theta;
         }
-        float3 newD = direction_from_cosine(N, cos_theta, scatter_v);
+        float3 newD = direction_from_cosine(N, cos_theta, rand_scatter.y);
         hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy);
         ray.D = newD;
       }
       else {
-        float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf);
+        float3 newD = henyey_greenstrein_sample(
+            ray.D, anisotropy, rand_scatter.x, rand_scatter.y, &hg_pdf);
         cos_theta = dot(newD, N);
         ray.D = newD;
       }
@@ -370,10 +358,10 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
      * chance of connecting to it.
      * TODO: Maybe use less than 10 times the mean free path? */
     if (bounce == 0) {
-      ray.t = max(t, 10.0f / (min3(sigma_t)));
+      ray.tmax = max(t, 10.0f / (reduce_min(sigma_t)));
     }
     else {
-      ray.t = t;
+      ray.tmax = t;
       /* After the first bounce the object can intersect the same surface again */
       ray.self.object = OBJECT_NONE;
       ray.self.prim = PRIM_NONE;
@@ -382,46 +370,39 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
     hit = (ss_isect.num_hits > 0);
 
     if (hit) {
-#ifdef __KERNEL_GPU_RAYTRACING__
-      /* t is always in world space with OptiX and MetalRT. */
-      ray.t = ss_isect.hits[0].t;
-#else
-      /* Compute world space distance to surface hit. */
-      float3 D = transform_direction(&ob_itfm, ray.D);
-      D = normalize(D) * ss_isect.hits[0].t;
-      ray.t = len(transform_direction(&ob_tfm, D));
-#endif
+      ray.tmax = ss_isect.hits[0].t;
     }
 
     if (bounce == 0) {
       /* Check if we hit the opposite side. */
       if (hit) {
         have_opposite_interface = true;
-        opposite_distance = dot(ray.P + ray.t * ray.D - P, -N);
+        opposite_distance = dot(ray.P + ray.tmax * ray.D - P, -N);
       }
       /* Apart from the opposite side check, we were supposed to only trace up to distance t,
        * so check if there would have been a hit in that case. */
-      hit = ray.t < t;
+      hit = ray.tmax < t;
     }
 
     /* Use the distance to the exit point for the throughput update if we found one. */
     if (hit) {
-      t = ray.t;
+      t = ray.tmax;
     }
 
     /* Advance to new scatter location. */
     ray.P += t * ray.D;
 
-    float3 transmittance;
-    float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
+    Spectrum transmittance;
+    Spectrum pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
     if (bounce > 0) {
       /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
-      float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
+      Spectrum guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
 
       if (have_opposite_interface) {
         /* First step of MIS: Depending on geometry we might have two methods for guided
          * sampling, so perform MIS between them. */
-        float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
+        Spectrum back_pdf = subsurface_random_walk_pdf(
+            backward_stretching * sigma_t, t, hit, NULL);
         guided_pdf = mix(
             guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
       }
@@ -443,16 +424,14 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
       /* If we hit the surface, we are done. */
       break;
     }
-    else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
-             throughput.y < VOLUME_THROUGHPUT_EPSILON &&
-             throughput.z < VOLUME_THROUGHPUT_EPSILON) {
+    else if (reduce_max(throughput) < VOLUME_THROUGHPUT_EPSILON) {
       /* Avoid unnecessary work and precision issue when throughput gets really small. */
       break;
     }
   }
 
   if (hit) {
-    kernel_assert(isfinite3_safe(throughput));
+    kernel_assert(isfinite_safe(throughput));
     INTEGRATOR_STATE_WRITE(state, path, throughput) = throughput;
   }
 
diff --git a/intern/cycles/kernel/integrator/surface_shader.h b/intern/cycles/kernel/integrator/surface_shader.h
new file mode 100644
index 00000000000..64b5556f7e9
--- /dev/null
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -0,0 +1,588 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+/* Functions to evaluate shaders. */
+
+#pragma once
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/emissive.h"
+
+#ifdef __SVM__
+#  include "kernel/svm/svm.h"
+#endif
+#ifdef __OSL__
+#  include "kernel/osl/osl.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void surface_shader_prepare_closures(KernelGlobals kg,
+                                                       ConstIntegratorState state,
+                                                       ccl_private ShaderData *sd,
+                                                       const uint32_t path_flag)
+{
+  /* Filter out closures selected by the integrator's filter_closures bit-mask. */
+  if (kernel_data.integrator.filter_closures) {
+    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_EMISSION) {
+      sd->closure_emission_background = zero_spectrum();
+    }
+
+    if (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIRECT_LIGHT) {
+      sd->flag &= ~SD_BSDF_HAS_EVAL;
+    }
+
+    if (path_flag & PATH_RAY_CAMERA) {
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+
+        if ((CLOSURE_IS_BSDF_DIFFUSE(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_DIFFUSE)) ||
+            (CLOSURE_IS_BSDF_GLOSSY(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_GLOSSY)) ||
+            (CLOSURE_IS_BSDF_TRANSMISSION(sc->type) &&
+             (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSMISSION))) {
+          sc->type = CLOSURE_NONE_ID;
+          sc->sample_weight = 0.0f;
+        }
+        else if ((CLOSURE_IS_BSDF_TRANSPARENT(sc->type) &&
+                  (kernel_data.integrator.filter_closures & FILTER_CLOSURE_TRANSPARENT))) {
+          sc->type = CLOSURE_HOLDOUT_ID;
+          sc->sample_weight = 0.0f;
+          sd->flag |= SD_HOLDOUT;
+        }
+      }
+    }
+  }
+
+  /* Defensive sampling.
+   *
+   * We can likely also do defensive sampling at deeper bounces, particularly
+   * for cases like a perfect mirror but possibly also others. This will need
+   * a good heuristic. */
+  if (INTEGRATOR_STATE(state, path, bounce) + INTEGRATOR_STATE(state, path, transparent_bounce) ==
+          0 &&
+      sd->num_closure > 1) {
+    float sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sc->sample_weight = max(sc->sample_weight, 0.125f * sum); /* >= 1/8 of the sum. */
+      }
+    }
+  }
+
+  /* Filter glossy.
+   *
+   * Blur BSDFs after bounces, for rays that have a small likelihood of
+   * following this particular path (diffuse, rough glossy). */
+  if (kernel_data.integrator.filter_glossy != FLT_MAX
+#ifdef __MNEE__
+      && !(INTEGRATOR_STATE(state, path, mnee) & PATH_MNEE_VALID)
+#endif
+  ) {
+    float blur_pdf = kernel_data.integrator.filter_glossy *
+                     INTEGRATOR_STATE(state, path, min_ray_pdf);
+
+    if (blur_pdf < 1.0f) {
+      float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
+
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+        if (CLOSURE_IS_BSDF(sc->type)) {
+          bsdf_blur(kg, sc, blur_roughness);
+        }
+      }
+    }
+  }
+}
+
+/* BSDF */
+
+ccl_device_inline bool surface_shader_is_transmission(ccl_private const ShaderData *sd,
+                                                      const float3 omega_in)
+{
+  return dot(sd->N, omega_in) < 0.0f; /* omega_in is in the hemisphere opposite sd->N. */
+}
+
+ccl_device_forceinline bool _surface_shader_exclude(ClosureType type, uint light_shader_flags)
+{
+  if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
+    return false; /* Early out; presumably EXCLUDE_ANY covers every bit tested below. */
+  }
+  if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
+    if (CLOSURE_IS_BSDF_DIFFUSE(type)) {
+      return true;
+    }
+  }
+  if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
+    if (CLOSURE_IS_BSDF_GLOSSY(type)) {
+      return true;
+    }
+  }
+  if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
+    if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+ccl_device_inline float _surface_shader_bsdf_eval_mis(KernelGlobals kg,
+                                                      ccl_private ShaderData *sd,
+                                                      const float3 omega_in,
+                                                      const bool is_transmission,
+                                                      ccl_private const ShaderClosure *skip_sc,
+                                                      ccl_private BsdfEval *result_eval,
+                                                      float sum_pdf,
+                                                      float sum_sample_weight,
+                                                      const uint light_shader_flags)
+{
+  /* This is the Veach one-sample model with balance heuristic,
+   * some PDF factors drop out when using balance heuristic weighting. */
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (sc == skip_sc) {
+      continue;
+    }
+
+    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+      if (CLOSURE_IS_BSDF(sc->type) && !_surface_shader_exclude(sc->type, light_shader_flags)) {
+        float bsdf_pdf = 0.0f;
+        Spectrum eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
+
+        if (bsdf_pdf != 0.0f) {
+          bsdf_eval_accum(result_eval, sc->type, eval * sc->weight);
+          sum_pdf += bsdf_pdf * sc->sample_weight;
+        }
+      }
+
+      sum_sample_weight += sc->sample_weight; /* Counted even if not evaluated above. */
+    }
+  }
+
+  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+}
+
+#ifndef __KERNEL_CUDA__
+ccl_device
+#else
+ccl_device_inline
+#endif
+    float
+    surface_shader_bsdf_eval(KernelGlobals kg,
+                             ccl_private ShaderData *sd,
+                             const float3 omega_in,
+                             const bool is_transmission,
+                             ccl_private BsdfEval *bsdf_eval,
+                             const uint light_shader_flags)
+{
+  bsdf_eval_init(bsdf_eval, CLOSURE_NONE_ID, zero_spectrum()); /* Start from a zero sum. */
+
+  return _surface_shader_bsdf_eval_mis(
+      kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
+}
+
+/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
+ccl_device_inline ccl_private const ShaderClosure *surface_shader_bsdf_bssrdf_pick(
+    ccl_private const ShaderData *ccl_restrict sd, ccl_private float2 *rand_bsdf)
+{
+  int sampled = 0; /* Closure 0 is returned when the loop below picks nothing. */
+
+  if (sd->num_closure > 1) {
+    /* Pick a BSDF or BSSRDF based on sample weights. */
+    float sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+
+    float r = (*rand_bsdf).x * sum;
+    float partial_sum = 0.0f;
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        float next_sum = partial_sum + sc->sample_weight;
+
+        if (r < next_sum) {
+          sampled = i;
+
+          /* Rescale to reuse for direction sample, to better preserve stratification. */
+          (*rand_bsdf).x = (r - partial_sum) / sc->sample_weight;
+          break;
+        }
+
+        partial_sum = next_sum;
+      }
+    }
+  }
+
+  return &sd->closure[sampled];
+}
+
+/* Return weight for picked BSSRDF, scaled by (sum of sample weights / its sample weight). */
+ccl_device_inline Spectrum
+surface_shader_bssrdf_sample_weight(ccl_private const ShaderData *ccl_restrict sd,
+                                    ccl_private const ShaderClosure *ccl_restrict bssrdf_sc)
+{
+  Spectrum weight = bssrdf_sc->weight;
+
+  if (sd->num_closure > 1) {
+    float sum = 0.0f;
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+        sum += sc->sample_weight;
+      }
+    }
+    weight *= sum / bssrdf_sc->sample_weight;
+  }
+
+  return weight;
+}
+
+/* Sample direction for picked BSDF, and return evaluation and pdf for all
+ * BSDFs combined using MIS. */
+ccl_device int surface_shader_bsdf_sample_closure(KernelGlobals kg,
+                                                  ccl_private ShaderData *sd,
+                                                  ccl_private const ShaderClosure *sc,
+                                                  const float2 rand_bsdf,
+                                                  ccl_private BsdfEval *bsdf_eval,
+                                                  ccl_private float3 *omega_in,
+                                                  ccl_private float *pdf)
+{
+  /* BSSRDF should already have been handled elsewhere. */
+  kernel_assert(CLOSURE_IS_BSDF(sc->type));
+
+  int label;
+  Spectrum eval = zero_spectrum();
+
+  *pdf = 0.0f;
+  label = bsdf_sample(kg, sd, sc, rand_bsdf.x, rand_bsdf.y, &eval, omega_in, pdf);
+
+  if (*pdf != 0.0f) {
+    bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight);
+
+    if (sd->num_closure > 1) {
+      const bool is_transmission = surface_shader_is_transmission(sd, *omega_in);
+      float sweight = sc->sample_weight;
+      *pdf = _surface_shader_bsdf_eval_mis(
+          kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
+    }
+  }
+
+  return label;
+}
+
+ccl_device float surface_shader_average_roughness(ccl_private const ShaderData *sd)
+{
+  float roughness = 0.0f;
+  float sum_weight = 0.0f;
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF(sc->type)) {
+      /* sqrt once to undo the squaring from multiplying roughness on the
+       * two axes, and once for the squared roughness convention. */
+      float weight = fabsf(average(sc->weight));
+      roughness += weight * sqrtf(safe_sqrtf(bsdf_get_roughness_squared(sc)));
+      sum_weight += weight;
+    }
+  }
+
+  return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
+}
+
+ccl_device Spectrum surface_shader_transparency(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  if (sd->flag & SD_HAS_ONLY_VOLUME) {
+    return one_spectrum();
+  }
+  else if (sd->flag & SD_TRANSPARENT) {
+    return sd->closure_transparent_extinction;
+  }
+  else {
+    return zero_spectrum();
+  }
+}
+
+ccl_device void surface_shader_disable_transparency(KernelGlobals kg, ccl_private ShaderData *sd)
+{
+  if (sd->flag & SD_TRANSPARENT) {
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private ShaderClosure *sc = &sd->closure[i];
+
+      if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+        sc->sample_weight = 0.0f;
+        sc->weight = zero_spectrum();
+      }
+    }
+
+    sd->flag &= ~SD_TRANSPARENT;
+  }
+}
+
+ccl_device Spectrum surface_shader_alpha(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  Spectrum alpha = one_spectrum() - surface_shader_transparency(kg, sd);
+
+  alpha = saturate(alpha);
+
+  return alpha;
+}
+
+ccl_device Spectrum surface_shader_diffuse(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  Spectrum eval = zero_spectrum();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type))
+      eval += sc->weight;
+  }
+
+  return eval;
+}
+
+ccl_device Spectrum surface_shader_glossy(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  Spectrum eval = zero_spectrum();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
+      eval += sc->weight;
+  }
+
+  return eval;
+}
+
+ccl_device Spectrum surface_shader_transmission(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  Spectrum eval = zero_spectrum();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
+      eval += sc->weight;
+  }
+
+  return eval;
+}
+
+ccl_device float3 surface_shader_average_normal(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  float3 N = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+    if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+      N += sc->N * fabsf(average(sc->weight));
+  }
+
+  return (is_zero(N)) ? sd->N : normalize(N);
+}
+
+ccl_device Spectrum surface_shader_ao(KernelGlobals kg,
+                                      ccl_private const ShaderData *sd,
+                                      const float ao_factor,
+                                      ccl_private float3 *N_)
+{
+  Spectrum eval = zero_spectrum();
+  float3 N = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
+      ccl_private const DiffuseBsdf *bsdf = (ccl_private const DiffuseBsdf *)sc;
+      eval += sc->weight * ao_factor;
+      N += bsdf->N * fabsf(average(sc->weight));
+    }
+  }
+
+  *N_ = (is_zero(N)) ? sd->N : normalize(N);
+  return eval;
+}
+
+#ifdef __SUBSURFACE__
+ccl_device float3 surface_shader_bssrdf_normal(ccl_private const ShaderData *sd)
+{
+  float3 N = zero_float3();
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *sc = &sd->closure[i];
+
+    if (CLOSURE_IS_BSSRDF(sc->type)) {
+      ccl_private const Bssrdf *bssrdf = (ccl_private const Bssrdf *)sc;
+      float avg_weight = fabsf(average(sc->weight));
+
+      N += bssrdf->N * avg_weight;
+    }
+  }
+
+  return (is_zero(N)) ? sd->N : normalize(N);
+}
+#endif /* __SUBSURFACE__ */
+
+/* Constant emission optimization */
+
+ccl_device bool surface_shader_constant_emission(KernelGlobals kg,
+                                                 int shader,
+                                                 ccl_private Spectrum *eval)
+{
+  int shader_index = shader & SHADER_MASK;
+  int shader_flag = kernel_data_fetch(shaders, shader_index).flags;
+
+  if (shader_flag & SD_HAS_CONSTANT_EMISSION) {
+    const float3 emission_rgb = make_float3(
+        kernel_data_fetch(shaders, shader_index).constant_emission[0],
+        kernel_data_fetch(shaders, shader_index).constant_emission[1],
+        kernel_data_fetch(shaders, shader_index).constant_emission[2]);
+    *eval = rgb_to_spectrum(emission_rgb);
+
+    return true;
+  }
+
+  return false;
+}
+
+/* Background */
+
+ccl_device Spectrum surface_shader_background(ccl_private const ShaderData *sd)
+{
+  if (sd->flag & SD_EMISSION) {
+    return sd->closure_emission_background;
+  }
+  else {
+    return zero_spectrum();
+  }
+}
+
+/* Emission */
+
+ccl_device Spectrum surface_shader_emission(ccl_private const ShaderData *sd)
+{
+  if (sd->flag & SD_EMISSION) {
+    return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
+  }
+  else {
+    return zero_spectrum();
+  }
+}
+
+/* Holdout */
+
+ccl_device Spectrum surface_shader_apply_holdout(KernelGlobals kg, ccl_private ShaderData *sd)
+{
+  Spectrum weight = zero_spectrum();
+
+  /* For objects marked as holdout, preserve transparency and remove all other
+   * closures, replacing them with a holdout weight. */
+  if (sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+    if ((sd->flag & SD_TRANSPARENT) && !(sd->flag & SD_HAS_ONLY_VOLUME)) {
+      weight = one_spectrum() - sd->closure_transparent_extinction;
+
+      for (int i = 0; i < sd->num_closure; i++) {
+        ccl_private ShaderClosure *sc = &sd->closure[i];
+        if (!CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
+          sc->type = NBUILTIN_CLOSURES;
+        }
+      }
+
+      sd->flag &= ~(SD_CLOSURE_FLAGS - (SD_TRANSPARENT | SD_BSDF));
+    }
+    else {
+      weight = one_spectrum();
+    }
+  }
+  else {
+    for (int i = 0; i < sd->num_closure; i++) {
+      ccl_private const ShaderClosure *sc = &sd->closure[i];
+      if (CLOSURE_IS_HOLDOUT(sc->type)) {
+        weight += sc->weight;
+      }
+    }
+  }
+
+  return weight;
+}
+
+/* Surface Evaluation */
+
+template<uint node_feature_mask, typename ConstIntegratorGenericState>
+ccl_device void surface_shader_eval(KernelGlobals kg,
+                                    ConstIntegratorGenericState state,
+                                    ccl_private ShaderData *ccl_restrict sd,
+                                    ccl_global float *ccl_restrict buffer,
+                                    uint32_t path_flag,
+                                    bool use_caustics_storage = false)
+{
+  /* If path is being terminated, we are tracing a shadow ray or evaluating
+   * emission, then we don't need to store closures. The emission and shadow
+   * shader data also do not have a closure array to save GPU memory. */
+  int max_closures;
+  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+    max_closures = 0;
+  }
+  else {
+    max_closures = use_caustics_storage ? CAUSTICS_MAX_CLOSURE : kernel_data.max_closures;
+  }
+
+  sd->num_closure = 0;
+  sd->num_closure_left = max_closures;
+
+#ifdef __OSL__
+  if (kg->osl) {
+    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
+      OSLShader::eval_background(kg, state, sd, path_flag);
+    }
+    else {
+      OSLShader::eval_surface(kg, state, sd, path_flag);
+    }
+  }
+  else
+#endif
+  {
+#ifdef __SVM__
+    svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(kg, state, sd, buffer, path_flag);
+#else
+    if (sd->object == OBJECT_NONE) {
+      sd->closure_emission_background = make_spectrum(0.8f);
+      sd->flag |= SD_EMISSION;
+    }
+    else {
+      ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
+          sd, sizeof(DiffuseBsdf), make_spectrum(0.8f));
+      if (bsdf != NULL) {
+        bsdf->N = sd->N;
+        sd->flag |= bsdf_diffuse_setup(bsdf);
+      }
+    }
+#endif
+  }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/volume_shader.h b/intern/cycles/kernel/integrator/volume_shader.h
new file mode 100644
index 00000000000..31039bfdcf5
--- /dev/null
+++ b/intern/cycles/kernel/integrator/volume_shader.h
@@ -0,0 +1,354 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+/* Volume shader evaluation and sampling. */
+
+#pragma once
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/emissive.h"
+
+#ifdef __SVM__
+#  include "kernel/svm/svm.h"
+#endif
+#ifdef __OSL__
+#  include "kernel/osl/osl.h"
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME__
+
+/* Merging */
+ccl_device_inline void volume_shader_merge_closures(ccl_private ShaderData *sd)
+{
+  /* Merge identical closures to save closure space with stacked volumes. */
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private ShaderClosure *sci = &sd->closure[i];
+
+    if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+      continue;
+    }
+
+    for (int j = i + 1; j < sd->num_closure; j++) {
+      ccl_private ShaderClosure *scj = &sd->closure[j];
+      if (sci->type != scj->type) {
+        continue;
+      }
+
+      ccl_private const HenyeyGreensteinVolume *hgi = (ccl_private const HenyeyGreensteinVolume *)
+          sci;
+      ccl_private const HenyeyGreensteinVolume *hgj = (ccl_private const HenyeyGreensteinVolume *)
+          scj;
+      if (!(hgi->g == hgj->g)) {
+        continue;
+      }
+
+      sci->weight += scj->weight;
+      sci->sample_weight += scj->sample_weight;
+
+      int size = sd->num_closure - (j + 1);
+      if (size > 0) {
+        for (int k = 0; k < size; k++) {
+          scj[k] = scj[k + 1];
+        }
+      }
+
+      sd->num_closure--;
+      kernel_assert(sd->num_closure >= 0);
+      j--;
+    }
+  }
+}
+
+ccl_device_inline void volume_shader_copy_phases(ccl_private ShaderVolumePhases *ccl_restrict
+                                                     phases,
+                                                 ccl_private const ShaderData *ccl_restrict sd)
+{
+  phases->num_closure = 0;
+
+  for (int i = 0; i < sd->num_closure; i++) {
+    ccl_private const ShaderClosure *from_sc = &sd->closure[i];
+    ccl_private const HenyeyGreensteinVolume *from_hg =
+        (ccl_private const HenyeyGreensteinVolume *)from_sc;
+
+    if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+      ccl_private ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
+
+      to_sc->weight = from_sc->weight;
+      to_sc->sample_weight = from_sc->sample_weight;
+      to_sc->g = from_hg->g;
+      phases->num_closure++;
+      if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
+        break;
+      }
+    }
+  }
+}
+
+ccl_device_inline float _volume_shader_phase_eval_mis(ccl_private const ShaderData *sd,
+                                                      ccl_private const ShaderVolumePhases *phases,
+                                                      const float3 omega_in,
+                                                      int skip_phase,
+                                                      ccl_private BsdfEval *result_eval,
+                                                      float sum_pdf,
+                                                      float sum_sample_weight)
+{
+  for (int i = 0; i < phases->num_closure; i++) {
+    if (i == skip_phase)
+      continue;
+
+    ccl_private const ShaderVolumeClosure *svc = &phases->closure[i];
+    float phase_pdf = 0.0f;
+    Spectrum eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
+
+    if (phase_pdf != 0.0f) {
+      bsdf_eval_accum(result_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
+      sum_pdf += phase_pdf * svc->sample_weight;
+    }
+
+    sum_sample_weight += svc->sample_weight;
+  }
+
+  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+}
+
+ccl_device float volume_shader_phase_eval(KernelGlobals kg,
+                                          ccl_private const ShaderData *sd,
+                                          ccl_private const ShaderVolumePhases *phases,
+                                          const float3 omega_in,
+                                          ccl_private BsdfEval *phase_eval)
+{
+  bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, zero_spectrum());
+
+  return _volume_shader_phase_eval_mis(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
+}
+
+ccl_device int volume_shader_phase_sample(KernelGlobals kg,
+                                          ccl_private const ShaderData *sd,
+                                          ccl_private const ShaderVolumePhases *phases,
+                                          float2 rand_phase,
+                                          ccl_private BsdfEval *phase_eval,
+                                          ccl_private float3 *omega_in,
+                                          ccl_private float *pdf)
+{
+  int sampled = 0;
+
+  if (phases->num_closure > 1) {
+    /* Pick a phase closure based on sample weights. */
+    float sum = 0.0f;
+
+    for (sampled = 0; sampled < phases->num_closure; sampled++) {
+      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
+      sum += svc->sample_weight;
+    }
+
+    float r = rand_phase.x * sum;
+    float partial_sum = 0.0f;
+
+    for (sampled = 0; sampled < phases->num_closure; sampled++) {
+      ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
+      float next_sum = partial_sum + svc->sample_weight;
+
+      if (r <= next_sum) {
+        /* Rescale to reuse for phase direction sample. */
+        rand_phase.x = (r - partial_sum) / svc->sample_weight;
+        break;
+      }
+
+      partial_sum = next_sum;
+    }
+
+    if (sampled == phases->num_closure) {
+      *pdf = 0.0f;
+      return LABEL_NONE;
+    }
+  }
+
+  /* TODO: this isn't quite correct, we don't weight anisotropy properly
+   * depending on color channels, even if this is perhaps not a common case. */
+  ccl_private const ShaderVolumeClosure *svc = &phases->closure[sampled];
+  int label;
+  Spectrum eval = zero_spectrum();
+
+  *pdf = 0.0f;
+  label = volume_phase_sample(sd, svc, rand_phase.x, rand_phase.y, &eval, omega_in, pdf);
+
+  if (*pdf != 0.0f) {
+    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
+  }
+
+  return label;
+}
+
+ccl_device int volume_shader_phase_sample_closure(KernelGlobals kg,
+                                                  ccl_private const ShaderData *sd,
+                                                  ccl_private const ShaderVolumeClosure *sc,
+                                                  const float2 rand_phase,
+                                                  ccl_private BsdfEval *phase_eval,
+                                                  ccl_private float3 *omega_in,
+                                                  ccl_private float *pdf)
+{
+  int label;
+  Spectrum eval = zero_spectrum();
+
+  *pdf = 0.0f;
+  label = volume_phase_sample(sd, sc, rand_phase.x, rand_phase.y, &eval, omega_in, pdf);
+
+  if (*pdf != 0.0f)
+    bsdf_eval_init(phase_eval, CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID, eval);
+
+  return label;
+}
+
+/* Motion Blur */
+
+#  ifdef __OBJECT_MOTION__
+ccl_device_inline void volume_shader_motion_blur(KernelGlobals kg,
+                                                 ccl_private ShaderData *ccl_restrict sd)
+{
+  if ((sd->object_flag & SD_OBJECT_HAS_VOLUME_MOTION) == 0) {
+    return;
+  }
+
+  AttributeDescriptor v_desc = find_attribute(kg, sd, ATTR_STD_VOLUME_VELOCITY);
+  kernel_assert(v_desc.offset != ATTR_STD_NOT_FOUND);
+
+  const float3 P = sd->P;
+  const float velocity_scale = kernel_data_fetch(objects, sd->object).velocity_scale;
+  const float time_offset = kernel_data.cam.motion_position == MOTION_POSITION_CENTER ? 0.5f :
+                                                                                        0.0f;
+  const float time = kernel_data.cam.motion_position == MOTION_POSITION_END ?
+                         (1.0f - kernel_data.cam.shuttertime) + sd->time :
+                         sd->time;
+
+  /* Use a 1st order semi-Lagrangian advection scheme to estimate what volume quantity
+   * existed, or will exist, at the given time:
+   *
+   * `phi(x, T) = phi(x - (T - t) * u(x, T), t)`
+   *
+   * where
+   *
+   * x : position
+   * T : super-sampled time (or ray time)
+   * t : current time of the simulation (in rendering we assume this is center frame with
+   * relative time = 0)
+   * phi : the volume quantity
+   * u : the velocity field
+   *
+   * But first we need to determine the velocity field `u(x, T)`, which we can estimate also
+   * using semi-Lagrangian advection.
+   *
+   * `u(x, T) = u(x - (T - t) * u(x, T), t)`
+   *
+   * This is the typical way to model self-advection in fluid dynamics, however, we do not
+   * account for other forces affecting the velocity during simulation (pressure, buoyancy,
+   * etc.): this gives a linear interpolation when fluids are mostly "curvy". For better
+   * results, a higher order interpolation scheme can be used (at the cost of more lookups),
+   * or an interpolation of the velocity fields for the previous and next frames could also
+   * be used to estimate `u(x, T)` (which will cost more memory and lookups).
+   *
+   * References:
+   * "Eulerian Motion Blur", Kim and Ko, 2007
+   * "Production Volume Rendering", Wrenninge et al., 2012
+   */
+
+  /* Find velocity. */
+  float3 velocity = primitive_volume_attribute_float3(kg, sd, v_desc);
+  object_dir_transform(kg, sd, &velocity);
+
+  /* Find advected P. */
+  sd->P = P - (time - time_offset) * velocity_scale * velocity;
+
+  /* Find advected velocity. */
+  velocity = primitive_volume_attribute_float3(kg, sd, v_desc);
+  object_dir_transform(kg, sd, &velocity);
+
+  /* Find final advected P, using the advected velocity. */
+  sd->P = P - (time - time_offset) * velocity_scale * velocity;
+}
+#  endif
+
+/* Volume Evaluation */
+
+template<const bool shadow, typename StackReadOp, typename ConstIntegratorGenericState>
+ccl_device_inline void volume_shader_eval(KernelGlobals kg,
+                                          ConstIntegratorGenericState state,
+                                          ccl_private ShaderData *ccl_restrict sd,
+                                          const uint32_t path_flag,
+                                          StackReadOp stack_read)
+{
+  /* If path is being terminated, we are tracing a shadow ray or evaluating
+   * emission, then we don't need to store closures. The emission and shadow
+   * shader data also do not have a closure array to save GPU memory. */
+  int max_closures;
+  if (path_flag & (PATH_RAY_TERMINATE | PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+    max_closures = 0;
+  }
+  else {
+    max_closures = kernel_data.max_closures;
+  }
+
+  /* Reset closures once at the start, we will be accumulating the closures
+   * for all volumes in the stack into a single array of closures. */
+  sd->num_closure = 0;
+  sd->num_closure_left = max_closures;
+  sd->flag = 0;
+  sd->object_flag = 0;
+
+  for (int i = 0;; i++) {
+    const VolumeStack entry = stack_read(i);
+    if (entry.shader == SHADER_NONE) {
+      break;
+    }
+
+    /* Set up shader-data from stack. It's mostly set up already in
+     * shader_setup_from_volume, this switching should be quick. */
+    sd->object = entry.object;
+    sd->lamp = LAMP_NONE;
+    sd->shader = entry.shader;
+
+    sd->flag &= ~SD_SHADER_FLAGS;
+    sd->flag |= kernel_data_fetch(shaders, (sd->shader & SHADER_MASK)).flags;
+    sd->object_flag &= ~SD_OBJECT_FLAGS;
+
+    if (sd->object != OBJECT_NONE) {
+      sd->object_flag |= kernel_data_fetch(object_flag, sd->object);
+
+#  ifdef __OBJECT_MOTION__
+      /* TODO: this is inefficient for motion blur, we should be
+       * caching matrices instead of recomputing them each step. */
+      shader_setup_object_transforms(kg, sd, sd->time);
+
+      volume_shader_motion_blur(kg, sd);
+#  endif
+    }
+
+    /* Evaluate shader. */
+#  ifdef __OSL__
+    if (kg->osl) {
+      OSLShader::eval_volume(kg, state, sd, path_flag);
+    }
+    else
+#  endif
+    {
+#  ifdef __SVM__
+      svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
+          kg, state, sd, NULL, path_flag);
+#  endif
+    }
+
+    /* Merge closures to avoid exceeding number of closures limit. */
+    if (!shadow) {
+      if (i > 0) {
+        volume_shader_merge_closures(sd);
+      }
+    }
+  }
+}
+
+#endif /* __VOLUME__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/volume_stack.h b/intern/cycles/kernel/integrator/volume_stack.h
index 5256349a0cc..675e1927fc0 100644
--- a/intern/cycles/kernel/integrator/volume_stack.h
+++ b/intern/cycles/kernel/integrator/volume_stack.h
@@ -39,7 +39,7 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
         break;
       }
 
-      if (entry.object == sd->object) {
+      if (entry.object == sd->object && entry.shader == sd->shader) {
         /* Shift back next stack entries. */
         do {
           entry = stack_read(i + 1);
@@ -61,7 +61,7 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
       }
 
       /* Already in the stack? then we have nothing to do. */
-      if (entry.object == sd->object) {
+      if (entry.object == sd->object && entry.shader == sd->shader) {
         return;
       }
     }
@@ -133,7 +133,7 @@ ccl_device float volume_stack_step_size(KernelGlobals kg, StackReadOp stack_read
       break;
     }
 
-    int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+    int shader_flag = kernel_data_fetch(shaders, (entry.shader & SHADER_MASK)).flags;
 
     bool heterogeneous = false;
 
@@ -146,7 +146,7 @@ ccl_device float volume_stack_step_size(KernelGlobals kg, StackReadOp stack_read
        * heterogeneous volume objects may be using the same shader. */
       int object = entry.object;
       if (object != OBJECT_NONE) {
-        int object_flag = kernel_tex_fetch(__object_flag, object);
+        int object_flag = kernel_data_fetch(object_flag, object);
         if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
           heterogeneous = true;
         }
@@ -180,7 +180,7 @@ ccl_device VolumeSampleMethod volume_stack_sample_method(KernelGlobals kg, Integ
       break;
     }
 
-    int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+    int shader_flag = kernel_data_fetch(shaders, (entry.shader & SHADER_MASK)).flags;
 
     if (shader_flag & SD_VOLUME_MIS) {
       /* Multiple importance sampling. */
diff --git a/intern/cycles/kernel/light/background.h b/intern/cycles/kernel/light/background.h
index 0cbf7fb76fe..951620ff1cb 100644
--- a/intern/cycles/kernel/light/background.h
+++ b/intern/cycles/kernel/light/background.h
@@ -9,8 +9,6 @@ CCL_NAMESPACE_BEGIN
 
 /* Background Light */
 
-#ifdef __BACKGROUND_MIS__
-
 ccl_device float3 background_map_sample(KernelGlobals kg,
                                         float randu,
                                         float randv,
@@ -31,7 +29,7 @@ ccl_device float3 background_map_sample(KernelGlobals kg,
     int step = count >> 1;
     int middle = first + step;
 
-    if (kernel_tex_fetch(__light_background_marginal_cdf, middle).y < randv) {
+    if (kernel_data_fetch(light_background_marginal_cdf, middle).y < randv) {
       first = middle + 1;
       count -= step + 1;
     }
@@ -42,9 +40,9 @@ ccl_device float3 background_map_sample(KernelGlobals kg,
   int index_v = max(0, first - 1);
   kernel_assert(index_v >= 0 && index_v < res_y);
 
-  float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
-  float2 cdf_next_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v + 1);
-  float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
+  float2 cdf_v = kernel_data_fetch(light_background_marginal_cdf, index_v);
+  float2 cdf_next_v = kernel_data_fetch(light_background_marginal_cdf, index_v + 1);
+  float2 cdf_last_v = kernel_data_fetch(light_background_marginal_cdf, res_y);
 
   /* importance-sampled V direction */
   float dv = inverse_lerp(cdf_v.y, cdf_next_v.y, randv);
@@ -57,7 +55,7 @@ ccl_device float3 background_map_sample(KernelGlobals kg,
     int step = count >> 1;
     int middle = first + step;
 
-    if (kernel_tex_fetch(__light_background_conditional_cdf, index_v * cdf_width + middle).y <
+    if (kernel_data_fetch(light_background_conditional_cdf, index_v * cdf_width + middle).y <
         randu) {
       first = middle + 1;
       count -= step + 1;
@@ -69,12 +67,12 @@ ccl_device float3 background_map_sample(KernelGlobals kg,
   int index_u = max(0, first - 1);
   kernel_assert(index_u >= 0 && index_u < res_x);
 
-  float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                  index_v * cdf_width + index_u);
-  float2 cdf_next_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                       index_v * cdf_width + index_u + 1);
-  float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                       index_v * cdf_width + res_x);
+  float2 cdf_u = kernel_data_fetch(light_background_conditional_cdf,
+                                   index_v * cdf_width + index_u);
+  float2 cdf_next_u = kernel_data_fetch(light_background_conditional_cdf,
+                                        index_v * cdf_width + index_u + 1);
+  float2 cdf_last_u = kernel_data_fetch(light_background_conditional_cdf,
+                                        index_v * cdf_width + res_x);
 
   /* importance-sampled U direction */
   float du = inverse_lerp(cdf_u.y, cdf_next_u.y, randu);
@@ -112,9 +110,9 @@ ccl_device float background_map_pdf(KernelGlobals kg, float3 direction)
   int index_v = clamp(float_to_int(uv.y * res_y), 0, res_y - 1);
 
   /* pdfs in V direction */
-  float2 cdf_last_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                       index_v * cdf_width + res_x);
-  float2 cdf_last_v = kernel_tex_fetch(__light_background_marginal_cdf, res_y);
+  float2 cdf_last_u = kernel_data_fetch(light_background_conditional_cdf,
+                                        index_v * cdf_width + res_x);
+  float2 cdf_last_v = kernel_data_fetch(light_background_marginal_cdf, res_y);
 
   float denom = (M_2PI_F * M_PI_F * sin_theta) * cdf_last_u.x * cdf_last_v.x;
 
@@ -122,9 +120,9 @@ ccl_device float background_map_pdf(KernelGlobals kg, float3 direction)
     return 0.0f;
 
   /* pdfs in U direction */
-  float2 cdf_u = kernel_tex_fetch(__light_background_conditional_cdf,
-                                  index_v * cdf_width + index_u);
-  float2 cdf_v = kernel_tex_fetch(__light_background_marginal_cdf, index_v);
+  float2 cdf_u = kernel_data_fetch(light_background_conditional_cdf,
+                                   index_v * cdf_width + index_u);
+  float2 cdf_v = kernel_data_fetch(light_background_marginal_cdf, index_v);
 
   return (cdf_u.x * cdf_v.x) / denom;
 }
@@ -133,7 +131,7 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(
     KernelGlobals kg, float3 P, int index, ccl_private float3 *lightpos, ccl_private float3 *dir)
 {
   int portal = kernel_data.background.portal_offset + index;
-  const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+  const ccl_global KernelLight *klight = &kernel_data_fetch(lights, portal);
 
   *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]);
   *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
@@ -166,7 +164,7 @@ ccl_device_inline float background_portal_pdf(
     num_possible++;
 
     int portal = kernel_data.background.portal_offset + p;
-    const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, portal);
     float3 axisu = make_float3(
         klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
     float3 axisv = make_float3(
@@ -242,7 +240,7 @@ ccl_device float3 background_portal_sample(KernelGlobals kg,
     if (portal == 0) {
       /* p is the portal to be sampled. */
       int portal = kernel_data.background.portal_offset + p;
-      const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
+      const ccl_global KernelLight *klight = &kernel_data_fetch(lights, portal);
       float3 axisu = make_float3(
           klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
       float3 axisv = make_float3(
@@ -435,6 +433,4 @@ ccl_device float background_light_pdf(KernelGlobals kg, float3 P, float3 directi
   return pdf * kernel_data.integrator.pdf_lights;
 }
 
-#endif
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/light/light.h b/intern/cycles/kernel/light/light.h
index 1df1615ed99..12a6f21b58d 100644
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -38,7 +38,7 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
                                     const uint32_t path_flag,
                                     ccl_private LightSample *ls)
 {
-  const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+  const ccl_global KernelLight *klight = &kernel_data_fetch(lights, lamp);
   if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
     if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
       return false;
@@ -86,7 +86,6 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
     ls->pdf = invarea / (costheta * costheta * costheta);
     ls->eval_fac = ls->pdf;
   }
-#ifdef __BACKGROUND_MIS__
   else if (type == LIGHT_BACKGROUND) {
     /* infinite area light (e.g. light dome or env light) */
     float3 D = -background_light_sample(kg, P, randu, randv, &ls->pdf);
@@ -97,7 +96,6 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
     ls->t = FLT_MAX;
     ls->eval_fac = 1.0f;
   }
-#endif
   else {
     ls->P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
 
@@ -202,8 +200,12 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
         inplane = ls->P - inplane;
       }
 
-      ls->u = dot(inplane, axisu) * (1.0f / dot(axisu, axisu)) + 0.5f;
-      ls->v = dot(inplane, axisv) * (1.0f / dot(axisv, axisv)) + 0.5f;
+      const float light_u = dot(inplane, axisu) * (1.0f / dot(axisu, axisu));
+      const float light_v = dot(inplane, axisv) * (1.0f / dot(axisv, axisv));
+
+      /* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */
+      ls->u = light_v + 0.5f;
+      ls->v = -light_u - light_v;
 
       ls->Ng = Ng;
       ls->D = normalize_len(ls->P - P, &ls->t);
@@ -237,7 +239,7 @@ ccl_device bool lights_intersect(KernelGlobals kg,
                                  const uint32_t path_flag)
 {
   for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
-    const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, lamp);
 
     if (path_flag & PATH_RAY_CAMERA) {
       if (klight->shader_id & SHADER_EXCLUDE_CAMERA) {
@@ -270,31 +272,26 @@ ccl_device bool lights_intersect(KernelGlobals kg,
 
     if (type == LIGHT_SPOT) {
       /* Spot/Disk light. */
-      const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
-      const float3 ray_P = ray->P - ray->D * mis_ray_t;
-
       const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
       const float radius = klight->spot.radius;
       if (radius == 0.0f) {
         continue;
       }
       /* disk oriented normal */
-      const float3 lightN = normalize(ray_P - lightP);
+      const float3 lightN = normalize(ray->P - lightP);
       /* One sided. */
       if (dot(ray->D, lightN) >= 0.0f) {
         continue;
       }
 
       float3 P;
-      if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lightN, radius, &P, &t)) {
+      if (!ray_disk_intersect(
+              ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) {
         continue;
       }
     }
     else if (type == LIGHT_POINT) {
       /* Sphere light (aka, aligned disk light). */
-      const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
-      const float3 ray_P = ray->P - ray->D * mis_ray_t;
-
       const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
       const float radius = klight->spot.radius;
       if (radius == 0.0f) {
@@ -302,9 +299,10 @@ ccl_device bool lights_intersect(KernelGlobals kg,
       }
 
       /* disk oriented normal */
-      const float3 lightN = normalize(ray_P - lightP);
+      const float3 lightN = normalize(ray->P - lightP);
       float3 P;
-      if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lightN, radius, &P, &t)) {
+      if (!ray_disk_intersect(
+              ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) {
         continue;
       }
     }
@@ -330,8 +328,19 @@ ccl_device bool lights_intersect(KernelGlobals kg,
       const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
 
       float3 P;
-      if (!ray_quad_intersect(
-              ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) {
+      if (!ray_quad_intersect(ray->P,
+                              ray->D,
+                              ray->tmin,
+                              ray->tmax,
+                              light_P,
+                              axisu,
+                              axisv,
+                              Ng,
+                              &P,
+                              &t,
+                              &u,
+                              &v,
+                              is_round)) {
         continue;
       }
     }
@@ -358,7 +367,7 @@ ccl_device bool light_sample_from_distant_ray(KernelGlobals kg,
                                               const int lamp,
                                               ccl_private LightSample *ccl_restrict ls)
 {
-  ccl_global const KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+  ccl_global const KernelLight *klight = &kernel_data_fetch(lights, lamp);
   const int shader = klight->shader_id;
   const float radius = klight->distant.radius;
   const LightType type = (LightType)klight->type;
@@ -433,7 +442,7 @@ ccl_device bool light_sample_from_intersection(KernelGlobals kg,
                                                ccl_private LightSample *ccl_restrict ls)
 {
   const int lamp = isect->prim;
-  ccl_global const KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+  ccl_global const KernelLight *klight = &kernel_data_fetch(lights, lamp);
   LightType type = (LightType)klight->type;
   ls->type = type;
   ls->shader = klight->shader_id;
@@ -562,7 +571,7 @@ ccl_device_inline bool triangle_world_space_vertices(
     KernelGlobals kg, int object, int prim, float time, float3 V[3])
 {
   bool has_motion = false;
-  const int object_flag = kernel_tex_fetch(__object_flag, object);
+  const int object_flag = kernel_data_fetch(object_flag, object);
 
   if (object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) {
     motion_triangle_vertices(kg, object, prim, time, V);
@@ -699,12 +708,12 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg,
   float area = 0.5f * Nl;
 
   /* flip normal if necessary */
-  const int object_flag = kernel_tex_fetch(__object_flag, object);
+  const int object_flag = kernel_data_fetch(object_flag, object);
   if (object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
     ls->Ng = -ls->Ng;
   }
   ls->eval_fac = 1.0f;
-  ls->shader = kernel_tex_fetch(__tri_shader, prim);
+  ls->shader = kernel_data_fetch(tri_shader, prim);
   ls->object = object;
   ls->prim = prim;
   ls->lamp = LAMP_NONE;
@@ -775,7 +784,8 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg,
     ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);
 
     /* calculate intersection with the planar triangle */
-    if (!ray_triangle_intersect(P, ls->D, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
+    if (!ray_triangle_intersect(
+            P, ls->D, 0.0f, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
       ls->pdf = 0.0f;
       return;
     }
@@ -845,7 +855,7 @@ ccl_device int light_distribution_sample(KernelGlobals kg, ccl_private float *ra
     int half_len = len >> 1;
     int middle = first + half_len;
 
-    if (r < kernel_tex_fetch(__light_distribution, middle).totarea) {
+    if (r < kernel_data_fetch(light_distribution, middle).totarea) {
       len = half_len;
     }
     else {
@@ -860,8 +870,8 @@ ccl_device int light_distribution_sample(KernelGlobals kg, ccl_private float *ra
 
   /* Rescale to reuse random number. this helps the 2D samples within
    * each area light be stratified as well. */
-  float distr_min = kernel_tex_fetch(__light_distribution, index).totarea;
-  float distr_max = kernel_tex_fetch(__light_distribution, index + 1).totarea;
+  float distr_min = kernel_data_fetch(light_distribution, index).totarea;
+  float distr_max = kernel_data_fetch(light_distribution, index + 1).totarea;
   *randu = (r - distr_min) / (distr_max - distr_min);
 
   return index;
@@ -871,7 +881,7 @@ ccl_device int light_distribution_sample(KernelGlobals kg, ccl_private float *ra
 
 ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals kg, int index, int bounce)
 {
-  return (bounce > kernel_tex_fetch(__lights, index).max_bounces);
+  return (bounce > kernel_data_fetch(lights, index).max_bounces);
 }
 
 template<bool in_volume_segment>
@@ -886,8 +896,8 @@ ccl_device_noinline bool light_distribution_sample(KernelGlobals kg,
 {
   /* Sample light index from distribution. */
   const int index = light_distribution_sample(kg, &randu);
-  ccl_global const KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution,
-                                                                              index);
+  ccl_global const KernelLightDistribution *kdistribution = &kernel_data_fetch(light_distribution,
+                                                                               index);
   const int prim = kdistribution->prim;
 
   if (prim >= 0) {
@@ -896,7 +906,7 @@ ccl_device_noinline bool light_distribution_sample(KernelGlobals kg,
 
     /* Exclude synthetic meshes from shadow catcher pass. */
     if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
-        !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) {
+        !(kernel_data_fetch(object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) {
       return false;
     }
 
diff --git a/intern/cycles/kernel/light/sample.h b/intern/cycles/kernel/light/sample.h
index 9bbbd5b0d10..e0d4f221bef 100644
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
+#include "kernel/integrator/surface_shader.h"
 
 #include "kernel/light/light.h"
 
@@ -14,7 +14,7 @@
 CCL_NAMESPACE_BEGIN
 
 /* Evaluate shader on light. */
-ccl_device_noinline_cpu float3
+ccl_device_noinline_cpu Spectrum
 light_sample_shader_eval(KernelGlobals kg,
                          IntegratorState state,
                          ccl_private ShaderData *ccl_restrict emission_sd,
@@ -22,24 +22,21 @@ light_sample_shader_eval(KernelGlobals kg,
                          float time)
 {
   /* setup shading at emitter */
-  float3 eval = zero_float3();
+  Spectrum eval = zero_spectrum();
 
-  if (shader_constant_emission_eval(kg, ls->shader, &eval)) {
+  if (surface_shader_constant_emission(kg, ls->shader, &eval)) {
     if ((ls->prim != PRIM_NONE) && dot(ls->Ng, ls->D) > 0.0f) {
       ls->Ng = -ls->Ng;
     }
   }
   else {
-    /* Setup shader data and call shader_eval_surface once, better
+    /* Setup shader data and call surface_shader_eval once, better
      * for GPU coherence and compile times. */
     PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
-#ifdef __BACKGROUND_MIS__
     if (ls->type == LIGHT_BACKGROUND) {
       shader_setup_from_background(kg, emission_sd, ls->P, ls->D, time);
     }
-    else
-#endif
-    {
+    else {
       shader_setup_from_sample(kg,
                                emission_sd,
                                ls->P,
@@ -63,26 +60,24 @@ light_sample_shader_eval(KernelGlobals kg,
 
     /* No proper path flag, we're evaluating this for all closures. that's
      * weak but we'd have to do multiple evaluations otherwise. */
-    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+    surface_shader_eval<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
         kg, state, emission_sd, NULL, PATH_RAY_EMISSION);
 
     /* Evaluate closures. */
-#ifdef __BACKGROUND_MIS__
     if (ls->type == LIGHT_BACKGROUND) {
-      eval = shader_background_eval(emission_sd);
+      eval = surface_shader_background(emission_sd);
     }
-    else
-#endif
-    {
-      eval = shader_emissive_eval(emission_sd);
+    else {
+      eval = surface_shader_emission(emission_sd);
     }
   }
 
   eval *= ls->eval_fac;
 
   if (ls->lamp != LAMP_NONE) {
-    ccl_global const KernelLight *klight = &kernel_tex_fetch(__lights, ls->lamp);
-    eval *= make_float3(klight->strength[0], klight->strength[1], klight->strength[2]);
+    ccl_global const KernelLight *klight = &kernel_data_fetch(lights, ls->lamp);
+    eval *= rgb_to_spectrum(
+        make_float3(klight->strength[0], klight->strength[1], klight->strength[2]));
   }
 
   return eval;
@@ -106,7 +101,7 @@ ccl_device_inline bool light_sample_terminate(KernelGlobals kg,
   }
 
   if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-    float probability = max3(fabs(bsdf_eval_sum(eval))) *
+    float probability = reduce_max(fabs(bsdf_eval_sum(eval))) *
                         kernel_data.integrator.light_inv_rr_threshold;
     if (probability < 1.0f) {
       if (rand_terminate >= probability) {
@@ -137,8 +132,9 @@ ccl_device_inline float3 shadow_ray_smooth_surface_offset(
     triangle_vertices_and_normals(kg, sd->prim, V, N);
   }
 
-  const float u = sd->u, v = sd->v;
-  const float w = 1 - u - v;
+  const float u = 1.0f - sd->u - sd->v;
+  const float v = sd->u;
+  const float w = sd->v;
   float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
   float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
 
@@ -187,7 +183,7 @@ ccl_device_inline float3 shadow_ray_offset(KernelGlobals kg,
 
   if ((sd->type & PRIMITIVE_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
     const float offset_cutoff =
-        kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
+        kernel_data_fetch(objects, sd->object).shadow_terminator_geometry_offset;
     /* Do ray offset (heavy stuff) only for close to be terminated triangles:
      * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also
      * make a smooth transition near the threshold. */
@@ -227,23 +223,24 @@ ccl_device_inline void shadow_ray_setup(ccl_private const ShaderData *ccl_restri
   if (ls->shader & SHADER_CAST_SHADOW) {
     /* setup ray */
     ray->P = P;
+    ray->tmin = 0.0f;
 
     if (ls->t == FLT_MAX) {
       /* distant light */
       ray->D = ls->D;
-      ray->t = ls->t;
+      ray->tmax = ls->t;
     }
     else {
       /* other lights, avoid self-intersection */
       ray->D = ls->P - P;
-      ray->D = normalize_len(ray->D, &ray->t);
+      ray->D = normalize_len(ray->D, &ray->tmax);
     }
   }
   else {
     /* signal to not cast shadow ray */
     ray->P = zero_float3();
     ray->D = zero_float3();
-    ray->t = 0.0f;
+    ray->tmax = 0.0f;
   }
 
   ray->dP = differential_make_compact(sd->dP);
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 7570490be7c..5075e4e1528 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -10,21 +10,18 @@ set(INC_SYS
 )
 
 set(SRC
-  background.cpp
-  bsdf_diffuse_ramp.cpp
-  bsdf_phong_ramp.cpp
-  emissive.cpp
-  bssrdf.cpp
   closures.cpp
+  globals.cpp
   services.cpp
-  shader.cpp
 )
 
 set(HEADER_SRC
-  closures.h
+  closures_setup.h
+  closures_template.h
   globals.h
+  osl.h
   services.h
-  shader.h
+  types.h
 )
 
 set(LIB
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
deleted file mode 100644
index 865ff4ddc6d..00000000000
--- a/intern/cycles/kernel/osl/background.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- *
- * Adapted from Open Shading Language
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011-2022 Blender Foundation. */
-
-#include <OpenImageIO/fmath.h>
-
-#include <OSL/genclosure.h>
-
-#include "kernel/osl/closures.h"
-
-// clang-format off
-#include "kernel/device/cpu/compat.h"
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/emissive.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-using namespace OSL;
-
-/// Generic background closure
-///
-/// We only have a background closure for the shaders
-/// to return a color in background shaders. No methods,
-/// only the weight is taking into account
-///
-class GenericBackgroundClosure : public CClosurePrimitive {
- public:
-  void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight)
-  {
-    background_setup(sd, weight);
-  }
-};
-
-/// Holdout closure
-///
-/// This will be used by the shader to mark the
-/// amount of holdout for the current shading
-/// point. No parameters, only the weight will be
-/// used
-///
-class HoldoutClosure : CClosurePrimitive {
- public:
-  void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight)
-  {
-    closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, weight);
-    sd->flag |= SD_HOLDOUT;
-  }
-};
-
-ClosureParam *closure_background_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_STRING_KEYPARAM(GenericBackgroundClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(GenericBackgroundClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_background_prepare, GenericBackgroundClosure)
-
-ClosureParam *closure_holdout_params()
-{
-  static ClosureParam params[] = {CLOSURE_FINISH_PARAM(HoldoutClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_holdout_prepare, HoldoutClosure)
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
deleted file mode 100644
index 39fcee1ac0d..00000000000
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- *
- * Adapted from Open Shading Language
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011-2022 Blender Foundation. */
-
-#include <OpenImageIO/fmath.h>
-
-#include <OSL/genclosure.h>
-
-#include "kernel/device/cpu/compat.h"
-#include "kernel/osl/closures.h"
-
-// clang-format off
-#include "kernel/types.h"
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/bsdf_diffuse_ramp.h"
-#include "kernel/closure/bsdf_util.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-using namespace OSL;
-
-class DiffuseRampClosure : public CBSDFClosure {
- public:
-  DiffuseRampBsdf params;
-  Color3 colors[8];
-
-  void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    DiffuseRampBsdf *bsdf = (DiffuseRampBsdf *)bsdf_alloc_osl(
-        sd, sizeof(DiffuseRampBsdf), weight, &params);
-
-    if (bsdf) {
-      bsdf->colors = (float3 *)closure_alloc_extra(sd, sizeof(float3) * 8);
-
-      if (bsdf->colors) {
-        for (int i = 0; i < 8; i++)
-          bsdf->colors[i] = TO_FLOAT3(colors[i]);
-
-        sd->flag |= bsdf_diffuse_ramp_setup(bsdf);
-      }
-    }
-  }
-};
-
-ClosureParam *closure_bsdf_diffuse_ramp_params()
-{
-  static ClosureParam params[] = {CLOSURE_FLOAT3_PARAM(DiffuseRampClosure, params.N),
-                                  CLOSURE_COLOR_ARRAY_PARAM(DiffuseRampClosure, colors, 8),
-                                  CLOSURE_STRING_KEYPARAM(DiffuseRampClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(DiffuseRampClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_bsdf_diffuse_ramp_prepare, DiffuseRampClosure)
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
deleted file mode 100644
index 972ed7e4a6d..00000000000
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- *
- * Adapted from Open Shading Language
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011-2022 Blender Foundation. */
-
-#include <OpenImageIO/fmath.h>
-
-#include <OSL/genclosure.h>
-
-#include "kernel/device/cpu/compat.h"
-#include "kernel/osl/closures.h"
-
-// clang-format off
-#include "kernel/types.h"
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/bsdf_phong_ramp.h"
-#include "kernel/closure/bsdf_util.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-using namespace OSL;
-
-class PhongRampClosure : public CBSDFClosure {
- public:
-  PhongRampBsdf params;
-  Color3 colors[8];
-
-  void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    PhongRampBsdf *bsdf = (PhongRampBsdf *)bsdf_alloc_osl(
-        sd, sizeof(PhongRampBsdf), weight, &params);
-
-    if (bsdf) {
-      bsdf->colors = (float3 *)closure_alloc_extra(sd, sizeof(float3) * 8);
-
-      if (bsdf->colors) {
-        for (int i = 0; i < 8; i++)
-          bsdf->colors[i] = TO_FLOAT3(colors[i]);
-
-        sd->flag |= bsdf_phong_ramp_setup(bsdf);
-      }
-    }
-  }
-};
-
-ClosureParam *closure_bsdf_phong_ramp_params()
-{
-  static ClosureParam params[] = {CLOSURE_FLOAT3_PARAM(PhongRampClosure, params.N),
-                                  CLOSURE_FLOAT_PARAM(PhongRampClosure, params.exponent),
-                                  CLOSURE_COLOR_ARRAY_PARAM(PhongRampClosure, colors, 8),
-                                  CLOSURE_STRING_KEYPARAM(PhongRampClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(PhongRampClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_bsdf_phong_ramp_prepare, PhongRampClosure)
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/bssrdf.cpp b/intern/cycles/kernel/osl/bssrdf.cpp
deleted file mode 100644
index 4b282fddad3..00000000000
--- a/intern/cycles/kernel/osl/bssrdf.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- *
- * Adapted from Open Shading Language
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011-2022 Blender Foundation. */
-
-#include <OSL/genclosure.h>
-
-#include "kernel/device/cpu/compat.h"
-#include "kernel/osl/closures.h"
-
-// clang-format off
-#include "kernel/types.h"
-
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/bsdf_util.h"
-#include "kernel/closure/bsdf_diffuse.h"
-#include "kernel/closure/bsdf_principled_diffuse.h"
-#include "kernel/closure/bssrdf.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-using namespace OSL;
-
-static ustring u_burley("burley");
-static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
-static ustring u_random_walk("random_walk");
-
-class CBSSRDFClosure : public CClosurePrimitive {
- public:
-  Bssrdf params;
-  float ior;
-  ustring method;
-
-  CBSSRDFClosure()
-  {
-    params.roughness = FLT_MAX;
-    params.anisotropy = 1.0f;
-    ior = 1.4f;
-  }
-
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    if (method == u_burley) {
-      alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
-    }
-    else if (method == u_random_walk_fixed_radius) {
-      alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
-    }
-    else if (method == u_random_walk) {
-      alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID);
-    }
-  }
-
-  void alloc(ShaderData *sd, uint32_t path_flag, float3 weight, ClosureType type)
-  {
-    Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
-
-    if (bssrdf) {
-      /* disable in case of diffuse ancestor, can't see it well then and
-       * adds considerably noise due to probabilities of continuing path
-       * getting lower and lower */
-      if (path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
-        params.radius = make_float3(0.0f, 0.0f, 0.0f);
-      }
-
-      /* create one closure per color channel */
-      bssrdf->radius = params.radius;
-      bssrdf->albedo = params.albedo;
-      bssrdf->N = params.N;
-      bssrdf->roughness = params.roughness;
-      bssrdf->anisotropy = clamp(params.anisotropy, 0.0f, 0.9f);
-      sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, clamp(ior, 1.01f, 3.8f));
-    }
-  }
-};
-
-ClosureParam *closure_bssrdf_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_STRING_PARAM(CBSSRDFClosure, method),
-      CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N),
-      CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius),
-      CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo),
-      CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"),
-      CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, ior, "ior"),
-      CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.anisotropy, "anisotropy"),
-      CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(CBSSRDFClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_bssrdf_prepare, CBSSRDFClosure)
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/closures.cpp b/intern/cycles/kernel/osl/closures.cpp
index 7c6b48154e4..d56e0551a91 100644
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -9,997 +9,304 @@
 #include <OSL/genclosure.h>
 #include <OSL/oslclosure.h>
 
-#include "kernel/osl/closures.h"
-#include "kernel/osl/shader.h"
+#include "kernel/types.h"
+
+#include "kernel/osl/globals.h"
+#include "kernel/osl/services.h"
 
 #include "util/math.h"
 #include "util/param.h"
 
-// clang-format off
 #include "kernel/device/cpu/compat.h"
 #include "kernel/device/cpu/globals.h"
 
-#include "kernel/types.h"
-
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/bsdf_util.h"
-#include "kernel/closure/bsdf_ashikhmin_velvet.h"
-#include "kernel/closure/bsdf_diffuse.h"
-#include "kernel/closure/bsdf_microfacet.h"
-#include "kernel/closure/bsdf_microfacet_multi.h"
-#include "kernel/closure/bsdf_oren_nayar.h"
-#include "kernel/closure/bsdf_reflection.h"
-#include "kernel/closure/bsdf_refraction.h"
-#include "kernel/closure/bsdf_transparent.h"
-#include "kernel/closure/bsdf_ashikhmin_shirley.h"
-#include "kernel/closure/bsdf_toon.h"
-#include "kernel/closure/bsdf_hair.h"
-#include "kernel/closure/bsdf_hair_principled.h"
-#include "kernel/closure/bsdf_principled_diffuse.h"
-#include "kernel/closure/bsdf_principled_sheen.h"
-#include "kernel/closure/volume.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-using namespace OSL;
-
-/* BSDF class definitions */
-
-BSDF_CLOSURE_CLASS_BEGIN(Diffuse, diffuse, DiffuseBsdf, LABEL_DIFFUSE)
-  BSDF_CLOSURE_FLOAT3_PARAM(DiffuseClosure, params.N)
-BSDF_CLOSURE_CLASS_END(Diffuse, diffuse)
-
-BSDF_CLOSURE_CLASS_BEGIN(Translucent, translucent, DiffuseBsdf, LABEL_DIFFUSE)
-  BSDF_CLOSURE_FLOAT3_PARAM(TranslucentClosure, params.N)
-BSDF_CLOSURE_CLASS_END(Translucent, translucent)
-
-BSDF_CLOSURE_CLASS_BEGIN(OrenNayar, oren_nayar, OrenNayarBsdf, LABEL_DIFFUSE)
-  BSDF_CLOSURE_FLOAT3_PARAM(OrenNayarClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(OrenNayarClosure, params.roughness)
-BSDF_CLOSURE_CLASS_END(OrenNayar, oren_nayar)
-
-BSDF_CLOSURE_CLASS_BEGIN(Reflection, reflection, MicrofacetBsdf, LABEL_SINGULAR)
-  BSDF_CLOSURE_FLOAT3_PARAM(ReflectionClosure, params.N)
-BSDF_CLOSURE_CLASS_END(Reflection, reflection)
-
-BSDF_CLOSURE_CLASS_BEGIN(Refraction, refraction, MicrofacetBsdf, LABEL_SINGULAR)
-  BSDF_CLOSURE_FLOAT3_PARAM(RefractionClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(RefractionClosure, params.ior)
-BSDF_CLOSURE_CLASS_END(Refraction, refraction)
-
-BSDF_CLOSURE_CLASS_BEGIN(AshikhminVelvet, ashikhmin_velvet, VelvetBsdf, LABEL_DIFFUSE)
-  BSDF_CLOSURE_FLOAT3_PARAM(AshikhminVelvetClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(AshikhminVelvetClosure, params.sigma)
-BSDF_CLOSURE_CLASS_END(AshikhminVelvet, ashikhmin_velvet)
-
-BSDF_CLOSURE_CLASS_BEGIN(AshikhminShirley,
-                         ashikhmin_shirley,
-                         MicrofacetBsdf,
-                         LABEL_GLOSSY | LABEL_REFLECT)
-  BSDF_CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, params.N)
-  BSDF_CLOSURE_FLOAT3_PARAM(AshikhminShirleyClosure, params.T)
-  BSDF_CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, params.alpha_x)
-  BSDF_CLOSURE_FLOAT_PARAM(AshikhminShirleyClosure, params.alpha_y)
-BSDF_CLOSURE_CLASS_END(AshikhminShirley, ashikhmin_shirley)
-
-BSDF_CLOSURE_CLASS_BEGIN(DiffuseToon, diffuse_toon, ToonBsdf, LABEL_DIFFUSE)
-  BSDF_CLOSURE_FLOAT3_PARAM(DiffuseToonClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(DiffuseToonClosure, params.size)
-  BSDF_CLOSURE_FLOAT_PARAM(DiffuseToonClosure, params.smooth)
-BSDF_CLOSURE_CLASS_END(DiffuseToon, diffuse_toon)
-
-BSDF_CLOSURE_CLASS_BEGIN(GlossyToon, glossy_toon, ToonBsdf, LABEL_GLOSSY)
-  BSDF_CLOSURE_FLOAT3_PARAM(GlossyToonClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(GlossyToonClosure, params.size)
-  BSDF_CLOSURE_FLOAT_PARAM(GlossyToonClosure, params.smooth)
-BSDF_CLOSURE_CLASS_END(GlossyToon, glossy_toon)
-
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXIsotropic,
-                         microfacet_ggx_isotropic,
-                         MicrofacetBsdf,
-                         LABEL_GLOSSY | LABEL_REFLECT)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetGGXIsotropicClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetGGXIsotropicClosure, params.alpha_x)
-BSDF_CLOSURE_CLASS_END(MicrofacetGGXIsotropic, microfacet_ggx_isotropic)
-
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGX,
-                         microfacet_ggx,
-                         MicrofacetBsdf,
-                         LABEL_GLOSSY | LABEL_REFLECT)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, params.N)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetGGXClosure, params.T)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, params.alpha_x)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetGGXClosure, params.alpha_y)
-BSDF_CLOSURE_CLASS_END(MicrofacetGGX, microfacet_ggx)
-
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannIsotropic,
-                         microfacet_beckmann_isotropic,
-                         MicrofacetBsdf,
-                         LABEL_GLOSSY | LABEL_REFLECT)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannIsotropicClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetBeckmannIsotropicClosure, params.alpha_x)
-BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannIsotropic, microfacet_beckmann_isotropic)
-
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmann,
-                         microfacet_beckmann,
-                         MicrofacetBsdf,
-                         LABEL_GLOSSY | LABEL_REFLECT)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, params.N)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannClosure, params.T)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, params.alpha_x)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetBeckmannClosure, params.alpha_y)
-BSDF_CLOSURE_CLASS_END(MicrofacetBeckmann, microfacet_beckmann)
-
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetGGXRefraction,
-                         microfacet_ggx_refraction,
-                         MicrofacetBsdf,
-                         LABEL_GLOSSY | LABEL_TRANSMIT)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetGGXRefractionClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, params.alpha_x)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetGGXRefractionClosure, params.ior)
-BSDF_CLOSURE_CLASS_END(MicrofacetGGXRefraction, microfacet_ggx_refraction)
-
-BSDF_CLOSURE_CLASS_BEGIN(MicrofacetBeckmannRefraction,
-                         microfacet_beckmann_refraction,
-                         MicrofacetBsdf,
-                         LABEL_GLOSSY | LABEL_TRANSMIT)
-  BSDF_CLOSURE_FLOAT3_PARAM(MicrofacetBeckmannRefractionClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, params.alpha_x)
-  BSDF_CLOSURE_FLOAT_PARAM(MicrofacetBeckmannRefractionClosure, params.ior)
-BSDF_CLOSURE_CLASS_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
-
-BSDF_CLOSURE_CLASS_BEGIN(HairReflection, hair_reflection, HairBsdf, LABEL_GLOSSY)
-  BSDF_CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness1)
-  BSDF_CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.roughness2)
-  BSDF_CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T)
-  BSDF_CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.offset)
-BSDF_CLOSURE_CLASS_END(HairReflection, hair_reflection)
-
-BSDF_CLOSURE_CLASS_BEGIN(HairTransmission, hair_transmission, HairBsdf, LABEL_GLOSSY)
-  BSDF_CLOSURE_FLOAT3_PARAM(HairTransmissionClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness1)
-  BSDF_CLOSURE_FLOAT_PARAM(HairTransmissionClosure, params.roughness2)
-  BSDF_CLOSURE_FLOAT3_PARAM(HairReflectionClosure, params.T)
-  BSDF_CLOSURE_FLOAT_PARAM(HairReflectionClosure, params.offset)
-BSDF_CLOSURE_CLASS_END(HairTransmission, hair_transmission)
-
-BSDF_CLOSURE_CLASS_BEGIN(PrincipledDiffuse,
-                         principled_diffuse,
-                         PrincipledDiffuseBsdf,
-                         LABEL_DIFFUSE)
-  BSDF_CLOSURE_FLOAT3_PARAM(PrincipledDiffuseClosure, params.N)
-  BSDF_CLOSURE_FLOAT_PARAM(PrincipledDiffuseClosure, params.roughness)
-BSDF_CLOSURE_CLASS_END(PrincipledDiffuse, principled_diffuse)
-
-class PrincipledSheenClosure : public CBSDFClosure {
- public:
-  PrincipledSheenBsdf params;
-
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    if (!skip(sd, path_flag, LABEL_DIFFUSE)) {
-      params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-      PrincipledSheenBsdf *bsdf = (PrincipledSheenBsdf *)bsdf_alloc_osl(
-          sd, sizeof(PrincipledSheenBsdf), weight, &params);
-      sd->flag |= (bsdf) ? bsdf_principled_sheen_setup(sd, bsdf) : 0;
-    }
-  }
-};
-
-static ClosureParam *bsdf_principled_sheen_params()
-{
-  static ClosureParam params[] = {CLOSURE_FLOAT3_PARAM(PrincipledSheenClosure, params.N),
-                                  CLOSURE_STRING_KEYPARAM(PrincipledSheenClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(PrincipledSheenClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE_STATIC(closure_bsdf_principled_sheen_prepare, PrincipledSheenClosure)
-
-/* PRINCIPLED HAIR BSDF */
-class PrincipledHairClosure : public CBSDFClosure {
- public:
-  PrincipledHairBSDF params;
-
-  PrincipledHairBSDF *alloc(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)bsdf_alloc_osl(
-        sd, sizeof(PrincipledHairBSDF), weight, &params);
-    if (!bsdf) {
-      return NULL;
-    }
-
-    PrincipledHairExtra *extra = (PrincipledHairExtra *)closure_alloc_extra(
-        sd, sizeof(PrincipledHairExtra));
-    if (!extra) {
-      return NULL;
-    }
-
-    bsdf->extra = extra;
-    return bsdf;
-  }
+#include "kernel/geom/object.h"
+#include "kernel/util/differential.h"
 
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    if (!skip(sd, path_flag, LABEL_GLOSSY)) {
-      params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+#include "kernel/osl/osl.h"
 
-      PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)alloc(sd, path_flag, weight);
-      if (!bsdf) {
-        return;
-      }
-
-      sd->flag |= (bsdf) ? bsdf_principled_hair_setup(sd, bsdf) : 0;
-    }
-  }
-};
-
-static ClosureParam *closure_bsdf_principled_hair_params()
-{
-  static ClosureParam params[] = {CLOSURE_FLOAT3_PARAM(PrincipledHairClosure, params.N),
-                                  CLOSURE_FLOAT3_PARAM(PrincipledHairClosure, params.sigma),
-                                  CLOSURE_FLOAT_PARAM(PrincipledHairClosure, params.v),
-                                  CLOSURE_FLOAT_PARAM(PrincipledHairClosure, params.s),
-                                  CLOSURE_FLOAT_PARAM(PrincipledHairClosure, params.m0_roughness),
-                                  CLOSURE_FLOAT_PARAM(PrincipledHairClosure, params.alpha),
-                                  CLOSURE_FLOAT_PARAM(PrincipledHairClosure, params.eta),
-                                  CLOSURE_STRING_KEYPARAM(PrincipledHairClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(PrincipledHairClosure)};
-
-  return params;
-}
+#include "kernel/osl/closures_setup.h"
 
-CCLOSURE_PREPARE(closure_bsdf_principled_hair_prepare, PrincipledHairClosure)
+#define TO_VEC3(v) OSL::Vec3((v).x, (v).y, (v).z) /* x/y/z vector -> OSL::Vec3 */
+#define TO_FLOAT3(v) make_float3((v)[0], (v)[1], (v)[2]) /* indexable 3-vector -> float3 */
 
-/* DISNEY PRINCIPLED CLEARCOAT */
-class PrincipledClearcoatClosure : public CBSDFClosure {
- public:
-  MicrofacetBsdf params;
-  float clearcoat, clearcoat_roughness;
-
-  MicrofacetBsdf *alloc(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
-        sd, sizeof(MicrofacetBsdf), weight, &params);
-    if (!bsdf) {
-      return NULL;
-    }
-
-    MicrofacetExtra *extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
-    if (!extra) {
-      return NULL;
-    }
-
-    bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->extra = extra;
-    bsdf->ior = 1.5f;
-    bsdf->alpha_x = clearcoat_roughness;
-    bsdf->alpha_y = clearcoat_roughness;
-    bsdf->extra->color = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
-    bsdf->extra->clearcoat = clearcoat;
-    return bsdf;
-  }
+CCL_NAMESPACE_BEGIN
 
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
+/* Registration */
 
-    sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+  static OSL::ClosureParam *osl_closure_##lower##_params() \
+  { \
+    static OSL::ClosureParam params[] = {
+#define OSL_CLOSURE_STRUCT_END(Upper, lower) \
+  CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), CLOSURE_FINISH_PARAM(Upper##Closure) \
+  } \
+  ; \
+  return params; \
   }
-};
+#define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) \
+  CLOSURE_##TYPE##_KEYPARAM(Upper##Closure, name, key),
+#define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) \
+  CLOSURE_##TYPE##_ARRAY_PARAM(Upper##Closure, name, size),
 
-ClosureParam *closure_bsdf_principled_clearcoat_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(PrincipledClearcoatClosure, params.N),
-      CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat),
-      CLOSURE_FLOAT_PARAM(PrincipledClearcoatClosure, clearcoat_roughness),
-      CLOSURE_STRING_KEYPARAM(PrincipledClearcoatClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(PrincipledClearcoatClosure)};
-  return params;
-}
-CCLOSURE_PREPARE(closure_bsdf_principled_clearcoat_prepare, PrincipledClearcoatClosure)
-
-/* Registration */
+#include "closures_template.h"
 
-static void register_closure(OSL::ShadingSystem *ss,
-                             const char *name,
-                             int id,
-                             OSL::ClosureParam *params,
-                             OSL::PrepareClosureFunc prepare)
+void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
 {
-  /* optimization: it's possible to not use a prepare function at all and
-   * only initialize the actual class when accessing the closure component
-   * data, but then we need to map the id to the class somehow */
-#if OSL_LIBRARY_VERSION_CODE >= 10900
-  ss->register_closure(name, id, params, prepare, NULL);
-#else
-  ss->register_closure(name, id, params, prepare, NULL, 16);
-#endif
-}
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+  ss->register_closure( \
+      #lower, OSL_CLOSURE_##Upper##_ID, osl_closure_##lower##_params(), nullptr, nullptr);
 
-void OSLShader::register_closures(OSLShadingSystem *ss_)
-{
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)ss_;
-  int id = 0;
-
-  register_closure(ss, "diffuse", id++, bsdf_diffuse_params(), bsdf_diffuse_prepare);
-  register_closure(ss, "oren_nayar", id++, bsdf_oren_nayar_params(), bsdf_oren_nayar_prepare);
-  register_closure(ss, "translucent", id++, bsdf_translucent_params(), bsdf_translucent_prepare);
-  register_closure(ss, "reflection", id++, bsdf_reflection_params(), bsdf_reflection_prepare);
-  register_closure(ss, "refraction", id++, bsdf_refraction_params(), bsdf_refraction_prepare);
-  register_closure(ss,
-                   "transparent",
-                   id++,
-                   closure_bsdf_transparent_params(),
-                   closure_bsdf_transparent_prepare);
-
-  register_closure(
-      ss, "microfacet", id++, closure_bsdf_microfacet_params(), closure_bsdf_microfacet_prepare);
-  register_closure(ss,
-                   "microfacet_ggx",
-                   id++,
-                   bsdf_microfacet_ggx_isotropic_params(),
-                   bsdf_microfacet_ggx_isotropic_prepare);
-  register_closure(
-      ss, "microfacet_ggx_aniso", id++, bsdf_microfacet_ggx_params(), bsdf_microfacet_ggx_prepare);
-  register_closure(ss,
-                   "microfacet_ggx_refraction",
-                   id++,
-                   bsdf_microfacet_ggx_refraction_params(),
-                   bsdf_microfacet_ggx_refraction_prepare);
-  register_closure(ss,
-                   "microfacet_multi_ggx",
-                   id++,
-                   closure_bsdf_microfacet_multi_ggx_params(),
-                   closure_bsdf_microfacet_multi_ggx_prepare);
-  register_closure(ss,
-                   "microfacet_multi_ggx_glass",
-                   id++,
-                   closure_bsdf_microfacet_multi_ggx_glass_params(),
-                   closure_bsdf_microfacet_multi_ggx_glass_prepare);
-  register_closure(ss,
-                   "microfacet_multi_ggx_aniso",
-                   id++,
-                   closure_bsdf_microfacet_multi_ggx_aniso_params(),
-                   closure_bsdf_microfacet_multi_ggx_aniso_prepare);
-  register_closure(ss,
-                   "microfacet_ggx_fresnel",
-                   id++,
-                   closure_bsdf_microfacet_ggx_fresnel_params(),
-                   closure_bsdf_microfacet_ggx_fresnel_prepare);
-  register_closure(ss,
-                   "microfacet_ggx_aniso_fresnel",
-                   id++,
-                   closure_bsdf_microfacet_ggx_aniso_fresnel_params(),
-                   closure_bsdf_microfacet_ggx_aniso_fresnel_prepare);
-  register_closure(ss,
-                   "microfacet_multi_ggx_fresnel",
-                   id++,
-                   closure_bsdf_microfacet_multi_ggx_fresnel_params(),
-                   closure_bsdf_microfacet_multi_ggx_fresnel_prepare);
-  register_closure(ss,
-                   "microfacet_multi_ggx_glass_fresnel",
-                   id++,
-                   closure_bsdf_microfacet_multi_ggx_glass_fresnel_params(),
-                   closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare);
-  register_closure(ss,
-                   "microfacet_multi_ggx_aniso_fresnel",
-                   id++,
-                   closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params(),
-                   closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare);
-  register_closure(ss,
-                   "microfacet_beckmann",
-                   id++,
-                   bsdf_microfacet_beckmann_isotropic_params(),
-                   bsdf_microfacet_beckmann_isotropic_prepare);
-  register_closure(ss,
-                   "microfacet_beckmann_aniso",
-                   id++,
-                   bsdf_microfacet_beckmann_params(),
-                   bsdf_microfacet_beckmann_prepare);
-  register_closure(ss,
-                   "microfacet_beckmann_refraction",
-                   id++,
-                   bsdf_microfacet_beckmann_refraction_params(),
-                   bsdf_microfacet_beckmann_refraction_prepare);
-  register_closure(ss,
-                   "ashikhmin_shirley",
-                   id++,
-                   bsdf_ashikhmin_shirley_params(),
-                   bsdf_ashikhmin_shirley_prepare);
-  register_closure(
-      ss, "ashikhmin_velvet", id++, bsdf_ashikhmin_velvet_params(), bsdf_ashikhmin_velvet_prepare);
-  register_closure(
-      ss, "diffuse_toon", id++, bsdf_diffuse_toon_params(), bsdf_diffuse_toon_prepare);
-  register_closure(ss, "glossy_toon", id++, bsdf_glossy_toon_params(), bsdf_glossy_toon_prepare);
-  register_closure(ss,
-                   "principled_diffuse",
-                   id++,
-                   bsdf_principled_diffuse_params(),
-                   bsdf_principled_diffuse_prepare);
-  register_closure(ss,
-                   "principled_sheen",
-                   id++,
-                   bsdf_principled_sheen_params(),
-                   closure_bsdf_principled_sheen_prepare);
-  register_closure(ss,
-                   "principled_clearcoat",
-                   id++,
-                   closure_bsdf_principled_clearcoat_params(),
-                   closure_bsdf_principled_clearcoat_prepare);
-
-  register_closure(ss, "emission", id++, closure_emission_params(), closure_emission_prepare);
-  register_closure(
-      ss, "background", id++, closure_background_params(), closure_background_prepare);
-  register_closure(ss, "holdout", id++, closure_holdout_params(), closure_holdout_prepare);
-  register_closure(ss,
-                   "diffuse_ramp",
-                   id++,
-                   closure_bsdf_diffuse_ramp_params(),
-                   closure_bsdf_diffuse_ramp_prepare);
-  register_closure(
-      ss, "phong_ramp", id++, closure_bsdf_phong_ramp_params(), closure_bsdf_phong_ramp_prepare);
-  register_closure(ss, "bssrdf", id++, closure_bssrdf_params(), closure_bssrdf_prepare);
-
-  register_closure(
-      ss, "hair_reflection", id++, bsdf_hair_reflection_params(), bsdf_hair_reflection_prepare);
-  register_closure(ss,
-                   "hair_transmission",
-                   id++,
-                   bsdf_hair_transmission_params(),
-                   bsdf_hair_transmission_prepare);
-
-  register_closure(ss,
-                   "principled_hair",
-                   id++,
-                   closure_bsdf_principled_hair_params(),
-                   closure_bsdf_principled_hair_prepare);
-
-  register_closure(ss,
-                   "henyey_greenstein",
-                   id++,
-                   closure_henyey_greenstein_params(),
-                   closure_henyey_greenstein_prepare);
-  register_closure(
-      ss, "absorption", id++, closure_absorption_params(), closure_absorption_prepare);
+#include "closures_template.h"
 }
 
-/* BSDF Closure */
+/* Globals */
 
-bool CBSDFClosure::skip(const ShaderData *sd, uint32_t path_flag, int scattering)
+static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
+                                        ShaderData *sd,
+                                        const void *state,
+                                        uint32_t path_flag,
+                                        OSLThreadData *tdata)
 {
-  /* caustic options */
-  if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
-    const KernelGlobalsCPU *kg = sd->osl_globals;
-
-    if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
-        (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
-      return true;
-    }
+  OSL::ShaderGlobals *globals = &tdata->globals;
+
+  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+  const differential3 dI = differential_from_compact(sd->I, sd->dI);
+
+  /* copy from shader data to shader globals */
+  globals->P = TO_VEC3(sd->P);
+  globals->dPdx = TO_VEC3(dP.dx);
+  globals->dPdy = TO_VEC3(dP.dy);
+  globals->I = TO_VEC3(sd->I);
+  globals->dIdx = TO_VEC3(dI.dx);
+  globals->dIdy = TO_VEC3(dI.dy);
+  globals->N = TO_VEC3(sd->N);
+  globals->Ng = TO_VEC3(sd->Ng);
+  globals->u = sd->u;
+  globals->dudx = sd->du.dx;
+  globals->dudy = sd->du.dy;
+  globals->v = sd->v;
+  globals->dvdx = sd->dv.dx;
+  globals->dvdy = sd->dv.dy;
+  globals->dPdu = TO_VEC3(sd->dPdu);
+  globals->dPdv = TO_VEC3(sd->dPdv);
+  globals->surfacearea = 1.0f;
+  globals->time = sd->time;
+
+  /* ray type flags and orientation booleans */
+  globals->raytype = path_flag;
+  globals->flipHandedness = 0;
+  globals->backfacing = (sd->flag & SD_BACKFACING);
+
+  /* shader data to be used in services callbacks */
+  globals->renderstate = sd;
+
+  /* Hack: pass sd in place of the transforms; services fetch the actual object matrix. */
+  globals->shader2common = sd;
+  globals->object2common = sd;
+
+  /* must be set to NULL before execute */
+  globals->Ci = NULL;
+
+  /* clear trace data */
+  tdata->tracedata.init = false;
+
+  /* Used by render-services. */
+  sd->osl_globals = kg;
+  if (path_flag & PATH_RAY_SHADOW) {
+    sd->osl_path_state = nullptr;
+    sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
   }
-
-  return false;
-}
-
-/* Standard Microfacet Closure */
-
-class MicrofacetClosure : public CBSDFClosure {
- public:
-  MicrofacetBsdf params;
-  ustring distribution;
-  int refract;
-
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    static ustring u_ggx("ggx");
-    static ustring u_default("default");
-
-    const int label = (refract) ? LABEL_TRANSMIT : LABEL_REFLECT;
-    if (skip(sd, path_flag, LABEL_GLOSSY | label)) {
-      return;
-    }
-
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
-        sd, sizeof(MicrofacetBsdf), weight, &params);
-
-    if (!bsdf) {
-      return;
-    }
-
-    /* GGX */
-    if (distribution == u_ggx || distribution == u_default) {
-      if (!refract) {
-        if (params.alpha_x == params.alpha_y) {
-          /* Isotropic */
-          sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
-        }
-        else {
-          /* Anisotropic */
-          sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
-        }
-      }
-      else {
-        sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
-      }
-    }
-    /* Beckmann */
-    else {
-      if (!refract) {
-        if (params.alpha_x == params.alpha_y) {
-          /* Isotropic */
-          sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
-        }
-        else {
-          /* Anisotropic */
-          sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
-        }
-      }
-      else {
-        sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
-      }
-    }
+  else {
+    sd->osl_path_state = (const IntegratorStateCPU *)state;
+    sd->osl_shadow_path_state = nullptr;
   }
-};
-
-ClosureParam *closure_bsdf_microfacet_params()
-{
-  static ClosureParam params[] = {CLOSURE_STRING_PARAM(MicrofacetClosure, distribution),
-                                  CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.N),
-                                  CLOSURE_FLOAT3_PARAM(MicrofacetClosure, params.T),
-                                  CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_x),
-                                  CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.alpha_y),
-                                  CLOSURE_FLOAT_PARAM(MicrofacetClosure, params.ior),
-                                  CLOSURE_INT_PARAM(MicrofacetClosure, refract),
-                                  CLOSURE_STRING_KEYPARAM(MicrofacetClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(MicrofacetClosure)};
-
-  return params;
 }
-CCLOSURE_PREPARE(closure_bsdf_microfacet_prepare, MicrofacetClosure)
-
-/* GGX closures with Fresnel */
-
-class MicrofacetFresnelClosure : public CBSDFClosure {
- public:
-  MicrofacetBsdf params;
-  float3 color;
-  float3 cspec0;
-
-  MicrofacetBsdf *alloc(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    /* Technically, the MultiGGX Glass closure may also transmit. However,
-     * since this is set statically and only used for caustic flags, this
-     * is probably as good as it gets. */
-    if (skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
-      return NULL;
-    }
 
-    MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
-        sd, sizeof(MicrofacetBsdf), weight, &params);
-    if (!bsdf) {
-      return NULL;
+static void flatten_closure_tree(const KernelGlobalsCPU *kg,
+                                 ShaderData *sd,
+                                 uint32_t path_flag,
+                                 const OSL::ClosureColor *closure,
+                                 float3 weight = make_float3(1.0f, 1.0f, 1.0f))
+{
+  /* OSL gives us a closure tree; we flatten it into arrays per
+   * closure type for evaluation, sampling, etc. later on. */
+
+  switch (closure->id) {
+    case OSL::ClosureColor::MUL: {
+      OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
+      flatten_closure_tree(kg, sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
+      break;
     }
-
-    MicrofacetExtra *extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
-    if (!extra) {
-      return NULL;
+    case OSL::ClosureColor::ADD: {
+      OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
+      flatten_closure_tree(kg, sd, path_flag, add->closureA, weight);
+      flatten_closure_tree(kg, sd, path_flag, add->closureB, weight);
+      break;
     }
-
-    bsdf->extra = extra;
-    bsdf->extra->color = color;
-    bsdf->extra->cspec0 = cspec0;
-    bsdf->extra->clearcoat = 0.0f;
-    return bsdf;
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+  case OSL_CLOSURE_##Upper##_ID: { \
+    const OSL::ClosureComponent *comp = reinterpret_cast<const OSL::ClosureComponent *>(closure); \
+    weight *= TO_FLOAT3(comp->w); \
+    osl_closure_##lower##_setup( \
+        kg, sd, path_flag, weight, reinterpret_cast<const Upper##Closure *>(comp + 1)); \
+    break; \
   }
-};
-
-class MicrofacetGGXFresnelClosure : public MicrofacetFresnelClosure {
- public:
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
-
-    bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->alpha_y = bsdf->alpha_x;
-    sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
+#include "closures_template.h"
+    default:
+      break;
   }
-};
-
-ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
-      CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
-      CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
-      CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
-      CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
-      CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure)};
-  return params;
 }
-CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_fresnel_prepare, MicrofacetGGXFresnelClosure);
 
-class MicrofacetGGXAnisoFresnelClosure : public MicrofacetFresnelClosure {
- public:
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
+/* Surface */
 
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
-
-    sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
-  }
-};
-
-ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params()
+void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
+                             const void *state,
+                             ShaderData *sd,
+                             uint32_t path_flag)
 {
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.N),
-      CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, params.T),
-      CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_x),
-      CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.alpha_y),
-      CLOSURE_FLOAT_PARAM(MicrofacetGGXFresnelClosure, params.ior),
-      CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, color),
-      CLOSURE_FLOAT3_PARAM(MicrofacetGGXFresnelClosure, cspec0),
-      CLOSURE_STRING_KEYPARAM(MicrofacetGGXFresnelClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetGGXFresnelClosure)};
-  return params;
-}
-CCLOSURE_PREPARE(closure_bsdf_microfacet_ggx_aniso_fresnel_prepare,
-                 MicrofacetGGXAnisoFresnelClosure);
-
-/* Multiscattering GGX closures */
-
-class MicrofacetMultiClosure : public CBSDFClosure {
- public:
-  MicrofacetBsdf params;
-  float3 color;
-
-  MicrofacetBsdf *alloc(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    /* Technically, the MultiGGX closure may also transmit. However,
-     * since this is set statically and only used for caustic flags, this
-     * is probably as good as it gets. */
-    if (skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
-      return NULL;
+  /* setup shader globals from shader data */
+  OSLThreadData *tdata = kg->osl_tdata;
+  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
+
+  /* execute shader for this point */
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
+  OSL::ShaderGlobals *globals = &tdata->globals;
+  OSL::ShadingContext *octx = tdata->context;
+  int shader = sd->shader & SHADER_MASK;
+
+  /* automatic bump shader */
+  if (kg->osl->bump_state[shader]) {
+    /* save state */
+    const float3 P = sd->P;
+    const float dP = sd->dP;
+    const OSL::Vec3 dPdx = globals->dPdx;
+    const OSL::Vec3 dPdy = globals->dPdy;
+
+    /* set state as if undisplaced: the attribute holds P, dP.dx and dP.dy (3 floats each) */
+    if (sd->flag & SD_HAS_DISPLACEMENT) {
+      float data[9];
+      bool found = kg->osl->services->get_attribute(sd,
+                                                    true,
+                                                    OSLRenderServices::u_empty,
+                                                    TypeDesc::TypeVector,
+                                                    OSLRenderServices::u_geom_undisplaced,
+                                                    data);
+      (void)found;
+      assert(found);
+
+      differential3 tmp_dP;
+      memcpy(&sd->P, data, sizeof(float) * 3);
+      memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
+      memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
+
+      object_position_transform(kg, sd, &sd->P);
+      object_dir_transform(kg, sd, &tmp_dP.dx);
+      object_dir_transform(kg, sd, &tmp_dP.dy);
+
+      sd->dP = differential_make_compact(tmp_dP);
+
+      globals->P = TO_VEC3(sd->P);
+      globals->dPdx = TO_VEC3(tmp_dP.dx);
+      globals->dPdy = TO_VEC3(tmp_dP.dy);
     }
 
-    MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
-        sd, sizeof(MicrofacetBsdf), weight, &params);
-    if (!bsdf) {
-      return NULL;
-    }
+    /* execute bump shader */
+    ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
 
-    MicrofacetExtra *extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
-    if (!extra) {
-      return NULL;
-    }
+    /* reset state */
+    sd->P = P;
+    sd->dP = dP;
 
-    bsdf->extra = extra;
-    bsdf->extra->color = color;
-    bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->extra->clearcoat = 0.0f;
-    return bsdf;
+    globals->P = TO_VEC3(P);
+    globals->dPdx = TO_VEC3(dPdx);
+    globals->dPdy = TO_VEC3(dPdy);
   }
-};
 
-class MicrofacetMultiGGXClosure : public MicrofacetMultiClosure {
- public:
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
-
-    bsdf->ior = 0.0f;
-    bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->alpha_y = bsdf->alpha_x;
-    sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
+  /* surface shader */
+  if (kg->osl->surface_state[shader]) {
+    ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
   }
-};
-
-ClosureParam *closure_bsdf_microfacet_multi_ggx_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.N),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_x),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, color),
-      CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetMultiGGXClosure)};
-  return params;
-}
-CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_prepare, MicrofacetMultiGGXClosure);
 
-class MicrofacetMultiGGXAnisoClosure : public MicrofacetMultiClosure {
- public:
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
-
-    bsdf->ior = 0.0f;
-    sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
+  /* flatten closure tree */
+  if (globals->Ci) {
+    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
   }
-};
-
-ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.N),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.T),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_x),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_y),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, color),
-      CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetMultiGGXClosure)};
-  return params;
 }
-CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_prepare, MicrofacetMultiGGXAnisoClosure);
-
-class MicrofacetMultiGGXGlassClosure : public MicrofacetMultiClosure {
- public:
-  MicrofacetMultiGGXGlassClosure() : MicrofacetMultiClosure()
-  {
-  }
-
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
 
-    bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->alpha_y = bsdf->alpha_x;
-    sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
-  }
-};
+/* Background */
 
-ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params()
+void OSLShader::eval_background(const KernelGlobalsCPU *kg,
+                                const void *state,
+                                ShaderData *sd,
+                                uint32_t path_flag)
 {
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, params.N),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.alpha_x),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXClosure, params.ior),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXClosure, color),
-      CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetMultiGGXClosure)};
-  return params;
-}
-CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_prepare, MicrofacetMultiGGXGlassClosure);
-
-/* Multiscattering GGX closures with Fresnel */
-
-class MicrofacetMultiFresnelClosure : public CBSDFClosure {
- public:
-  MicrofacetBsdf params;
-  float3 color;
-  float3 cspec0;
-
-  MicrofacetBsdf *alloc(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    /* Technically, the MultiGGX closure may also transmit. However,
-     * since this is set statically and only used for caustic flags, this
-     * is probably as good as it gets. */
-    if (skip(sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
-      return NULL;
-    }
-
-    MicrofacetBsdf *bsdf = (MicrofacetBsdf *)bsdf_alloc_osl(
-        sd, sizeof(MicrofacetBsdf), weight, &params);
-    if (!bsdf) {
-      return NULL;
-    }
+  /* setup shader globals from shader data */
+  OSLThreadData *tdata = kg->osl_tdata;
+  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
 
-    MicrofacetExtra *extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
-    if (!extra) {
-      return NULL;
-    }
+  /* execute shader for this point */
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
+  OSL::ShaderGlobals *globals = &tdata->globals;
+  OSL::ShadingContext *octx = tdata->context;
 
-    bsdf->extra = extra;
-    bsdf->extra->color = color;
-    bsdf->extra->cspec0 = cspec0;
-    bsdf->extra->clearcoat = 0.0f;
-    return bsdf;
+  if (kg->osl->background_state) {
+    ss->execute(octx, *(kg->osl->background_state), *globals);
   }
-};
-
-class MicrofacetMultiGGXFresnelClosure : public MicrofacetMultiFresnelClosure {
- public:
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
 
-    bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->alpha_y = bsdf->alpha_x;
-    sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd);
+  /* flatten closure tree */
+  if (globals->Ci) {
+    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
   }
-};
-
-ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
-      CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)};
-  return params;
 }
-CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_fresnel_prepare,
-                 MicrofacetMultiGGXFresnelClosure);
-
-class MicrofacetMultiGGXAnisoFresnelClosure : public MicrofacetMultiFresnelClosure {
- public:
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
 
-    sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd);
-  }
-};
+/* Volume */
 
-ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params()
+void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
+                            const void *state,
+                            ShaderData *sd,
+                            uint32_t path_flag)
 {
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.T),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_y),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
-      CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)};
-  return params;
-}
-CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare,
-                 MicrofacetMultiGGXAnisoFresnelClosure);
-
-class MicrofacetMultiGGXGlassFresnelClosure : public MicrofacetMultiFresnelClosure {
- public:
-  MicrofacetMultiGGXGlassFresnelClosure() : MicrofacetMultiFresnelClosure()
-  {
+  /* setup shader globals from shader data */
+  OSLThreadData *tdata = kg->osl_tdata;
+  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
+
+  /* execute shader */
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
+  OSL::ShaderGlobals *globals = &tdata->globals;
+  OSL::ShadingContext *octx = tdata->context;
+  int shader = sd->shader & SHADER_MASK;
+
+  if (kg->osl->volume_state[shader]) {
+    ss->execute(octx, *(kg->osl->volume_state[shader]), *globals);
   }
 
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N);
-
-    MicrofacetBsdf *bsdf = alloc(sd, path_flag, weight);
-    if (!bsdf) {
-      return;
-    }
-
-    bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
-    bsdf->alpha_y = bsdf->alpha_x;
-    sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
+  /* flatten closure tree */
+  if (globals->Ci) {
+    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
   }
-};
-
-ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, params.N),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.alpha_x),
-      CLOSURE_FLOAT_PARAM(MicrofacetMultiGGXFresnelClosure, params.ior),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, color),
-      CLOSURE_FLOAT3_PARAM(MicrofacetMultiGGXFresnelClosure, cspec0),
-      CLOSURE_STRING_KEYPARAM(MicrofacetMultiGGXFresnelClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(MicrofacetMultiGGXFresnelClosure)};
-  return params;
 }
-CCLOSURE_PREPARE(closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare,
-                 MicrofacetMultiGGXGlassFresnelClosure);
 
-/* Transparent */
+/* Displacement */
 
-class TransparentClosure : public CBSDFClosure {
- public:
-  ShaderClosure params;
-  float3 unused;
-
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    bsdf_transparent_setup(sd, weight, path_flag);
-  }
-};
-
-ClosureParam *closure_bsdf_transparent_params()
+void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
 {
-  static ClosureParam params[] = {CLOSURE_STRING_KEYPARAM(TransparentClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(TransparentClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_bsdf_transparent_prepare, TransparentClosure)
-
-/* Volume */
-
-class VolumeAbsorptionClosure : public CBSDFClosure {
- public:
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    volume_extinction_setup(sd, weight);
-  }
-};
-
-ClosureParam *closure_absorption_params()
-{
-  static ClosureParam params[] = {CLOSURE_STRING_KEYPARAM(VolumeAbsorptionClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(VolumeAbsorptionClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_absorption_prepare, VolumeAbsorptionClosure)
-
-class VolumeHenyeyGreensteinClosure : public CBSDFClosure {
- public:
-  HenyeyGreensteinVolume params;
-
-  void setup(ShaderData *sd, uint32_t path_flag, float3 weight)
-  {
-    volume_extinction_setup(sd, weight);
-
-    HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume *)bsdf_alloc_osl(
-        sd, sizeof(HenyeyGreensteinVolume), weight, &params);
-    if (!volume) {
-      return;
-    }
-
-    sd->flag |= volume_henyey_greenstein_setup(volume);
+  /* setup shader globals from shader data */
+  OSLThreadData *tdata = kg->osl_tdata;
+  shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
+
+  /* execute shader */
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
+  OSL::ShaderGlobals *globals = &tdata->globals;
+  OSL::ShadingContext *octx = tdata->context;
+  int shader = sd->shader & SHADER_MASK;
+
+  if (kg->osl->displacement_state[shader]) {
+    ss->execute(octx, *(kg->osl->displacement_state[shader]), *globals);
   }
-};
 
-ClosureParam *closure_henyey_greenstein_params()
-{
-  static ClosureParam params[] = {
-      CLOSURE_FLOAT_PARAM(VolumeHenyeyGreensteinClosure, params.g),
-      CLOSURE_STRING_KEYPARAM(VolumeHenyeyGreensteinClosure, label, "label"),
-      CLOSURE_FINISH_PARAM(VolumeHenyeyGreensteinClosure)};
-  return params;
+  /* copy the position from the shader globals back into sd */
+  sd->P = TO_FLOAT3(globals->P);
 }
 
-CCLOSURE_PREPARE(closure_henyey_greenstein_prepare, VolumeHenyeyGreensteinClosure)
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/closures.h b/intern/cycles/kernel/osl/closures.h
deleted file mode 100644
index e10a3d88a04..00000000000
--- a/intern/cycles/kernel/osl/closures.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- *
- * Adapted from Open Shading Language
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011-2022 Blender Foundation. */
-
-#ifndef __OSL_CLOSURES_H__
-#define __OSL_CLOSURES_H__
-
-#include "kernel/types.h"
-#include "util/types.h"
-
-#include <OSL/genclosure.h>
-#include <OSL/oslclosure.h>
-#include <OSL/oslexec.h>
-
-CCL_NAMESPACE_BEGIN
-
-OSL::ClosureParam *closure_emission_params();
-OSL::ClosureParam *closure_background_params();
-OSL::ClosureParam *closure_holdout_params();
-OSL::ClosureParam *closure_bsdf_diffuse_ramp_params();
-OSL::ClosureParam *closure_bsdf_phong_ramp_params();
-OSL::ClosureParam *closure_bsdf_transparent_params();
-OSL::ClosureParam *closure_bssrdf_params();
-OSL::ClosureParam *closure_absorption_params();
-OSL::ClosureParam *closure_henyey_greenstein_params();
-OSL::ClosureParam *closure_bsdf_microfacet_params();
-OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_params();
-OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_params();
-OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_params();
-OSL::ClosureParam *closure_bsdf_microfacet_ggx_fresnel_params();
-OSL::ClosureParam *closure_bsdf_microfacet_ggx_aniso_fresnel_params();
-OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_fresnel_params();
-OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_glass_fresnel_params();
-OSL::ClosureParam *closure_bsdf_microfacet_multi_ggx_aniso_fresnel_params();
-OSL::ClosureParam *closure_bsdf_principled_clearcoat_params();
-
-void closure_emission_prepare(OSL::RendererServices *, int id, void *data);
-void closure_background_prepare(OSL::RendererServices *, int id, void *data);
-void closure_holdout_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_diffuse_ramp_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_phong_ramp_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_transparent_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bssrdf_prepare(OSL::RendererServices *, int id, void *data);
-void closure_absorption_prepare(OSL::RendererServices *, int id, void *data);
-void closure_henyey_greenstein_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_microfacet_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_microfacet_multi_ggx_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_microfacet_multi_ggx_glass_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_microfacet_multi_ggx_aniso_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_microfacet_ggx_fresnel_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_microfacet_ggx_aniso_fresnel_prepare(OSL::RendererServices *,
-                                                       int id,
-                                                       void *data);
-void closure_bsdf_microfacet_multi_ggx_fresnel_prepare(OSL::RendererServices *,
-                                                       int id,
-                                                       void *data);
-void closure_bsdf_microfacet_multi_ggx_glass_fresnel_prepare(OSL::RendererServices *,
-                                                             int id,
-                                                             void *data);
-void closure_bsdf_microfacet_multi_ggx_aniso_fresnel_prepare(OSL::RendererServices *,
-                                                             int id,
-                                                             void *data);
-void closure_bsdf_principled_clearcoat_prepare(OSL::RendererServices *, int id, void *data);
-void closure_bsdf_principled_hair_prepare(OSL::RendererServices *, int id, void *data);
-
-#define CCLOSURE_PREPARE(name, classname) \
-  void name(RendererServices *, int id, void *data) \
-  { \
-    memset(data, 0, sizeof(classname)); \
-    new (data) classname(); \
-  }
-
-#define CCLOSURE_PREPARE_STATIC(name, classname) static CCLOSURE_PREPARE(name, classname)
-
-#define CLOSURE_FLOAT3_PARAM(st, fld) \
-  { \
-    TypeDesc::TypeVector, (int)reckless_offsetof(st, fld), NULL, sizeof(OSL::Vec3) \
-  }
-
-#define BSDF_CLOSURE_FLOAT_PARAM(st, fld) CLOSURE_FLOAT_PARAM(st, fld),
-#define BSDF_CLOSURE_FLOAT3_PARAM(st, fld) CLOSURE_FLOAT3_PARAM(st, fld),
-
-#define TO_VEC3(v) OSL::Vec3(v.x, v.y, v.z)
-#define TO_COLOR3(v) OSL::Color3(v.x, v.y, v.z)
-#define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])
-
-/* Closure */
-
-class CClosurePrimitive {
- public:
-  virtual void setup(ShaderData *sd, uint32_t path_flag, float3 weight) = 0;
-
-  OSL::ustring label;
-};
-
-/* BSDF */
-
-class CBSDFClosure : public CClosurePrimitive {
- public:
-  bool skip(const ShaderData *sd, uint32_t path_flag, int scattering);
-};
-
-#define BSDF_CLOSURE_CLASS_BEGIN(Upper, lower, structname, TYPE) \
-\
-  class Upper##Closure : public CBSDFClosure { \
-   public: \
-    structname params; \
-    float3 unused; \
-\
-    void setup(ShaderData *sd, uint32_t path_flag, float3 weight) \
-    { \
-      if (!skip(sd, path_flag, TYPE)) { \
-        params.N = ensure_valid_reflection(sd->Ng, sd->I, params.N); \
-        structname *bsdf = (structname *)bsdf_alloc_osl(sd, sizeof(structname), weight, &params); \
-        sd->flag |= (bsdf) ? bsdf_##lower##_setup(bsdf) : 0; \
-      } \
-    } \
-  }; \
-\
-  static ClosureParam *bsdf_##lower##_params() \
-  { \
-    static ClosureParam params[] = {
-
-/* parameters */
-
-#define BSDF_CLOSURE_CLASS_END(Upper, lower) \
-  CLOSURE_STRING_KEYPARAM(Upper##Closure, label, "label"), CLOSURE_FINISH_PARAM(Upper##Closure) \
-  } \
-  ; \
-  return params; \
-  } \
-\
-  CCLOSURE_PREPARE_STATIC(bsdf_##lower##_prepare, Upper##Closure)
-
-CCL_NAMESPACE_END
-
-#endif /* __OSL_CLOSURES_H__ */
diff --git a/intern/cycles/kernel/osl/closures_setup.h b/intern/cycles/kernel/osl/closures_setup.h
new file mode 100644
index 00000000000..f8d68444f90
--- /dev/null
+++ b/intern/cycles/kernel/osl/closures_setup.h
@@ -0,0 +1,1166 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */
+
+#pragma once
+
+// clang-format off
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_hair_principled.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
+#include "kernel/closure/volume.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
+#include "kernel/closure/bssrdf.h"
+#include "kernel/closure/emissive.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+  struct ccl_align(8) Upper##Closure \
+  { \
+    const char *label; /* first member of every generated closure struct */
+#define OSL_CLOSURE_STRUCT_END(Upper, lower) \
+  } \
+  ; \
+  ccl_device void osl_closure_##lower##_setup(KernelGlobals kg, \
+                                              ccl_private ShaderData *sd, \
+                                              uint32_t path_flag, \
+                                              float3 weight, \
+                                              ccl_private const Upper##Closure *closure);
+#define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) type name;
+#define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) type name[size];
+
+#include "closures_template.h" /* presumably expands the OSL_CLOSURE_STRUCT_* macros above */
+
+ccl_device_forceinline bool osl_closure_skip(KernelGlobals kg,
+                                             ccl_private const ShaderData *sd,
+                                             uint32_t path_flag,
+                                             int scattering)
+{
+  /* Caustic options: skip glossy closures on diffuse paths if their caustics are disabled. */
+  if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
+    if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
+        (!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
+      return true; /* caller should drop this closure */
+    }
+  }
+
+  return false; /* closure is kept */
+}
+
+/* Diffuse */
+
+ccl_device void osl_closure_diffuse_setup(KernelGlobals kg,
+                                          ccl_private ShaderData *sd,
+                                          uint32_t path_flag,
+                                          float3 weight,
+                                          ccl_private const DiffuseClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_DIFFUSE)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
+      sd, sizeof(DiffuseBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+
+  sd->flag |= bsdf_diffuse_setup(bsdf); /* merge the flags returned by the BSDF setup */
+}
+
+ccl_device void osl_closure_oren_nayar_setup(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             uint32_t path_flag,
+                                             float3 weight,
+                                             ccl_private const OrenNayarClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_DIFFUSE)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private OrenNayarBsdf *bsdf = (ccl_private OrenNayarBsdf *)bsdf_alloc(
+      sd, sizeof(OrenNayarBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->roughness = closure->roughness; /* copied as-is from the OSL closure */
+
+  sd->flag |= bsdf_oren_nayar_setup(bsdf); /* merge the flags returned by the BSDF setup */
+}
+
+ccl_device void osl_closure_translucent_setup(KernelGlobals kg,
+                                              ccl_private ShaderData *sd,
+                                              uint32_t path_flag,
+                                              float3 weight,
+                                              ccl_private const TranslucentClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_DIFFUSE)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
+      sd, sizeof(DiffuseBsdf), rgb_to_spectrum(weight)); /* stored as a DiffuseBsdf */
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+
+  sd->flag |= bsdf_translucent_setup(bsdf); /* merge the flags returned by the BSDF setup */
+}
+
+ccl_device void osl_closure_reflection_setup(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             uint32_t path_flag,
+                                             float3 weight,
+                                             ccl_private const ReflectionClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_SINGULAR)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight)); /* stored as a MicrofacetBsdf */
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+
+  sd->flag |= bsdf_reflection_setup(bsdf); /* only N was filled in from the closure */
+}
+
+ccl_device void osl_closure_refraction_setup(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             uint32_t path_flag,
+                                             float3 weight,
+                                             ccl_private const RefractionClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_SINGULAR)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight)); /* stored as a MicrofacetBsdf */
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->ior = closure->ior;
+
+  sd->flag |= bsdf_refraction_setup(bsdf); /* N and ior come from the closure */
+}
+
+ccl_device void osl_closure_transparent_setup(KernelGlobals kg,
+                                              ccl_private ShaderData *sd,
+                                              uint32_t path_flag,
+                                              float3 weight,
+                                              ccl_private const TransparentClosure *closure)
+{
+  bsdf_transparent_setup(sd, rgb_to_spectrum(weight), path_flag); /* kg and closure are unused */
+}
+
+/* Standard microfacet closures */
+
+ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             uint32_t path_flag,
+                                             float3 weight,
+                                             ccl_private const MicrofacetClosure *closure)
+{
+  const int label = (closure->refract) ? LABEL_TRANSMIT : LABEL_REFLECT;
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | label)) {
+    return; /* filtered out by the caustics options for this reflect/transmit label */
+  }
+
+  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->alpha_x = closure->alpha_x;
+  bsdf->alpha_y = closure->alpha_y;
+  bsdf->ior = closure->ior;
+  bsdf->T = closure->T;
+
+  static OSL::ustring u_ggx("ggx"); /* distribution names matched below */
+  static OSL::ustring u_default("default");
+
+  /* GGX, also used for the "default" distribution name */
+  if (closure->distribution == u_ggx || closure->distribution == u_default) {
+    if (!closure->refract) {
+      if (closure->alpha_x == closure->alpha_y) {
+        /* Isotropic */
+        sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
+      }
+      else {
+        /* Anisotropic */
+        sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
+      }
+    }
+    else {
+      sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+    }
+  }
+  /* Beckmann, for any other distribution name */
+  else {
+    if (!closure->refract) {
+      if (closure->alpha_x == closure->alpha_y) {
+        /* Isotropic */
+        sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
+      }
+      else {
+        /* Anisotropic */
+        sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
+      }
+    }
+    else {
+      sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+    }
+  }
+}
+
+ccl_device void osl_closure_microfacet_ggx_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetGGXIsotropicClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->alpha_x = closure->alpha_x; /* alpha_y: presumably set by the setup below, confirm */
+
+  sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
+}
+
+ccl_device void osl_closure_microfacet_ggx_aniso_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetGGXClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->alpha_x = closure->alpha_x;
+  bsdf->alpha_y = closure->alpha_y;
+  bsdf->T = closure->T;
+
+  sd->flag |= bsdf_microfacet_ggx_setup(bsdf); /* ior is not set for this closure */
+}
+
+ccl_device void osl_closure_microfacet_ggx_refraction_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetGGXRefractionClosure *closure)
+{
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_TRANSMIT)) {
+    return; /* filtered out by the caustics options */
+  }
+
+  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return; /* no closure was allocated, nothing to set up */
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->alpha_x = closure->alpha_x; /* alpha_y: presumably set by the setup below, confirm */
+  bsdf->ior = closure->ior;
+
+  sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+}
+
+/* GGX closures with Fresnel */
+
+ccl_device void osl_closure_microfacet_ggx_fresnel_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetGGXFresnelClosure *closure)
+{
+  /* Isotropic GGX reflection with Fresnel: skipped when osl_closure_skip() says so. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Isotropic lobe: both roughness values come from alpha_x and the tangent is zeroed. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = zero_float3();
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_x;
+  lobe->ior = closure->ior;
+  /* Fill the extra storage, then attach it; clearcoat is fixed at zero for this closure. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = rgb_to_spectrum(closure->cspec0);
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_ggx_fresnel_setup(lobe, sd);
+}
+
+ccl_device void osl_closure_microfacet_ggx_aniso_fresnel_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetGGXAnisoFresnelClosure *closure)
+{
+  /* Anisotropic GGX reflection with Fresnel: skipped when osl_closure_skip() says so. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Anisotropic lobe: tangent and both roughness values are taken from the closure. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = closure->T;
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_y;
+  lobe->ior = closure->ior;
+  /* Fill the extra storage, then attach it; clearcoat is fixed at zero for this closure. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = rgb_to_spectrum(closure->cspec0);
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_ggx_fresnel_setup(lobe, sd);
+}
+
+/* Multi-scattering GGX closures */
+
+ccl_device void osl_closure_microfacet_multi_ggx_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetMultiGGXClosure *closure)
+{
+  /* Isotropic multi-scattering GGX lobe.
+   * Technically, the MultiGGX closure may also transmit. However, since the label is
+   * set statically and only used for caustic flags, glossy-reflect is probably as good
+   * as it gets. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Isotropic: alpha_y mirrors alpha_x, tangent is zero, and the IOR is written as zero. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = zero_float3();
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_x;
+  lobe->ior = 0.0f;
+  /* Only the color comes from the closure; cspec0 and clearcoat are zeroed. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = zero_spectrum();
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_multi_ggx_setup(lobe);
+}
+
+ccl_device void osl_closure_microfacet_multi_ggx_glass_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetMultiGGXGlassClosure *closure)
+{
+  /* Multi-scattering GGX glass lobe.
+   * Technically, the MultiGGX closure may also transmit. However, since the label is
+   * set statically and only used for caustic flags, glossy-reflect is probably as good
+   * as it gets. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Isotropic: alpha_y mirrors alpha_x and the tangent is zero; the IOR is the closure's. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = zero_float3();
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_x;
+  lobe->ior = closure->ior;
+  /* Only the color comes from the closure; cspec0 and clearcoat are zeroed. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = zero_spectrum();
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(lobe);
+}
+
+ccl_device void osl_closure_microfacet_multi_ggx_aniso_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetMultiGGXAnisoClosure *closure)
+{
+  /* Anisotropic multi-scattering GGX lobe.
+   * Technically, the MultiGGX closure may also transmit. However, since the label is
+   * set statically and only used for caustic flags, glossy-reflect is probably as good
+   * as it gets. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Anisotropic: tangent and both roughness values are the closure's; IOR is written as zero. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = closure->T;
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_y;
+  lobe->ior = 0.0f;
+  /* Only the color comes from the closure; cspec0 and clearcoat are zeroed. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = zero_spectrum();
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_multi_ggx_setup(lobe);
+}
+
+/* Multi-scattering GGX closures with Fresnel */
+
+ccl_device void osl_closure_microfacet_multi_ggx_fresnel_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetMultiGGXFresnelClosure *closure)
+{
+  /* Isotropic multi-scattering GGX lobe with Fresnel.
+   * Technically, the MultiGGX closure may also transmit. However, since the label is
+   * set statically and only used for caustic flags, glossy-reflect is probably as good
+   * as it gets. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Isotropic: alpha_y mirrors alpha_x and the tangent is zero; the IOR is the closure's. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = zero_float3();
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_x;
+  lobe->ior = closure->ior;
+  /* Color and cspec0 both come from the closure; clearcoat is fixed at zero. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = rgb_to_spectrum(closure->cspec0);
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(lobe, sd);
+}
+
+ccl_device void osl_closure_microfacet_multi_ggx_glass_fresnel_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetMultiGGXGlassFresnelClosure *closure)
+{
+  /* Multi-scattering GGX glass lobe with Fresnel.
+   * Technically, the MultiGGX closure may also transmit. However, since the label is
+   * set statically and only used for caustic flags, glossy-reflect is probably as good
+   * as it gets. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Isotropic: alpha_y mirrors alpha_x and the tangent is zero; the IOR is the closure's. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = zero_float3();
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_x;
+  lobe->ior = closure->ior;
+  /* Color and cspec0 both come from the closure; clearcoat is fixed at zero. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = rgb_to_spectrum(closure->cspec0);
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(lobe, sd);
+}
+
+ccl_device void osl_closure_microfacet_multi_ggx_aniso_fresnel_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetMultiGGXAnisoFresnelClosure *closure)
+{
+  /* Anisotropic multi-scattering GGX lobe with Fresnel.
+   * Technically, the MultiGGX closure may also transmit. However, since the label is
+   * set statically and only used for caustic flags, glossy-reflect is probably as good
+   * as it gets. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *aux = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Anisotropic: tangent, both roughness values and the IOR are taken from the closure. */
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  lobe->T = closure->T;
+  lobe->alpha_x = closure->alpha_x;
+  lobe->alpha_y = closure->alpha_y;
+  lobe->ior = closure->ior;
+  /* Color and cspec0 both come from the closure; clearcoat is fixed at zero. */
+  aux->color = rgb_to_spectrum(closure->color);
+  aux->cspec0 = rgb_to_spectrum(closure->cspec0);
+  aux->clearcoat = 0.0f;
+  lobe->extra = aux;
+  sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(lobe, sd);
+}
+
+/* Beckmann closures */
+
+ccl_device void osl_closure_microfacet_beckmann_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetBeckmannIsotropicClosure *closure)
+{
+  /* Isotropic Beckmann reflection: add nothing when osl_closure_skip() rejects the labels. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+  /* Only alpha_x and the normal are written here before the isotropic setup runs. */
+  lobe->alpha_x = closure->alpha_x;
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(lobe);
+}
+
+ccl_device void osl_closure_microfacet_beckmann_aniso_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetBeckmannClosure *closure)
+{
+  /* Anisotropic Beckmann reflection: add nothing when osl_closure_skip() rejects the labels. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+  /* Copy the tangent and both roughness values, plus the ensure_valid_reflection() normal. */
+  lobe->T = closure->T;
+  lobe->alpha_y = closure->alpha_y;
+  lobe->alpha_x = closure->alpha_x;
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_microfacet_beckmann_setup(lobe);
+}
+
+ccl_device void osl_closure_microfacet_beckmann_refraction_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const MicrofacetBeckmannRefractionClosure *closure)
+{
+  /* Beckmann refraction: add nothing when osl_closure_skip() rejects glossy-transmit labels. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_TRANSMIT)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+  /* Refraction needs the IOR in addition to alpha_x and the normal. */
+  lobe->ior = closure->ior;
+  lobe->alpha_x = closure->alpha_x;
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_microfacet_beckmann_refraction_setup(lobe);
+}
+
+/* Ashikhmin closures */
+
+ccl_device void osl_closure_ashikhmin_velvet_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const AshikhminVelvetClosure *closure)
+{
+  /* Ashikhmin velvet: add nothing when osl_closure_skip() rejects the diffuse label. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_DIFFUSE)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private VelvetBsdf *velvet = (ccl_private VelvetBsdf *)bsdf_alloc(
+      sd, sizeof(VelvetBsdf), rgb_to_spectrum(weight));
+  if (!velvet) {
+    return;
+  }
+  /* The closure contributes sigma and the normal; nothing else is written here. */
+  velvet->sigma = closure->sigma;
+  velvet->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_ashikhmin_velvet_setup(velvet);
+}
+
+ccl_device void osl_closure_ashikhmin_shirley_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const AshikhminShirleyClosure *closure)
+{
+  /* Ashikhmin-Shirley reflection: add nothing when osl_closure_skip() rejects the labels. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY | LABEL_REFLECT)) {
+    return;
+  }
+  /* Stored in a MicrofacetBsdf; on a null allocation result there is nothing to set up. */
+  ccl_private MicrofacetBsdf *lobe = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+  /* Copy the tangent and both roughness values, plus the ensure_valid_reflection() normal. */
+  lobe->T = closure->T;
+  lobe->alpha_y = closure->alpha_y;
+  lobe->alpha_x = closure->alpha_x;
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_ashikhmin_shirley_setup(lobe);
+}
+
+ccl_device void osl_closure_diffuse_toon_setup(KernelGlobals kg,
+                                               ccl_private ShaderData *sd,
+                                               uint32_t path_flag,
+                                               float3 weight,
+                                               ccl_private const DiffuseToonClosure *closure)
+{
+  /* Diffuse toon: add nothing when osl_closure_skip() rejects the diffuse label. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_DIFFUSE)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private ToonBsdf *toon = (ccl_private ToonBsdf *)bsdf_alloc(
+      sd, sizeof(ToonBsdf), rgb_to_spectrum(weight));
+  if (!toon) {
+    return;
+  }
+  /* The closure contributes size, smooth and the normal. */
+  toon->smooth = closure->smooth;
+  toon->size = closure->size;
+  toon->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_diffuse_toon_setup(toon);
+}
+
+ccl_device void osl_closure_glossy_toon_setup(KernelGlobals kg,
+                                              ccl_private ShaderData *sd,
+                                              uint32_t path_flag,
+                                              float3 weight,
+                                              ccl_private const GlossyToonClosure *closure)
+{
+  /* Glossy toon: add nothing when osl_closure_skip() rejects the glossy label. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private ToonBsdf *toon = (ccl_private ToonBsdf *)bsdf_alloc(
+      sd, sizeof(ToonBsdf), rgb_to_spectrum(weight));
+  if (!toon) {
+    return;
+  }
+  /* The closure contributes size, smooth and the normal. */
+  toon->smooth = closure->smooth;
+  toon->size = closure->size;
+  toon->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_glossy_toon_setup(toon);
+}
+
+/* Disney principled closures */
+
+ccl_device void osl_closure_principled_diffuse_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const PrincipledDiffuseClosure *closure)
+{
+  /* Principled diffuse: add nothing when osl_closure_skip() rejects the diffuse label. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_DIFFUSE)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private PrincipledDiffuseBsdf *lobe = (ccl_private PrincipledDiffuseBsdf *)bsdf_alloc(
+      sd, sizeof(PrincipledDiffuseBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+  /* The closure contributes roughness and the normal. */
+  lobe->roughness = closure->roughness;
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_principled_diffuse_setup(lobe);
+}
+
+ccl_device void osl_closure_principled_sheen_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const PrincipledSheenClosure *closure)
+{
+  /* Principled sheen: add nothing when osl_closure_skip() rejects the diffuse label. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_DIFFUSE)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private PrincipledSheenBsdf *lobe = (ccl_private PrincipledSheenBsdf *)bsdf_alloc(
+      sd, sizeof(PrincipledSheenBsdf), rgb_to_spectrum(weight));
+  if (!lobe) {
+    return;
+  }
+  /* avg_value starts at zero; only the normal comes from the closure. */
+  lobe->avg_value = 0.0f;
+  lobe->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_principled_sheen_setup(sd, lobe);
+}
+
+ccl_device void osl_closure_principled_clearcoat_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const PrincipledClearcoatClosure *closure)
+{
+  /* Clearcoat GGX lobe with IOR 1.5 and cspec0 0.04. NOTE(review): unlike the other glossy
+   * setups in this file there is no osl_closure_skip() check here -- confirm intentional. */
+  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return;
+  }
+
+  ccl_private MicrofacetExtra *extra = (ccl_private MicrofacetExtra *)closure_alloc_extra(
+      sd, sizeof(MicrofacetExtra));
+  if (!extra) {
+    return;
+  }
+  /* Isotropic: both roughness values come from clearcoat_roughness, the tangent is zeroed. */
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->alpha_x = closure->clearcoat_roughness;
+  bsdf->alpha_y = closure->clearcoat_roughness;
+  bsdf->ior = 1.5f;
+  bsdf->T = zero_float3();
+  bsdf->extra = extra;
+  bsdf->extra->color = zero_spectrum();
+  bsdf->extra->cspec0 = make_spectrum(0.04f);
+  bsdf->extra->clearcoat = closure->clearcoat;
+  sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
+}
+
+/* Emission closure setup (historically documented as a "variable cone emissive closure").
+ *
+ * Forwards the closure weight, converted with rgb_to_spectrum(), to emission_setup().
+ * NOTE(review): the cone/penumbra/outer_angle behavior described by the old comment is not
+ * implemented in this function; `kg` and `closure` are unused here -- confirm whether the
+ * cone description still applies anywhere. */
+ccl_device void osl_closure_emission_setup(KernelGlobals kg,
+                                           ccl_private ShaderData *sd,
+                                           uint32_t /* path_flag */,
+                                           float3 weight,
+                                           ccl_private const GenericEmissiveClosure *closure)
+{
+  emission_setup(sd, rgb_to_spectrum(weight));
+}
+
+/* Generic background closure
+ *
+ * We only have a background closure for the shaders to return a color in background shaders. No
+ * methods, only the weight is taken into account: it is converted with rgb_to_spectrum() and
+ * passed to background_setup(); `kg` and `closure` are unused. */
+ccl_device void osl_closure_background_setup(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             uint32_t /* path_flag */,
+                                             float3 weight,
+                                             ccl_private const GenericBackgroundClosure *closure)
+{
+  background_setup(sd, rgb_to_spectrum(weight));
+}
+
+/* Holdout closure
+ *
+ * This will be used by the shader to mark the amount of holdout for the current shading point.
+ * No parameters, only the weight will be used: a CLOSURE_HOLDOUT_ID closure is allocated with
+ * it, and SD_HOLDOUT is set on sd->flag without checking the allocation result. */
+ccl_device void osl_closure_holdout_setup(KernelGlobals kg,
+                                          ccl_private ShaderData *sd,
+                                          uint32_t /* path_flag */,
+                                          float3 weight,
+                                          ccl_private const HoldoutClosure *closure)
+{
+  closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, rgb_to_spectrum(weight));
+  sd->flag |= SD_HOLDOUT;
+}
+
+ccl_device void osl_closure_diffuse_ramp_setup(KernelGlobals kg,
+                                               ccl_private ShaderData *sd,
+                                               uint32_t /* path_flag */,
+                                               float3 weight,
+                                               ccl_private const DiffuseRampClosure *closure)
+{
+  /* Diffuse ramp BSDF: the 8-entry color table lives in separately allocated extra storage. */
+  ccl_private DiffuseRampBsdf *bsdf = (ccl_private DiffuseRampBsdf *)bsdf_alloc(
+      sd, sizeof(DiffuseRampBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return;
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->colors = (ccl_private float3 *)closure_alloc_extra(sd, sizeof(float3) * 8);
+  if (!bsdf->colors) {
+    return;
+  }
+
+  /* NOTE(review): assumes closure->colors holds at least 8 entries -- confirm. */
+  for (int i = 0; i < 8; i++) {
+    bsdf->colors[i] = closure->colors[i];
+  }
+  sd->flag |= bsdf_diffuse_ramp_setup(bsdf);
+}
+
+ccl_device void osl_closure_phong_ramp_setup(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             uint32_t /* path_flag */,
+                                             float3 weight,
+                                             ccl_private const PhongRampClosure *closure)
+{
+  /* Phong ramp BSDF: the 8-entry color table lives in separately allocated extra storage. */
+  ccl_private PhongRampBsdf *bsdf = (ccl_private PhongRampBsdf *)bsdf_alloc(
+      sd, sizeof(PhongRampBsdf), rgb_to_spectrum(weight));
+  if (!bsdf) {
+    return;
+  }
+
+  bsdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bsdf->exponent = closure->exponent;
+  bsdf->colors = (ccl_private float3 *)closure_alloc_extra(sd, sizeof(float3) * 8);
+  if (!bsdf->colors) {
+    return;
+  }
+  /* NOTE(review): assumes closure->colors holds at least 8 entries -- confirm. */
+  for (int i = 0; i < 8; i++) {
+    bsdf->colors[i] = closure->colors[i];
+  }
+  sd->flag |= bsdf_phong_ramp_setup(bsdf);
+}
+
+ccl_device void osl_closure_bssrdf_setup(KernelGlobals kg,
+                                         ccl_private ShaderData *sd,
+                                         uint32_t path_flag,
+                                         float3 weight,
+                                         ccl_private const BSSRDFClosure *closure)
+{
+  static ustring u_burley("burley");
+  static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
+  static ustring u_random_walk("random_walk");
+
+  /* Map the method name to a closure type; an unrecognized name adds no closure at all. */
+  ClosureType type;
+  if (closure->method == u_burley) {
+    type = CLOSURE_BSSRDF_BURLEY_ID;
+  }
+  else if (closure->method == u_random_walk_fixed_radius) {
+    type = CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID;
+  }
+  else if (closure->method == u_random_walk) {
+    type = CLOSURE_BSSRDF_RANDOM_WALK_ID;
+  }
+  else {
+    return;
+  }
+
+  ccl_private Bssrdf *bssrdf = bssrdf_alloc(sd, rgb_to_spectrum(weight));
+  if (!bssrdf) {
+    return;
+  }
+
+  /* Disable in case of diffuse ancestor, can't see it well then and
+   * adds considerable noise due to probabilities of continuing path
+   * getting lower and lower. */
+  if (path_flag & PATH_RAY_DIFFUSE_ANCESTOR) {
+    bssrdf->radius = zero_spectrum();
+  }
+  else {
+    bssrdf->radius = rgb_to_spectrum(closure->radius);
+  }
+
+  /* A single Bssrdf carries all channels; anisotropy and IOR are clamped before use. */
+  bssrdf->albedo = rgb_to_spectrum(closure->albedo);
+  bssrdf->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  bssrdf->roughness = closure->roughness;
+  bssrdf->anisotropy = clamp(closure->anisotropy, 0.0f, 0.9f);
+  sd->flag |= bssrdf_setup(sd, bssrdf, type, clamp(closure->ior, 1.01f, 3.8f));
+}
+
+/* Hair */
+
+ccl_device void osl_closure_hair_reflection_setup(KernelGlobals kg,
+                                                  ccl_private ShaderData *sd,
+                                                  uint32_t path_flag,
+                                                  float3 weight,
+                                                  ccl_private const HairReflectionClosure *closure)
+{
+  /* Hair reflection: add nothing when osl_closure_skip() rejects the glossy label. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private HairBsdf *hair = (ccl_private HairBsdf *)bsdf_alloc(
+      sd, sizeof(HairBsdf), rgb_to_spectrum(weight));
+  if (!hair) {
+    return;
+  }
+  /* Copy offset, both roughness values and the tangent, plus the adjusted normal. */
+  hair->offset = closure->offset;
+  hair->roughness2 = closure->roughness2;
+  hair->roughness1 = closure->roughness1;
+  hair->T = closure->T;
+  hair->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_hair_reflection_setup(hair);
+}
+
+ccl_device void osl_closure_hair_transmission_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const HairTransmissionClosure *closure)
+{
+  /* Hair transmission: add nothing when osl_closure_skip() rejects the glossy label. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY)) {
+    return;
+  }
+  /* Allocate with the converted weight; on a null result there is nothing to set up. */
+  ccl_private HairBsdf *hair = (ccl_private HairBsdf *)bsdf_alloc(
+      sd, sizeof(HairBsdf), rgb_to_spectrum(weight));
+  if (!hair) {
+    return;
+  }
+  /* Copy offset, both roughness values and the tangent, plus the adjusted normal. */
+  hair->offset = closure->offset;
+  hair->roughness2 = closure->roughness2;
+  hair->roughness1 = closure->roughness1;
+  hair->T = closure->T;
+  hair->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_hair_transmission_setup(hair);
+}
+
+ccl_device void osl_closure_principled_hair_setup(KernelGlobals kg,
+                                                  ccl_private ShaderData *sd,
+                                                  uint32_t path_flag,
+                                                  float3 weight,
+                                                  ccl_private const PrincipledHairClosure *closure)
+{
+#ifdef __HAIR__
+  /* Principled hair: empty without __HAIR__; skipped when osl_closure_skip() says so. */
+  if (osl_closure_skip(kg, sd, path_flag, LABEL_GLOSSY)) {
+    return;
+  }
+
+  ccl_private PrincipledHairBSDF *hair = (ccl_private PrincipledHairBSDF *)bsdf_alloc(
+      sd, sizeof(PrincipledHairBSDF), rgb_to_spectrum(weight));
+  if (!hair) {
+    return;
+  }
+
+  ccl_private PrincipledHairExtra *aux = (ccl_private PrincipledHairExtra *)closure_alloc_extra(
+      sd, sizeof(PrincipledHairExtra));
+  if (!aux) {
+    return;
+  }
+
+  /* Attach the (unfilled) extra storage and copy every scalar parameter from the closure. */
+  hair->extra = aux;
+  hair->m0_roughness = closure->m0_roughness;
+  hair->eta = closure->eta;
+  hair->alpha = closure->alpha;
+  hair->s = closure->s;
+  hair->v = closure->v;
+  hair->sigma = closure->sigma;
+  hair->N = ensure_valid_reflection(sd->Ng, sd->I, closure->N);
+  sd->flag |= bsdf_principled_hair_setup(sd, hair);
+#endif
+}
+
+/* Volume */
+
+ccl_device void osl_closure_absorption_setup(KernelGlobals kg,
+                                             ccl_private ShaderData *sd,
+                                             uint32_t path_flag,
+                                             float3 weight,
+                                             ccl_private const VolumeAbsorptionClosure *closure)
+{
+  volume_extinction_setup(sd, rgb_to_spectrum(weight)); /* Only sd and weight are used. */
+}
+
+ccl_device void osl_closure_henyey_greenstein_setup(
+    KernelGlobals kg,
+    ccl_private ShaderData *sd,
+    uint32_t path_flag,
+    float3 weight,
+    ccl_private const VolumeHenyeyGreensteinClosure *closure)
+{
+  /* The weight feeds the extinction first, then the Henyey-Greenstein closure itself. */
+  volume_extinction_setup(sd, rgb_to_spectrum(weight));
+  /* The extinction above stays in effect even if this allocation returns null. */
+  ccl_private HenyeyGreensteinVolume *phase = (ccl_private HenyeyGreensteinVolume *)bsdf_alloc(
+      sd, sizeof(HenyeyGreensteinVolume), rgb_to_spectrum(weight));
+  if (!phase) {
+    return;
+  }
+
+  phase->g = closure->g;
+  sd->flag |= volume_henyey_greenstein_setup(phase);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/closures_template.h b/intern/cycles/kernel/osl/closures_template.h
new file mode 100644
index 00000000000..c808b275966
--- /dev/null
+++ b/intern/cycles/kernel/osl/closures_template.h
@@ -0,0 +1,258 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef OSL_CLOSURE_STRUCT_BEGIN
+#  define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower)
+#endif
+#ifndef OSL_CLOSURE_STRUCT_END
+#  define OSL_CLOSURE_STRUCT_END(Upper, lower)
+#endif
+#ifndef OSL_CLOSURE_STRUCT_MEMBER
+#  define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key)
+#endif
+#ifndef OSL_CLOSURE_STRUCT_ARRAY_MEMBER
+#  define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size)
+#endif
+
+OSL_CLOSURE_STRUCT_BEGIN(Diffuse, diffuse)
+  OSL_CLOSURE_STRUCT_MEMBER(Diffuse, VECTOR, packed_float3, N, NULL)
+OSL_CLOSURE_STRUCT_END(Diffuse, diffuse)
+
+OSL_CLOSURE_STRUCT_BEGIN(OrenNayar, oren_nayar)
+  OSL_CLOSURE_STRUCT_MEMBER(OrenNayar, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(OrenNayar, FLOAT, float, roughness, NULL)
+OSL_CLOSURE_STRUCT_END(OrenNayar, oren_nayar)
+
+OSL_CLOSURE_STRUCT_BEGIN(Translucent, translucent)
+  OSL_CLOSURE_STRUCT_MEMBER(Translucent, VECTOR, packed_float3, N, NULL)
+OSL_CLOSURE_STRUCT_END(Translucent, translucent)
+
+OSL_CLOSURE_STRUCT_BEGIN(Reflection, reflection)
+  OSL_CLOSURE_STRUCT_MEMBER(Reflection, VECTOR, packed_float3, N, NULL)
+OSL_CLOSURE_STRUCT_END(Reflection, reflection)
+
+OSL_CLOSURE_STRUCT_BEGIN(Refraction, refraction)
+  OSL_CLOSURE_STRUCT_MEMBER(Refraction, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Refraction, FLOAT, float, ior, NULL)
+OSL_CLOSURE_STRUCT_END(Refraction, refraction)
+
+OSL_CLOSURE_STRUCT_BEGIN(Transparent, transparent)
+OSL_CLOSURE_STRUCT_END(Transparent, transparent)
+
+OSL_CLOSURE_STRUCT_BEGIN(Microfacet, microfacet)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, ustring, distribution, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_y, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, ior, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, INT, int, refract, NULL)
+OSL_CLOSURE_STRUCT_END(Microfacet, microfacet)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetGGXIsotropic, microfacet_ggx)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXIsotropic, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXIsotropic, FLOAT, float, alpha_x, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetGGXIsotropic, microfacet_ggx)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetGGX, microfacet_ggx_aniso)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGX, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGX, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGX, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGX, FLOAT, float, alpha_y, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetGGX, microfacet_ggx_aniso)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetGGXRefraction, microfacet_ggx_refraction)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXRefraction, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXRefraction, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXRefraction, FLOAT, float, ior, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetGGXRefraction, microfacet_ggx_refraction)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetMultiGGX, microfacet_multi_ggx)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGX, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGX, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGX, VECTOR, packed_float3, color, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetMultiGGX, microfacet_multi_ggx)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetMultiGGXGlass, microfacet_multi_ggx_glass)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlass, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlass, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlass, FLOAT, float, ior, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlass, VECTOR, packed_float3, color, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetMultiGGXGlass, microfacet_multi_ggx_glass)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetMultiGGXAniso, microfacet_multi_ggx_aniso)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAniso, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAniso, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAniso, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAniso, FLOAT, float, alpha_y, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAniso, VECTOR, packed_float3, color, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetMultiGGXAniso, microfacet_multi_ggx_aniso)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetGGXFresnel, microfacet_ggx_fresnel)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXFresnel, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXFresnel, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXFresnel, FLOAT, float, ior, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXFresnel, VECTOR, packed_float3, color, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXFresnel, VECTOR, packed_float3, cspec0, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetGGXFresnel, microfacet_ggx_fresnel)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetGGXAnisoFresnel, microfacet_ggx_aniso_fresnel)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXAnisoFresnel, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXAnisoFresnel, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXAnisoFresnel, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXAnisoFresnel, FLOAT, float, alpha_y, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXAnisoFresnel, FLOAT, float, ior, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXAnisoFresnel, VECTOR, packed_float3, color, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetGGXAnisoFresnel, VECTOR, packed_float3, cspec0, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetGGXAnisoFresnel, microfacet_ggx_aniso_fresnel)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetMultiGGXFresnel, microfacet_multi_ggx_fresnel)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXFresnel, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXFresnel, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXFresnel, FLOAT, float, ior, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXFresnel, VECTOR, packed_float3, color, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXFresnel, VECTOR, packed_float3, cspec0, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetMultiGGXFresnel, microfacet_multi_ggx_fresnel)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetMultiGGXGlassFresnel, microfacet_multi_ggx_glass_fresnel)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlassFresnel, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlassFresnel, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlassFresnel, FLOAT, float, ior, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlassFresnel, VECTOR, packed_float3, color, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXGlassFresnel, VECTOR, packed_float3, cspec0, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetMultiGGXGlassFresnel, microfacet_multi_ggx_glass_fresnel)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetMultiGGXAnisoFresnel, microfacet_multi_ggx_aniso_fresnel)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAnisoFresnel, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAnisoFresnel, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAnisoFresnel, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAnisoFresnel, FLOAT, float, alpha_y, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAnisoFresnel, FLOAT, float, ior, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAnisoFresnel, VECTOR, packed_float3, color, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetMultiGGXAnisoFresnel, VECTOR, packed_float3, cspec0, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetMultiGGXAnisoFresnel, microfacet_multi_ggx_aniso_fresnel)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetBeckmannIsotropic, microfacet_beckmann)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmannIsotropic, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmannIsotropic, FLOAT, float, alpha_x, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetBeckmannIsotropic, microfacet_beckmann)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetBeckmann, microfacet_beckmann_aniso)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmann, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmann, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmann, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmann, FLOAT, float, alpha_y, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetBeckmann, microfacet_beckmann_aniso)
+
+OSL_CLOSURE_STRUCT_BEGIN(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmannRefraction, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmannRefraction, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(MicrofacetBeckmannRefraction, FLOAT, float, ior, NULL)
+OSL_CLOSURE_STRUCT_END(MicrofacetBeckmannRefraction, microfacet_beckmann_refraction)
+
+OSL_CLOSURE_STRUCT_BEGIN(AshikhminShirley, ashikhmin_shirley)
+  OSL_CLOSURE_STRUCT_MEMBER(AshikhminShirley, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(AshikhminShirley, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(AshikhminShirley, FLOAT, float, alpha_x, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(AshikhminShirley, FLOAT, float, alpha_y, NULL)
+OSL_CLOSURE_STRUCT_END(AshikhminShirley, ashikhmin_shirley)
+
+OSL_CLOSURE_STRUCT_BEGIN(AshikhminVelvet, ashikhmin_velvet)
+  OSL_CLOSURE_STRUCT_MEMBER(AshikhminVelvet, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(AshikhminVelvet, FLOAT, float, sigma, NULL)
+OSL_CLOSURE_STRUCT_END(AshikhminVelvet, ashikhmin_velvet)
+
+OSL_CLOSURE_STRUCT_BEGIN(DiffuseToon, diffuse_toon)
+  OSL_CLOSURE_STRUCT_MEMBER(DiffuseToon, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(DiffuseToon, FLOAT, float, size, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(DiffuseToon, FLOAT, float, smooth, NULL)
+OSL_CLOSURE_STRUCT_END(DiffuseToon, diffuse_toon)
+
+OSL_CLOSURE_STRUCT_BEGIN(GlossyToon, glossy_toon)
+  OSL_CLOSURE_STRUCT_MEMBER(GlossyToon, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(GlossyToon, FLOAT, float, size, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(GlossyToon, FLOAT, float, smooth, NULL)
+OSL_CLOSURE_STRUCT_END(GlossyToon, glossy_toon)
+
+OSL_CLOSURE_STRUCT_BEGIN(PrincipledDiffuse, principled_diffuse)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledDiffuse, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledDiffuse, FLOAT, float, roughness, NULL)
+OSL_CLOSURE_STRUCT_END(PrincipledDiffuse, principled_diffuse)
+
+OSL_CLOSURE_STRUCT_BEGIN(PrincipledSheen, principled_sheen)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledSheen, VECTOR, packed_float3, N, NULL)
+OSL_CLOSURE_STRUCT_END(PrincipledSheen, principled_sheen)
+
+OSL_CLOSURE_STRUCT_BEGIN(PrincipledClearcoat, principled_clearcoat)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledClearcoat, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledClearcoat, FLOAT, float, clearcoat, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledClearcoat, FLOAT, float, clearcoat_roughness, NULL)
+OSL_CLOSURE_STRUCT_END(PrincipledClearcoat, principled_clearcoat)
+
+OSL_CLOSURE_STRUCT_BEGIN(GenericEmissive, emission)
+OSL_CLOSURE_STRUCT_END(GenericEmissive, emission)
+
+OSL_CLOSURE_STRUCT_BEGIN(GenericBackground, background)
+OSL_CLOSURE_STRUCT_END(GenericBackground, background)
+
+OSL_CLOSURE_STRUCT_BEGIN(Holdout, holdout)
+OSL_CLOSURE_STRUCT_END(Holdout, holdout)
+
+OSL_CLOSURE_STRUCT_BEGIN(DiffuseRamp, diffuse_ramp)
+  OSL_CLOSURE_STRUCT_MEMBER(DiffuseRamp, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_ARRAY_MEMBER(DiffuseRamp, COLOR, packed_float3, colors, NULL, 8)
+OSL_CLOSURE_STRUCT_END(DiffuseRamp, diffuse_ramp)
+
+OSL_CLOSURE_STRUCT_BEGIN(PhongRamp, phong_ramp)
+  OSL_CLOSURE_STRUCT_MEMBER(PhongRamp, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PhongRamp, FLOAT, float, exponent, NULL)
+  OSL_CLOSURE_STRUCT_ARRAY_MEMBER(PhongRamp, COLOR, packed_float3, colors, NULL, 8)
+OSL_CLOSURE_STRUCT_END(PhongRamp, phong_ramp)
+
+OSL_CLOSURE_STRUCT_BEGIN(BSSRDF, bssrdf)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, ustring, method, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, radius, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, albedo, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, FLOAT, float, roughness, "roughness")
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, FLOAT, float, ior, "ior")
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, FLOAT, float, anisotropy, "anisotropy")
+OSL_CLOSURE_STRUCT_END(BSSRDF, bssrdf)
+
+OSL_CLOSURE_STRUCT_BEGIN(HairReflection, hair_reflection)
+  OSL_CLOSURE_STRUCT_MEMBER(HairReflection, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairReflection, FLOAT, float, roughness1, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairReflection, FLOAT, float, roughness2, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairReflection, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairReflection, FLOAT, float, offset, NULL)
+OSL_CLOSURE_STRUCT_END(HairReflection, hair_reflection)
+
+OSL_CLOSURE_STRUCT_BEGIN(HairTransmission, hair_transmission)
+  OSL_CLOSURE_STRUCT_MEMBER(HairTransmission, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairTransmission, FLOAT, float, roughness1, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairTransmission, FLOAT, float, roughness2, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairTransmission, VECTOR, packed_float3, T, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(HairTransmission, FLOAT, float, offset, NULL)
+OSL_CLOSURE_STRUCT_END(HairTransmission, hair_transmission)
+
+OSL_CLOSURE_STRUCT_BEGIN(PrincipledHair, principled_hair)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledHair, VECTOR, packed_float3, N, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledHair, VECTOR, packed_float3, sigma, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledHair, FLOAT, float, v, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledHair, FLOAT, float, s, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledHair, FLOAT, float, m0_roughness, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledHair, FLOAT, float, alpha, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(PrincipledHair, FLOAT, float, eta, NULL)
+OSL_CLOSURE_STRUCT_END(PrincipledHair, principled_hair)
+
+OSL_CLOSURE_STRUCT_BEGIN(VolumeAbsorption, absorption)
+OSL_CLOSURE_STRUCT_END(VolumeAbsorption, absorption)
+
+OSL_CLOSURE_STRUCT_BEGIN(VolumeHenyeyGreenstein, henyey_greenstein)
+  OSL_CLOSURE_STRUCT_MEMBER(VolumeHenyeyGreenstein, FLOAT, float, g, NULL)
+OSL_CLOSURE_STRUCT_END(VolumeHenyeyGreenstein, henyey_greenstein)
+
+#undef OSL_CLOSURE_STRUCT_BEGIN
+#undef OSL_CLOSURE_STRUCT_END
+#undef OSL_CLOSURE_STRUCT_MEMBER
+#undef OSL_CLOSURE_STRUCT_ARRAY_MEMBER
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
deleted file mode 100644
index 1a01b215836..00000000000
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- *
- * Adapted from Open Shading Language
- * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
- * All Rights Reserved.
- *
- * Modifications Copyright 2011-2022 Blender Foundation. */
-
-#include <OpenImageIO/fmath.h>
-
-#include <OSL/genclosure.h>
-
-#include "kernel/osl/closures.h"
-
-// clang-format off
-#include "kernel/device/cpu/compat.h"
-#include "kernel/types.h"
-#include "kernel/closure/alloc.h"
-#include "kernel/closure/emissive.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-using namespace OSL;
-
-/// Variable cone emissive closure
-///
-/// This primitive emits in a cone having a configurable
-/// penumbra area where the light decays to 0 reaching the
-/// outer_angle limit. It can also behave as a lambertian emitter
-/// if the provided angles are PI/2, which is the default
-///
-class GenericEmissiveClosure : public CClosurePrimitive {
- public:
-  void setup(ShaderData *sd, uint32_t /* path_flag */, float3 weight)
-  {
-    emission_setup(sd, weight);
-  }
-};
-
-ClosureParam *closure_emission_params()
-{
-  static ClosureParam params[] = {CLOSURE_STRING_KEYPARAM(GenericEmissiveClosure, label, "label"),
-                                  CLOSURE_FINISH_PARAM(GenericEmissiveClosure)};
-  return params;
-}
-
-CCLOSURE_PREPARE(closure_emission_prepare, GenericEmissiveClosure)
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/globals.cpp b/intern/cycles/kernel/osl/globals.cpp
new file mode 100644
index 00000000000..92b91182178
--- /dev/null
+++ b/intern/cycles/kernel/osl/globals.cpp
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include <OSL/oslexec.h>
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/types.h"
+
+#include "kernel/osl/globals.h"
+#include "kernel/osl/services.h"
+
+CCL_NAMESPACE_BEGIN
+
+void OSLGlobals::thread_init(KernelGlobalsCPU *kg, OSLGlobals *osl_globals)
+{
+  /* OSL not in use: leave kg->osl NULL, which thread_free() treats as nothing to free. */
+  if (!osl_globals->use) {
+    kg->osl = NULL;
+    return;
+  }
+
+  /* Per thread kernel data init. */
+  kg->osl = osl_globals;
+
+  OSL::ShadingSystem *ss = kg->osl->ss;
+  OSLThreadData *tdata = new OSLThreadData(); /* Deleted in thread_free(). */
+
+  memset((void *)&tdata->globals, 0, sizeof(OSL::ShaderGlobals));
+  tdata->globals.tracedata = &tdata->tracedata;
+  tdata->osl_thread_info = ss->create_thread_info();
+  tdata->context = ss->get_context(tdata->osl_thread_info);
+
+  tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
+
+  kg->osl_ss = (OSLShadingSystem *)ss;
+  kg->osl_tdata = tdata;
+}
+
+void OSLGlobals::thread_free(KernelGlobalsCPU *kg)
+{
+  if (!kg->osl) /* thread_init() leaves this NULL when OSL is unused: nothing to free. */
+    return;
+
+  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
+  OSLThreadData *tdata = kg->osl_tdata;
+  ss->release_context(tdata->context);
+  /* The context was obtained from this thread info, so it is released first. */
+  ss->destroy_thread_info(tdata->osl_thread_info);
+  /* tdata was allocated with new in thread_init(). */
+  delete tdata;
+  /* Reset so a repeated thread_free() returns early. */
+  kg->osl = NULL;
+  kg->osl_ss = NULL;
+  kg->osl_tdata = NULL;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/globals.h b/intern/cycles/kernel/osl/globals.h
index 172091c55f5..2b002a0033e 100644
--- a/intern/cycles/kernel/osl/globals.h
+++ b/intern/cycles/kernel/osl/globals.h
@@ -41,6 +41,10 @@ struct OSLGlobals {
     use = false;
   }
 
+  /* per thread data */
+  static void thread_init(struct KernelGlobalsCPU *kg, OSLGlobals *osl_globals);
+  static void thread_free(struct KernelGlobalsCPU *kg);
+
   bool use;
 
   /* shading system */
@@ -56,16 +60,8 @@ struct OSLGlobals {
   OSL::ShaderGroupRef background_state;
 
   /* attributes */
-  struct Attribute {
-    TypeDesc type;
-    AttributeDescriptor desc;
-    ParamValue value;
-  };
-
-  typedef unordered_map<ustring, Attribute, ustringHash> AttributeMap;
   typedef unordered_map<ustring, int, ustringHash> ObjectNameMap;
 
-  vector<AttributeMap> attribute_map;
   ObjectNameMap object_name_map;
   vector<ustring> object_names;
 };
diff --git a/intern/cycles/kernel/osl/shader.h b/intern/cycles/kernel/osl/osl.h
index f0ab49dd6a8..bef23f3eea1 100644
--- a/intern/cycles/kernel/osl/shader.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -1,10 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
-#ifndef __OSL_SHADER_H__
-#define __OSL_SHADER_H__
-
-#ifdef WITH_OSL
+#pragma once
 
 /* OSL Shader Engine
  *
@@ -16,30 +13,12 @@
  * This means no thread state must be passed along in the kernel itself.
  */
 
-#  include "kernel/types.h"
+#include "kernel/osl/types.h"
 
 CCL_NAMESPACE_BEGIN
 
-class Scene;
-
-struct ShaderClosure;
-struct ShaderData;
-struct IntegratorStateCPU;
-struct differential3;
-struct KernelGlobalsCPU;
-
-struct OSLGlobals;
-struct OSLShadingSystem;
-
 class OSLShader {
  public:
-  /* init */
-  static void register_closures(OSLShadingSystem *ss);
-
-  /* per thread data */
-  static void thread_init(KernelGlobalsCPU *kg, OSLGlobals *osl_globals);
-  static void thread_free(KernelGlobalsCPU *kg);
-
   /* eval */
   static void eval_surface(const KernelGlobalsCPU *kg,
                            const void *state,
@@ -54,16 +33,6 @@ class OSLShader {
                           ShaderData *sd,
                           uint32_t path_flag);
   static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
-
-  /* attributes */
-  static int find_attribute(const KernelGlobalsCPU *kg,
-                            const ShaderData *sd,
-                            uint id,
-                            AttributeDescriptor *desc);
 };
 
 CCL_NAMESPACE_END
-
-#endif
-
-#endif /* __OSL_SHADER_H__ */
diff --git a/intern/cycles/kernel/osl/services.cpp b/intern/cycles/kernel/osl/services.cpp
index 6e75ae54f33..b744422ee78 100644
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -18,22 +18,17 @@
 #include "scene/pointcloud.h"
 #include "scene/scene.h"
 
-#include "kernel/osl/closures.h"
 #include "kernel/osl/globals.h"
 #include "kernel/osl/services.h"
-#include "kernel/osl/shader.h"
 
 #include "util/foreach.h"
 #include "util/log.h"
 #include "util/string.h"
 
-// clang-format off
 #include "kernel/device/cpu/compat.h"
 #include "kernel/device/cpu/globals.h"
 #include "kernel/device/cpu/image.h"
 
-#include "kernel/util/differential.h"
-
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 
@@ -45,10 +40,10 @@
 #include "kernel/camera/projection.h"
 
 #include "kernel/integrator/path_state.h"
-#include "kernel/integrator/shader_eval.h"
+
+#include "kernel/svm/svm.h"
 
 #include "kernel/util/color.h"
-// clang-format on
 
 CCL_NAMESPACE_BEGIN
 
@@ -125,14 +120,14 @@ ustring OSLRenderServices::u_v("v");
 ustring OSLRenderServices::u_empty;
 
 OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system)
-    : texture_system(texture_system)
+    : OSL::RendererServices(texture_system)
 {
 }
 
 OSLRenderServices::~OSLRenderServices()
 {
-  if (texture_system) {
-    VLOG_INFO << "OSL texture system stats:\n" << texture_system->getstats();
+  if (m_texturesys) {
+    VLOG_INFO << "OSL texture system stats:\n" << m_texturesys->getstats();
   }
 }
 
@@ -452,6 +447,7 @@ static bool set_attribute_float2(float2 f[3], TypeDesc type, bool derivatives, v
   return false;
 }
 
+#if 0
 static bool set_attribute_float2(float2 f, TypeDesc type, bool derivatives, void *val)
 {
   float2 fv[3];
@@ -462,6 +458,7 @@ static bool set_attribute_float2(float2 f, TypeDesc type, bool derivatives, void
 
   return set_attribute_float2(fv, type, derivatives, val);
 }
+#endif
 
 static bool set_attribute_float3(float3 f[3], TypeDesc type, bool derivatives, void *val)
 {
@@ -590,6 +587,7 @@ static bool set_attribute_float4(float4 f[3], TypeDesc type, bool derivatives, v
   return false;
 }
 
+#if 0
 static bool set_attribute_float4(float4 f, TypeDesc type, bool derivatives, void *val)
 {
   float4 fv[3];
@@ -600,6 +598,7 @@ static bool set_attribute_float4(float4 f, TypeDesc type, bool derivatives, void
 
   return set_attribute_float4(fv, type, derivatives, val);
 }
+#endif
 
 static bool set_attribute_float(float f[3], TypeDesc type, bool derivatives, void *val)
 {
@@ -741,76 +740,75 @@ static bool set_attribute_matrix(const Transform &tfm, TypeDesc type, void *val)
   return false;
 }
 
-static bool get_primitive_attribute(const KernelGlobalsCPU *kg,
-                                    const ShaderData *sd,
-                                    const OSLGlobals::Attribute &attr,
-                                    const TypeDesc &type,
-                                    bool derivatives,
-                                    void *val)
+static bool get_object_attribute(const KernelGlobalsCPU *kg,
+                                 ShaderData *sd,
+                                 const AttributeDescriptor &desc,
+                                 const TypeDesc &type,
+                                 bool derivatives,
+                                 void *val)
 {
-  if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
-      attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) {
+  if (desc.type == NODE_ATTR_FLOAT3) {
     float3 fval[3];
-    if (primitive_is_volume_attribute(sd, attr.desc)) {
-      fval[0] = primitive_volume_attribute_float3(kg, sd, attr.desc);
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc)) {
+      fval[0] = primitive_volume_attribute_float3(kg, sd, desc);
     }
-    else {
+    else
+#endif
+    {
       memset(fval, 0, sizeof(fval));
       fval[0] = primitive_surface_attribute_float3(
-          kg, sd, attr.desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
+          kg, sd, desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
     }
     return set_attribute_float3(fval, type, derivatives, val);
   }
-  else if (attr.type == TypeFloat2) {
-    if (primitive_is_volume_attribute(sd, attr.desc)) {
+  else if (desc.type == NODE_ATTR_FLOAT2) {
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc)) {
       assert(!"Float2 attribute not support for volumes");
       return false;
     }
-    else {
+    else
+#endif
+    {
       float2 fval[3];
       fval[0] = primitive_surface_attribute_float2(
-          kg, sd, attr.desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
+          kg, sd, desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
       return set_attribute_float2(fval, type, derivatives, val);
     }
   }
-  else if (attr.type == TypeDesc::TypeFloat) {
+  else if (desc.type == NODE_ATTR_FLOAT) {
     float fval[3];
-    if (primitive_is_volume_attribute(sd, attr.desc)) {
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc)) {
       memset(fval, 0, sizeof(fval));
-      fval[0] = primitive_volume_attribute_float(kg, sd, attr.desc);
+      fval[0] = primitive_volume_attribute_float(kg, sd, desc);
     }
-    else {
+    else
+#endif
+    {
       fval[0] = primitive_surface_attribute_float(
-          kg, sd, attr.desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
+          kg, sd, desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
     }
     return set_attribute_float(fval, type, derivatives, val);
   }
-  else if (attr.type == TypeDesc::TypeFloat4 || attr.type == TypeRGBA) {
+  else if (desc.type == NODE_ATTR_FLOAT4 || desc.type == NODE_ATTR_RGBA) {
     float4 fval[3];
-    if (primitive_is_volume_attribute(sd, attr.desc)) {
+#ifdef __VOLUME__
+    if (primitive_is_volume_attribute(sd, desc)) {
       memset(fval, 0, sizeof(fval));
-      fval[0] = primitive_volume_attribute_float4(kg, sd, attr.desc);
+      fval[0] = primitive_volume_attribute_float4(kg, sd, desc);
     }
-    else {
+    else
+#endif
+    {
       fval[0] = primitive_surface_attribute_float4(
-          kg, sd, attr.desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
+          kg, sd, desc, (derivatives) ? &fval[1] : NULL, (derivatives) ? &fval[2] : NULL);
     }
     return set_attribute_float4(fval, type, derivatives, val);
   }
-  else {
-    return false;
-  }
-}
-
-static bool get_mesh_attribute(const KernelGlobalsCPU *kg,
-                               const ShaderData *sd,
-                               const OSLGlobals::Attribute &attr,
-                               const TypeDesc &type,
-                               bool derivatives,
-                               void *val)
-{
-  if (attr.type == TypeDesc::TypeMatrix) {
-    Transform tfm = primitive_attribute_matrix(kg, sd, attr.desc);
+  else if (desc.type == NODE_ATTR_MATRIX) {
+    Transform tfm = primitive_attribute_matrix(kg, desc);
     return set_attribute_matrix(tfm, type, val);
   }
   else {
@@ -818,44 +816,6 @@ static bool get_mesh_attribute(const KernelGlobalsCPU *kg,
   }
 }
 
-static bool get_object_attribute(const OSLGlobals::Attribute &attr,
-                                 TypeDesc type,
-                                 bool derivatives,
-                                 void *val)
-{
-  if (attr.type == TypeDesc::TypePoint || attr.type == TypeDesc::TypeVector ||
-      attr.type == TypeDesc::TypeNormal || attr.type == TypeDesc::TypeColor) {
-    const float *data = (const float *)attr.value.data();
-    return set_attribute_float3(make_float3(data[0], data[1], data[2]), type, derivatives, val);
-  }
-  else if (attr.type == TypeFloat2) {
-    const float *data = (const float *)attr.value.data();
-    return set_attribute_float2(make_float2(data[0], data[1]), type, derivatives, val);
-  }
-  else if (attr.type == TypeDesc::TypeFloat) {
-    const float *data = (const float *)attr.value.data();
-    return set_attribute_float(data[0], type, derivatives, val);
-  }
-  else if (attr.type == TypeRGBA || attr.type == TypeDesc::TypeFloat4) {
-    const float *data = (const float *)attr.value.data();
-    return set_attribute_float4(
-        make_float4(data[0], data[1], data[2], data[3]), type, derivatives, val);
-  }
-  else if (attr.type == type) {
-    size_t datasize = attr.value.datasize();
-
-    memcpy(val, attr.value.data(), datasize);
-    if (derivatives) {
-      memset((char *)val + datasize, 0, datasize * 2);
-    }
-
-    return true;
-  }
-  else {
-    return false;
-  }
-}
-
 bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg,
                                                       ShaderData *sd,
                                                       ustring name,
@@ -980,6 +940,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
     float f = ((sd->shader & SHADER_SMOOTH_NORMAL) != 0);
     return set_attribute_float(f, type, derivatives, val);
   }
+#ifdef __HAIR__
   /* Hair Attributes */
   else if (name == u_is_curve) {
     float f = (sd->type & PRIMITIVE_CURVE) != 0;
@@ -997,6 +958,8 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
     float f = curve_random(kg, sd);
     return set_attribute_float(f, type, derivatives, val);
   }
+#endif
+#ifdef __POINTCLOUD__
   /* point attributes */
   else if (name == u_is_point) {
     float f = (sd->type & PRIMITIVE_POINT) != 0;
@@ -1014,6 +977,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
     float f = point_random(kg, sd);
     return set_attribute_float(f, type, derivatives, val);
   }
+#endif
   else if (name == u_normal_map_normal) {
     if (sd->type & PRIMITIVE_TRIANGLE) {
       float3 f = triangle_smooth_normal_unnormalized(kg, sd, sd->Ng, sd->prim, sd->u, sd->v);
@@ -1024,7 +988,7 @@ bool OSLRenderServices::get_object_standard_attribute(const KernelGlobalsCPU *kg
     }
   }
   else {
-    return false;
+    return get_background_attribute(kg, sd, name, type, derivatives, val);
   }
 }
 
@@ -1094,18 +1058,17 @@ bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
       ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P);
 
       if (derivatives) {
-        ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) -
-                 ndc[0];
-        ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) -
-                 ndc[0];
+        ndc[1] = zero_float3();
+        ndc[2] = zero_float3();
       }
     }
     else {
       ndc[0] = camera_world_to_ndc(kg, sd, sd->P);
 
       if (derivatives) {
-        ndc[1] = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx) - ndc[0];
-        ndc[2] = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy) - ndc[0];
+        const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+        ndc[1] = camera_world_to_ndc(kg, sd, sd->P + dP.dx) - ndc[0];
+        ndc[2] = camera_world_to_ndc(kg, sd, sd->P + dP.dy) - ndc[0];
       }
     }
 
@@ -1133,7 +1096,6 @@ bool OSLRenderServices::get_attribute(
     ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val)
 {
   const KernelGlobalsCPU *kg = sd->osl_globals;
-  int prim_type = 0;
   int object;
 
   /* lookup of attribute on another object */
@@ -1147,44 +1109,18 @@ bool OSLRenderServices::get_attribute(
   }
   else {
     object = sd->object;
-    prim_type = attribute_primitive_type(kg, sd);
-
-    if (object == OBJECT_NONE)
-      return get_background_attribute(kg, sd, name, type, derivatives, val);
   }
 
   /* find attribute on object */
-  object = object * ATTR_PRIM_TYPES + prim_type;
-  OSLGlobals::AttributeMap &attribute_map = kg->osl->attribute_map[object];
-  OSLGlobals::AttributeMap::iterator it = attribute_map.find(name);
-
-  if (it != attribute_map.end()) {
-    const OSLGlobals::Attribute &attr = it->second;
-
-    if (attr.desc.element != ATTR_ELEMENT_OBJECT) {
-      /* triangle and vertex attributes */
-      if (get_primitive_attribute(kg, sd, attr, type, derivatives, val))
-        return true;
-      else
-        return get_mesh_attribute(kg, sd, attr, type, derivatives, val);
-    }
-    else {
-      /* object attribute */
-      return get_object_attribute(attr, type, derivatives, val);
-    }
+  const AttributeDescriptor desc = find_attribute(
+      kg, object, sd->prim, object == sd->object ? sd->type : PRIMITIVE_NONE, name.hash());
+  if (desc.offset != ATTR_STD_NOT_FOUND) {
+    return get_object_attribute(kg, sd, desc, type, derivatives, val);
   }
   else {
     /* not found in attribute, check standard object info */
-    bool is_std_object_attribute = get_object_standard_attribute(
-        kg, sd, name, type, derivatives, val);
-
-    if (is_std_object_attribute)
-      return true;
-
-    return get_background_attribute(kg, sd, name, type, derivatives, val);
+    return get_object_standard_attribute(kg, sd, name, type, derivatives, val);
   }
-
-  return false;
 }
 
 bool OSLRenderServices::get_userdata(
@@ -1211,7 +1147,7 @@ TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring file
   }
 
   /* Get handle from OpenImageIO. */
-  OSL::TextureSystem *ts = texture_system;
+  OSL::TextureSystem *ts = m_texturesys;
   TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
   if (handle == NULL) {
     return NULL;
@@ -1233,7 +1169,7 @@ bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
   OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
 
   if (handle->oiio_handle) {
-    OSL::TextureSystem *ts = texture_system;
+    OSL::TextureSystem *ts = m_texturesys;
     return ts->good(handle->oiio_handle);
   }
   else {
@@ -1355,7 +1291,7 @@ bool OSLRenderServices::texture(ustring filename,
     }
     case OSLTextureHandle::OIIO: {
       /* OpenImageIO texture cache. */
-      OSL::TextureSystem *ts = texture_system;
+      OSL::TextureSystem *ts = m_texturesys;
 
       if (handle && handle->oiio_handle) {
         if (texture_thread_info == NULL) {
@@ -1459,7 +1395,7 @@ bool OSLRenderServices::texture3d(ustring filename,
     }
     case OSLTextureHandle::OIIO: {
       /* OpenImageIO texture cache. */
-      OSL::TextureSystem *ts = texture_system;
+      OSL::TextureSystem *ts = m_texturesys;
 
       if (handle && handle->oiio_handle) {
         if (texture_thread_info == NULL) {
@@ -1543,7 +1479,7 @@ bool OSLRenderServices::environment(ustring filename,
                                     ustring *errormessage)
 {
   OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
-  OSL::TextureSystem *ts = texture_system;
+  OSL::TextureSystem *ts = m_texturesys;
   bool status = false;
 
   if (handle && handle->oiio_handle) {
@@ -1615,7 +1551,7 @@ bool OSLRenderServices::get_texture_info(OSL::ShaderGlobals *sg,
   }
 
   /* Get texture info from OpenImageIO. */
-  OSL::TextureSystem *ts = texture_system;
+  OSL::TextureSystem *ts = m_texturesys;
   return ts->get_texture_info(filename, subimage, dataname, datatype, data);
 }
 
@@ -1669,9 +1605,10 @@ bool OSLRenderServices::trace(TraceOpt &options,
   /* setup ray */
   Ray ray;
 
-  ray.P = TO_FLOAT3(P);
-  ray.D = TO_FLOAT3(R);
-  ray.t = (options.maxdist == 1.0e30f) ? FLT_MAX : options.maxdist - options.mindist;
+  ray.P = make_float3(P.x, P.y, P.z);
+  ray.D = make_float3(R.x, R.y, R.z);
+  ray.tmin = 0.0f;
+  ray.tmax = (options.maxdist == 1.0e30f) ? FLT_MAX : options.maxdist - options.mindist;
   ray.time = sd->time;
   ray.self.object = OBJECT_NONE;
   ray.self.prim = PRIM_NONE;
@@ -1692,12 +1629,12 @@ bool OSLRenderServices::trace(TraceOpt &options,
 
   /* ray differentials */
   differential3 dP;
-  dP.dx = TO_FLOAT3(dPdx);
-  dP.dy = TO_FLOAT3(dPdy);
+  dP.dx = make_float3(dPdx.x, dPdx.y, dPdx.z);
+  dP.dy = make_float3(dPdy.x, dPdy.y, dPdy.z);
   ray.dP = differential_make_compact(dP);
   differential3 dD;
-  dD.dx = TO_FLOAT3(dRdx);
-  dD.dy = TO_FLOAT3(dRdy);
+  dD.dx = make_float3(dRdx.x, dRdx.y, dRdx.z);
+  dD.dy = make_float3(dRdy.x, dRdy.y, dRdy.z);
   ray.dD = differential_make_compact(dD);
 
   /* allocate trace data */
@@ -1710,12 +1647,12 @@ bool OSLRenderServices::trace(TraceOpt &options,
 
   const KernelGlobalsCPU *kg = sd->osl_globals;
 
-  /* Can't raytrace from shaders like displacement, before BVH exists. */
+  /* Can't ray-trace from shaders like displacement, before BVH exists. */
   if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
     return false;
   }
 
-  /* Raytrace, leaving out shadow opaque to avoid early exit. */
+  /* Ray-trace, leaving out shadow opaque to avoid early exit. */
   uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE;
   tracedata->hit = scene_intersect(kg, &ray, visibility, &tracedata->isect);
   return tracedata->hit;
@@ -1756,11 +1693,13 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
           return set_attribute_float3(sd->Ng, type, derivatives, val);
         }
         else if (name == u_P) {
-          float3 f[3] = {sd->P, sd->dP.dx, sd->dP.dy};
+          const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+          float3 f[3] = {sd->P, dP.dx, dP.dy};
           return set_attribute_float3(f, type, derivatives, val);
         }
         else if (name == u_I) {
-          float3 f[3] = {sd->I, sd->dI.dx, sd->dI.dy};
+          const differential3 dI = differential_from_compact(sd->I, sd->dI);
+          float3 f[3] = {sd->I, dI.dx, dI.dy};
           return set_attribute_float3(f, type, derivatives, val);
         }
         else if (name == u_u) {
diff --git a/intern/cycles/kernel/osl/services.h b/intern/cycles/kernel/osl/services.h
index edffd912bad..334b6682e34 100644
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -76,6 +76,8 @@ class OSLRenderServices : public OSL::RendererServices {
   OSLRenderServices(OSL::TextureSystem *texture_system);
   ~OSLRenderServices();
 
+  static void register_closures(OSL::ShadingSystem *ss);
+
   bool get_matrix(OSL::ShaderGlobals *sg,
                   OSL::Matrix44 &result,
                   OSL::TransformationPtr xform,
@@ -321,7 +323,6 @@ class OSLRenderServices : public OSL::RendererServices {
    * globals to be shared between different render sessions. This saves memory,
    * and is required because texture handles are cached as part of the shared
    * shading system. */
-  OSL::TextureSystem *texture_system;
   OSLTextureHandleMap textures;
 };
 
diff --git a/intern/cycles/kernel/osl/shader.cpp b/intern/cycles/kernel/osl/shader.cpp
deleted file mode 100644
index af96c0070e3..00000000000
--- a/intern/cycles/kernel/osl/shader.cpp
+++ /dev/null
@@ -1,417 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#include <OSL/oslexec.h>
-
-// clang-format off
-#include "kernel/device/cpu/compat.h"
-#include "kernel/device/cpu/globals.h"
-
-#include "kernel/types.h"
-
-#include "kernel/geom/object.h"
-
-#include "kernel/integrator/state.h"
-
-#include "kernel/osl/closures.h"
-#include "kernel/osl/globals.h"
-#include "kernel/osl/services.h"
-#include "kernel/osl/shader.h"
-// clang-format on
-
-#include "scene/attribute.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Threads */
-
-void OSLShader::thread_init(KernelGlobalsCPU *kg, OSLGlobals *osl_globals)
-{
-  /* no osl used? */
-  if (!osl_globals->use) {
-    kg->osl = NULL;
-    return;
-  }
-
-  /* Per thread kernel data init. */
-  kg->osl = osl_globals;
-
-  OSL::ShadingSystem *ss = kg->osl->ss;
-  OSLThreadData *tdata = new OSLThreadData();
-
-  memset((void *)&tdata->globals, 0, sizeof(OSL::ShaderGlobals));
-  tdata->globals.tracedata = &tdata->tracedata;
-  tdata->globals.flipHandedness = false;
-  tdata->osl_thread_info = ss->create_thread_info();
-  tdata->context = ss->get_context(tdata->osl_thread_info);
-
-  tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
-
-  kg->osl_ss = (OSLShadingSystem *)ss;
-  kg->osl_tdata = tdata;
-}
-
-void OSLShader::thread_free(KernelGlobalsCPU *kg)
-{
-  if (!kg->osl)
-    return;
-
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSLThreadData *tdata = kg->osl_tdata;
-  ss->release_context(tdata->context);
-
-  ss->destroy_thread_info(tdata->osl_thread_info);
-
-  delete tdata;
-
-  kg->osl = NULL;
-  kg->osl_ss = NULL;
-  kg->osl_tdata = NULL;
-}
-
-/* Globals */
-
-static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
-                                        ShaderData *sd,
-                                        const void *state,
-                                        uint32_t path_flag,
-                                        OSLThreadData *tdata)
-{
-  OSL::ShaderGlobals *globals = &tdata->globals;
-
-  /* copy from shader data to shader globals */
-  globals->P = TO_VEC3(sd->P);
-  globals->dPdx = TO_VEC3(sd->dP.dx);
-  globals->dPdy = TO_VEC3(sd->dP.dy);
-  globals->I = TO_VEC3(sd->I);
-  globals->dIdx = TO_VEC3(sd->dI.dx);
-  globals->dIdy = TO_VEC3(sd->dI.dy);
-  globals->N = TO_VEC3(sd->N);
-  globals->Ng = TO_VEC3(sd->Ng);
-  globals->u = sd->u;
-  globals->dudx = sd->du.dx;
-  globals->dudy = sd->du.dy;
-  globals->v = sd->v;
-  globals->dvdx = sd->dv.dx;
-  globals->dvdy = sd->dv.dy;
-  globals->dPdu = TO_VEC3(sd->dPdu);
-  globals->dPdv = TO_VEC3(sd->dPdv);
-  globals->surfacearea = 1.0f;
-  globals->time = sd->time;
-
-  /* booleans */
-  globals->raytype = path_flag;
-  globals->backfacing = (sd->flag & SD_BACKFACING);
-
-  /* shader data to be used in services callbacks */
-  globals->renderstate = sd;
-
-  /* hacky, we leave it to services to fetch actual object matrix */
-  globals->shader2common = sd;
-  globals->object2common = sd;
-
-  /* must be set to NULL before execute */
-  globals->Ci = NULL;
-
-  /* clear trace data */
-  tdata->tracedata.init = false;
-
-  /* Used by render-services. */
-  sd->osl_globals = kg;
-  if (path_flag & PATH_RAY_SHADOW) {
-    sd->osl_path_state = nullptr;
-    sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
-  }
-  else {
-    sd->osl_path_state = (const IntegratorStateCPU *)state;
-    sd->osl_shadow_path_state = nullptr;
-  }
-}
-
-/* Surface */
-
-static void flatten_surface_closure_tree(ShaderData *sd,
-                                         uint32_t path_flag,
-                                         const OSL::ClosureColor *closure,
-                                         float3 weight = make_float3(1.0f, 1.0f, 1.0f))
-{
-  /* OSL gives us a closure tree, we flatten it into arrays per
-   * closure type, for evaluation, sampling, etc later on. */
-
-  switch (closure->id) {
-    case OSL::ClosureColor::MUL: {
-      OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-      flatten_surface_closure_tree(sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
-      break;
-    }
-    case OSL::ClosureColor::ADD: {
-      OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-      flatten_surface_closure_tree(sd, path_flag, add->closureA, weight);
-      flatten_surface_closure_tree(sd, path_flag, add->closureB, weight);
-      break;
-    }
-    default: {
-      OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-      CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
-
-      if (prim) {
-#ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-        weight = weight * TO_FLOAT3(comp->w);
-#endif
-        prim->setup(sd, path_flag, weight);
-      }
-      break;
-    }
-  }
-}
-
-void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
-                             const void *state,
-                             ShaderData *sd,
-                             uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
-
-  /* execute shader for this point */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
-  int shader = sd->shader & SHADER_MASK;
-
-  /* automatic bump shader */
-  if (kg->osl->bump_state[shader]) {
-    /* save state */
-    float3 P = sd->P;
-    float3 dPdx = sd->dP.dx;
-    float3 dPdy = sd->dP.dy;
-
-    /* set state as if undisplaced */
-    if (sd->flag & SD_HAS_DISPLACEMENT) {
-      float data[9];
-      bool found = kg->osl->services->get_attribute(sd,
-                                                    true,
-                                                    OSLRenderServices::u_empty,
-                                                    TypeDesc::TypeVector,
-                                                    OSLRenderServices::u_geom_undisplaced,
-                                                    data);
-      (void)found;
-      assert(found);
-
-      memcpy(&sd->P, data, sizeof(float) * 3);
-      memcpy(&sd->dP.dx, data + 3, sizeof(float) * 3);
-      memcpy(&sd->dP.dy, data + 6, sizeof(float) * 3);
-
-      object_position_transform(kg, sd, &sd->P);
-      object_dir_transform(kg, sd, &sd->dP.dx);
-      object_dir_transform(kg, sd, &sd->dP.dy);
-
-      globals->P = TO_VEC3(sd->P);
-      globals->dPdx = TO_VEC3(sd->dP.dx);
-      globals->dPdy = TO_VEC3(sd->dP.dy);
-    }
-
-    /* execute bump shader */
-    ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
-
-    /* reset state */
-    sd->P = P;
-    sd->dP.dx = dPdx;
-    sd->dP.dy = dPdy;
-
-    globals->P = TO_VEC3(P);
-    globals->dPdx = TO_VEC3(dPdx);
-    globals->dPdy = TO_VEC3(dPdy);
-  }
-
-  /* surface shader */
-  if (kg->osl->surface_state[shader]) {
-    ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
-  }
-
-  /* flatten closure tree */
-  if (globals->Ci)
-    flatten_surface_closure_tree(sd, path_flag, globals->Ci);
-}
-
-/* Background */
-
-static void flatten_background_closure_tree(ShaderData *sd,
-                                            const OSL::ClosureColor *closure,
-                                            float3 weight = make_float3(1.0f, 1.0f, 1.0f))
-{
-  /* OSL gives us a closure tree, if we are shading for background there
-   * is only one supported closure type at the moment, which has no evaluation
-   * functions, so we just sum the weights */
-
-  switch (closure->id) {
-    case OSL::ClosureColor::MUL: {
-      OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-      flatten_background_closure_tree(sd, mul->closure, weight * TO_FLOAT3(mul->weight));
-      break;
-    }
-    case OSL::ClosureColor::ADD: {
-      OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-
-      flatten_background_closure_tree(sd, add->closureA, weight);
-      flatten_background_closure_tree(sd, add->closureB, weight);
-      break;
-    }
-    default: {
-      OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-      CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
-
-      if (prim) {
-#ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-        weight = weight * TO_FLOAT3(comp->w);
-#endif
-        prim->setup(sd, 0, weight);
-      }
-      break;
-    }
-  }
-}
-
-void OSLShader::eval_background(const KernelGlobalsCPU *kg,
-                                const void *state,
-                                ShaderData *sd,
-                                uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
-
-  /* execute shader for this point */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
-
-  if (kg->osl->background_state) {
-    ss->execute(octx, *(kg->osl->background_state), *globals);
-  }
-
-  /* return background color immediately */
-  if (globals->Ci)
-    flatten_background_closure_tree(sd, globals->Ci);
-}
-
-/* Volume */
-
-static void flatten_volume_closure_tree(ShaderData *sd,
-                                        const OSL::ClosureColor *closure,
-                                        float3 weight = make_float3(1.0f, 1.0f, 1.0f))
-{
-  /* OSL gives us a closure tree, we flatten it into arrays per
-   * closure type, for evaluation, sampling, etc later on. */
-
-  switch (closure->id) {
-    case OSL::ClosureColor::MUL: {
-      OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-      flatten_volume_closure_tree(sd, mul->closure, TO_FLOAT3(mul->weight) * weight);
-      break;
-    }
-    case OSL::ClosureColor::ADD: {
-      OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-      flatten_volume_closure_tree(sd, add->closureA, weight);
-      flatten_volume_closure_tree(sd, add->closureB, weight);
-      break;
-    }
-    default: {
-      OSL::ClosureComponent *comp = (OSL::ClosureComponent *)closure;
-      CClosurePrimitive *prim = (CClosurePrimitive *)comp->data();
-
-      if (prim) {
-#ifdef OSL_SUPPORTS_WEIGHTED_CLOSURE_COMPONENTS
-        weight = weight * TO_FLOAT3(comp->w);
-#endif
-        prim->setup(sd, 0, weight);
-      }
-    }
-  }
-}
-
-void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
-                            const void *state,
-                            ShaderData *sd,
-                            uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
-
-  /* execute shader */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
-  int shader = sd->shader & SHADER_MASK;
-
-  if (kg->osl->volume_state[shader]) {
-    ss->execute(octx, *(kg->osl->volume_state[shader]), *globals);
-  }
-
-  /* flatten closure tree */
-  if (globals->Ci)
-    flatten_volume_closure_tree(sd, globals->Ci);
-}
-
-/* Displacement */
-
-void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-
-  shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
-
-  /* execute shader */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
-  int shader = sd->shader & SHADER_MASK;
-
-  if (kg->osl->displacement_state[shader]) {
-    ss->execute(octx, *(kg->osl->displacement_state[shader]), *globals);
-  }
-
-  /* get back position */
-  sd->P = TO_FLOAT3(globals->P);
-}
-
-/* Attributes */
-
-int OSLShader::find_attribute(const KernelGlobalsCPU *kg,
-                              const ShaderData *sd,
-                              uint id,
-                              AttributeDescriptor *desc)
-{
-  /* for OSL, a hash map is used to lookup the attribute by name. */
-  int object = sd->object * ATTR_PRIM_TYPES;
-
-  OSLGlobals::AttributeMap &attr_map = kg->osl->attribute_map[object];
-  ustring stdname(std::string("geom:") +
-                  std::string(Attribute::standard_name((AttributeStandard)id)));
-  OSLGlobals::AttributeMap::const_iterator it = attr_map.find(stdname);
-
-  if (it != attr_map.end()) {
-    const OSLGlobals::Attribute &osl_attr = it->second;
-    *desc = osl_attr.desc;
-
-    if (sd->prim == PRIM_NONE && (AttributeElement)osl_attr.desc.element != ATTR_ELEMENT_MESH) {
-      desc->offset = ATTR_STD_NOT_FOUND;
-      return ATTR_STD_NOT_FOUND;
-    }
-
-    /* return result */
-    if (osl_attr.desc.element == ATTR_ELEMENT_NONE) {
-      desc->offset = ATTR_STD_NOT_FOUND;
-    }
-    return desc->offset;
-  }
-  else {
-    desc->offset = ATTR_STD_NOT_FOUND;
-    return (int)ATTR_STD_NOT_FOUND;
-  }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/osl/shaders/CMakeLists.txt b/intern/cycles/kernel/osl/shaders/CMakeLists.txt
index 741bce7c399..c79af3f6112 100644
--- a/intern/cycles/kernel/osl/shaders/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/shaders/CMakeLists.txt
@@ -57,6 +57,10 @@ set(SRC_OSL
   node_math.osl
   node_mix.osl
   node_mix_closure.osl
+  node_mix_color.osl
+  node_mix_float.osl
+  node_mix_vector.osl
+  node_mix_vector_non_uniform.osl
   node_musgrave_texture.osl
   node_noise_texture.osl
   node_normal.osl
@@ -109,6 +113,7 @@ file(GLOB SRC_OSL_HEADER_DIST ${OSL_SHADER_DIR}/*.h)
 
 set(SRC_OSL_HEADERS
   node_color.h
+  node_color_blend.h
   node_fresnel.h
   node_hash.h
   node_math.h
diff --git a/intern/cycles/kernel/osl/shaders/node_color_blend.h b/intern/cycles/kernel/osl/shaders/node_color_blend.h
new file mode 100644
index 00000000000..ab4b4809a97
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_color_blend.h
@@ -0,0 +1,264 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+color node_mix_blend(float t, color col1, color col2)
+{
+  return mix(col1, col2, t); /* Interpolates from col1 (t = 0) to col2 (t = 1). */
+}
+
+color node_mix_add(float t, color col1, color col2)
+{
+  return mix(col1, col1 + col2, t); /* Interpolates from col1 to the sum col1 + col2. */
+}
+
+color node_mix_mul(float t, color col1, color col2)
+{
+  return mix(col1, col1 * col2, t); /* Interpolates from col1 to the product col1 * col2. */
+}
+
+/* Screen blend: 1 - ((1 - t) + t * (1 - col2)) * (1 - col1), per channel. */
+color node_mix_screen(float t, color col1, color col2)
+{
+  color one = color(1.0);
+  return one - (color(1.0 - t) + t * (one - col2)) * (one - col1);
+}
+
+/* Overlay blend, evaluated independently for each channel.
+ *
+ * Channels of col1 below 0.5 are scaled by a factor that moves from 1 towards
+ * 2 * col2 as t goes from 0 to 1; the remaining channels get the same
+ * treatment applied to their complements (1 - value). */
+color node_mix_overlay(float t, color col1, color col2)
+{
+  float inv_t = 1.0 - t;
+  color result = col1;
+
+  for (int c = 0; c < 3; c++) {
+    float base = result[c];
+
+    if (base < 0.5) {
+      result[c] = base * (inv_t + 2.0 * t * col2[c]);
+    }
+    else {
+      result[c] = 1.0 - (inv_t + 2.0 * t * (1.0 - col2[c])) * (1.0 - base);
+    }
+  }
+
+  return result;
+}
+
+color node_mix_sub(float t, color col1, color col2)
+{
+  return mix(col1, col1 - col2, t); /* Interpolates from col1 to the difference col1 - col2. */
+}
+
+/* Divide blend: mixes each channel of col1 with col1 / col2 by factor t.
+ * A channel whose divisor in col2 is exactly zero keeps its col1 value. */
+color node_mix_div(float t, color col1, color col2)
+{
+  float inv_t = 1.0 - t;
+  color result = col1;
+
+  for (int c = 0; c < 3; c++) {
+    if (col2[c] != 0.0) {
+      result[c] = inv_t * result[c] + t * result[c] / col2[c];
+    }
+  }
+
+  return result;
+}
+
+color node_mix_diff(float t, color col1, color col2)
+{
+  return mix(col1, abs(col1 - col2), t); /* Interpolates from col1 to |col1 - col2|. */
+}
+
+color node_mix_dark(float t, color col1, color col2)
+{
+  return mix(col1, min(col1, col2), t); /* Interpolates from col1 to min(col1, col2). */
+}
+
+color node_mix_light(float t, color col1, color col2)
+{
+  return mix(col1, max(col1, col2), t); /* Interpolates from col1 to max(col1, col2). */
+}
+
+/* Color dodge blend, evaluated independently for each channel.
+ *
+ * A channel of col1 that is exactly zero is left unchanged. Otherwise the
+ * channel is divided by (1 - t * col2) and the quotient is limited to at most
+ * 1.0; a divisor that is zero or negative yields 1.0 directly. */
+color node_mix_dodge(float t, color col1, color col2)
+{
+  color result = col1;
+
+  for (int c = 0; c < 3; c++) {
+    float base = result[c];
+
+    /* Zero channels are passed through untouched. */
+    if (base != 0.0) {
+      float divisor = 1.0 - t * col2[c];
+
+      if (divisor <= 0.0) {
+        /* No meaningful quotient exists, saturate the channel. */
+        result[c] = 1.0;
+      }
+      else {
+        float quotient = base / divisor;
+
+        /* Only the upper bound is enforced. */
+        if (quotient > 1.0)
+          result[c] = 1.0;
+        else
+          result[c] = quotient;
+      }
+    }
+  }
+
+  return result;
+}
+
+/* Color burn blend, evaluated independently for each channel.
+ *
+ * Each channel of col1 becomes
+ *   1 - (1 - col1) / ((1 - t) + t * col2)
+ * limited to the [0, 1] range. A divisor that is zero or negative yields
+ * 0.0 for that channel. */
+color node_mix_burn(float t, color col1, color col2)
+{
+  float inv_t = 1.0 - t;
+
+  color result = col1;
+
+  for (int c = 0; c < 3; c++) {
+    /* The divisor moves from 1 towards col2 as t goes from 0 to 1. */
+    float divisor = inv_t + t * col2[c];
+
+    if (divisor <= 0.0) {
+      /* No meaningful quotient exists, darken the channel fully. */
+      result[c] = 0.0;
+    }
+    else {
+      float burned = 1.0 - (1.0 - result[c]) / divisor;
+
+      /* Limit to [0, 1] with explicit comparisons. */
+      if (burned < 0.0) {
+        result[c] = 0.0;
+      }
+      else if (burned > 1.0) {
+        result[c] = 1.0;
+      }
+      else {
+        result[c] = burned;
+      }
+    }
+  }
+
+  return result;
+}
+
+/* Hue blend: mixes col1 by t towards a copy of itself whose HSV component 0
+ * is taken from col2. col1 is returned as is when component 1 of col2 is 0. */
+color node_mix_hue(float t, color col1, color col2)
+{
+  color target_hsv = rgb_to_hsv(col2);
+
+  /* Component 1 is presumably the saturation (usual H, S, V order). */
+  if (target_hsv[1] == 0.0)
+    return col1;
+
+  color source_hsv = rgb_to_hsv(col1);
+  source_hsv[0] = target_hsv[0];
+
+  return mix(col1, hsv_to_rgb(source_hsv), t);
+}
+
+/* Saturation blend: interpolates HSV component 1 of col1 towards that of
+ * col2 by factor t. col1 is returned as is when its own component 1 is 0. */
+color node_mix_sat(float t, color col1, color col2)
+{
+  color source_hsv = rgb_to_hsv(col1);
+
+  if (source_hsv[1] == 0.0)
+    return col1;
+
+  color target_hsv = rgb_to_hsv(col2);
+  float inv_t = 1.0 - t;
+
+  /* Only component 1 is replaced, the other two stay those of col1. */
+  source_hsv[1] = inv_t * source_hsv[1] + t * target_hsv[1];
+
+  return hsv_to_rgb(source_hsv);
+}
+
+/* Value blend: interpolates HSV component 2 of col1 towards that of col2 by
+ * factor t, keeping the other two components of col1. */
+color node_mix_val(float t, color col1, color col2)
+{
+  color source_hsv = rgb_to_hsv(col1);
+  color target_hsv = rgb_to_hsv(col2);
+
+  source_hsv[2] = (1.0 - t) * source_hsv[2] + t * target_hsv[2];
+
+  return hsv_to_rgb(source_hsv);
+}
+
+/* Color blend: mixes col1 by t towards a copy of itself whose HSV components
+ * 0 and 1 are taken from col2. col1 is returned as is when component 1 of
+ * col2 is 0. */
+color node_mix_color(float t, color col1, color col2)
+{
+  color target_hsv = rgb_to_hsv(col2);
+
+  if (target_hsv[1] == 0.0)
+    return col1;
+
+  color source_hsv = rgb_to_hsv(col1);
+  source_hsv[0] = target_hsv[0];
+  source_hsv[1] = target_hsv[1];
+
+  return mix(col1, hsv_to_rgb(source_hsv), t);
+}
+
+/* Soft light blend: weights col1 by (1 - t) and a term built from the
+ * screen of both colors and their product by t. */
+color node_mix_soft(float t, color col1, color col2)
+{
+  color white = color(1.0);
+  color screened = white - (white - col2) * (white - col1);
+  color soft = (white - col1) * col2 * col1 + col1 * screened;
+  return (1.0 - t) * col1 + t * soft;
+}
+
+/* Linear light blend, evaluated independently for each channel.
+ *
+ * Adds t times an offset derived from col2 to col1. The two branches spell
+ * that offset differently for channels of col2 above and not above 0.5. */
+color node_mix_linear(float t, color col1, color col2)
+{
+  color result = col1;
+
+  for (int c = 0; c < 3; c++) {
+    float offset;
+
+    if (col2[c] > 0.5)
+      offset = 2.0 * (col2[c] - 0.5);
+    else
+      offset = 2.0 * (col2[c]) - 1.0;
+
+    result[c] = col1[c] + t * offset;
+  }
+
+  return result;
+}
+
+/* Limit every channel of col to the [0, 1] range. */
+color node_mix_clamp(color col)
+{
+  color result = col;
+
+  for (int c = 0; c < 3; c++)
+    result[c] = clamp(col[c], 0.0, 1.0);
+
+  return result;
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_geometry.osl b/intern/cycles/kernel/osl/shaders/node_geometry.osl
index 23d4c2ee66f..cc891abd6e3 100644
--- a/intern/cycles/kernel/osl/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/osl/shaders/node_geometry.osl
@@ -20,7 +20,7 @@ shader node_geometry(normal NormalIn = N,
   Normal = NormalIn;
   TrueNormal = Ng;
   Incoming = I;
-  Parametric = point(u, v, 0.0);
+  Parametric = point(1.0 - u - v, u, 0.0);
   Backfacing = backfacing();
 
   if (bump_offset == "dx") {
diff --git a/intern/cycles/kernel/osl/shaders/node_mix_color.osl b/intern/cycles/kernel/osl/shaders/node_mix_color.osl
new file mode 100644
index 00000000000..3ddd89ed306
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_mix_color.osl
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "node_color.h"
+#include "node_color_blend.h"
+#include "stdcycles.h"
+
+shader node_mix_color(string blend_type = "mix",
+                      int use_clamp = 0,
+                      int use_clamp_result = 0,
+                      float Factor = 0.5,
+                      color A = 0.0,
+                      color B = 0.0,
+                      output color Result = 0.0)
+{
+  float fac = (use_clamp) ? clamp(Factor, 0.0, 1.0) : Factor;
+  /* At most one name matches; an unknown blend_type leaves Result at its default. */
+  if (blend_type == "mix")
+    Result = mix(A, B, fac);
+  else if (blend_type == "add")
+    Result = node_mix_add(fac, A, B);
+  else if (blend_type == "multiply")
+    Result = node_mix_mul(fac, A, B);
+  else if (blend_type == "screen")
+    Result = node_mix_screen(fac, A, B);
+  else if (blend_type == "overlay")
+    Result = node_mix_overlay(fac, A, B);
+  else if (blend_type == "subtract")
+    Result = node_mix_sub(fac, A, B);
+  else if (blend_type == "divide")
+    Result = node_mix_div(fac, A, B);
+  else if (blend_type == "difference")
+    Result = node_mix_diff(fac, A, B);
+  else if (blend_type == "darken")
+    Result = node_mix_dark(fac, A, B);
+  else if (blend_type == "lighten")
+    Result = node_mix_light(fac, A, B);
+  else if (blend_type == "dodge")
+    Result = node_mix_dodge(fac, A, B);
+  else if (blend_type == "burn")
+    Result = node_mix_burn(fac, A, B);
+  else if (blend_type == "hue")
+    Result = node_mix_hue(fac, A, B);
+  else if (blend_type == "saturation")
+    Result = node_mix_sat(fac, A, B);
+  else if (blend_type == "value")
+    Result = node_mix_val(fac, A, B);
+  else if (blend_type == "color")
+    Result = node_mix_color(fac, A, B);
+  else if (blend_type == "soft_light")
+    Result = node_mix_soft(fac, A, B);
+  else if (blend_type == "linear_light")
+    Result = node_mix_linear(fac, A, B);
+
+  if (use_clamp_result)
+    Result = clamp(Result, 0.0, 1.0);
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_mix_float.osl b/intern/cycles/kernel/osl/shaders/node_mix_float.osl
new file mode 100644
index 00000000000..fdc7b4eff6e
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_mix_float.osl
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "stdcycles.h"
+
+shader node_mix_float(
+    int use_clamp = 0, float Factor = 0.5, float A = 0.0, float B = 0.0, output float Result = 0.0)
+{
+  /* Blend A towards B; Factor is limited to [0, 1] only when use_clamp is set. */
+  Result = mix(A, B, (use_clamp) ? clamp(Factor, 0.0, 1.0) : Factor);
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_mix_vector.osl b/intern/cycles/kernel/osl/shaders/node_mix_vector.osl
new file mode 100644
index 00000000000..d76396afb0d
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_mix_vector.osl
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "stdcycles.h"
+
+shader node_mix_vector(int use_clamp = 0,
+                       float Factor = 0.5,
+                       vector A = 0.0,
+                       vector B = 0.0,
+                       output vector Result = 0.0)
+{
+  /* One scalar factor for all components, limited to [0, 1] when use_clamp is set. */
+  Result = mix(A, B, (use_clamp) ? clamp(Factor, 0.0, 1.0) : Factor);
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_mix_vector_non_uniform.osl b/intern/cycles/kernel/osl/shaders/node_mix_vector_non_uniform.osl
new file mode 100644
index 00000000000..217856bcf2a
--- /dev/null
+++ b/intern/cycles/kernel/osl/shaders/node_mix_vector_non_uniform.osl
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "stdcycles.h"
+
+shader node_mix_vector_non_uniform(int use_clamp = 0,
+                                   vector Factor = 0.5,
+                                   vector A = 0.0,
+                                   vector B = 0.0,
+                                   output vector Result = 0.0)
+{
+  /* Factor is a vector here; it is limited to [0, 1] only when use_clamp is set. */
+  Result = mix(A, B, (use_clamp) ? clamp(Factor, 0.0, 1.0) : Factor);
+}
diff --git a/intern/cycles/kernel/osl/shaders/node_musgrave_texture.osl b/intern/cycles/kernel/osl/shaders/node_musgrave_texture.osl
index 391be8c14d7..fdda1ba9cd1 100644
--- a/intern/cycles/kernel/osl/shaders/node_musgrave_texture.osl
+++ b/intern/cycles/kernel/osl/shaders/node_musgrave_texture.osl
@@ -114,13 +114,12 @@ float noise_musgrave_hybrid_multi_fractal_1d(
 {
   float p = co;
   float pwHL = pow(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = safe_snoise(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0;
+  float value = 0.0;
+  float weight = 1.0;
 
-  for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+  for (int i = 0; (weight > 0.001) && (i < (int)octaves); i++) {
     if (weight > 1.0) {
       weight = 1.0;
     }
@@ -133,8 +132,12 @@ float noise_musgrave_hybrid_multi_fractal_1d(
   }
 
   float rmd = octaves - floor(octaves);
-  if (rmd != 0.0) {
-    value += rmd * ((safe_snoise(p) + offset) * pwr);
+  if ((rmd != 0.0) && (weight > 0.001)) {
+    if (weight > 1.0) {
+      weight = 1.0;
+    }
+    float signal = (safe_snoise(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
@@ -279,13 +282,12 @@ float noise_musgrave_hybrid_multi_fractal_2d(
 {
   vector2 p = co;
   float pwHL = pow(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = safe_snoise(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0;
+  float value = 0.0;
+  float weight = 1.0;
 
-  for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+  for (int i = 0; (weight > 0.001) && (i < (int)octaves); i++) {
     if (weight > 1.0) {
       weight = 1.0;
     }
@@ -298,8 +300,12 @@ float noise_musgrave_hybrid_multi_fractal_2d(
   }
 
   float rmd = octaves - floor(octaves);
-  if (rmd != 0.0) {
-    value += rmd * ((safe_snoise(p) + offset) * pwr);
+  if ((rmd != 0.0) && (weight > 0.001)) {
+    if (weight > 1.0) {
+      weight = 1.0;
+    }
+    float signal = (safe_snoise(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
@@ -444,13 +450,12 @@ float noise_musgrave_hybrid_multi_fractal_3d(
 {
   vector3 p = co;
   float pwHL = pow(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = safe_snoise(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0;
+  float value = 0.0;
+  float weight = 1.0;
 
-  for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+  for (int i = 0; (weight > 0.001) && (i < (int)octaves); i++) {
     if (weight > 1.0) {
       weight = 1.0;
     }
@@ -463,8 +468,12 @@ float noise_musgrave_hybrid_multi_fractal_3d(
   }
 
   float rmd = octaves - floor(octaves);
-  if (rmd != 0.0) {
-    value += rmd * ((safe_snoise(p) + offset) * pwr);
+  if ((rmd != 0.0) && (weight > 0.001)) {
+    if (weight > 1.0) {
+      weight = 1.0;
+    }
+    float signal = (safe_snoise(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
@@ -609,13 +618,12 @@ float noise_musgrave_hybrid_multi_fractal_4d(
 {
   vector4 p = co;
   float pwHL = pow(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = safe_snoise(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0;
+  float value = 0.0;
+  float weight = 1.0;
 
-  for (int i = 1; (weight > 0.001) && (i < (int)octaves); i++) {
+  for (int i = 0; (weight > 0.001) && (i < (int)octaves); i++) {
     if (weight > 1.0) {
       weight = 1.0;
     }
@@ -628,8 +636,12 @@ float noise_musgrave_hybrid_multi_fractal_4d(
   }
 
   float rmd = octaves - floor(octaves);
-  if (rmd != 0.0) {
-    value += rmd * ((safe_snoise(p) + offset) * pwr);
+  if ((rmd != 0.0) && (weight > 0.001)) {
+    if (weight > 1.0) {
+      weight = 1.0;
+    }
+    float signal = (safe_snoise(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
diff --git a/intern/cycles/kernel/osl/types.h b/intern/cycles/kernel/osl/types.h
new file mode 100644
index 00000000000..46e06114360
--- /dev/null
+++ b/intern/cycles/kernel/osl/types.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Closure type IDs. */
+
+enum ClosureTypeOSL {
+  OSL_CLOSURE_MUL_ID = -1,
+  OSL_CLOSURE_ADD_ID = -2,
+
+  OSL_CLOSURE_NONE_ID = 0,
+  /* Every OSL_CLOSURE_STRUCT_BEGIN() expanded from the template adds an OSL_CLOSURE_<Upper>_ID. */
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) OSL_CLOSURE_##Upper##_ID,
+#include "closures_template.h"
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/sample/jitter.h b/intern/cycles/kernel/sample/jitter.h
index b8da94248a4..e748f95fc7d 100644
--- a/intern/cycles/kernel/sample/jitter.h
+++ b/intern/cycles/kernel/sample/jitter.h
@@ -1,182 +1,97 @@
 /* SPDX-License-Identifier: Apache-2.0
  * Copyright 2011-2022 Blender Foundation */
 
 #pragma once
+#include "kernel/sample/util.h"
+#include "util/hash.h"
+
 CCL_NAMESPACE_BEGIN
 
-ccl_device_inline uint32_t laine_karras_permutation(uint32_t x, uint32_t seed)
+ccl_device float pmj_sample_1D(KernelGlobals kg,
+                               uint sample,
+                               const uint rng_hash,
+                               const uint dimension)
 {
-  x += seed;
-  x ^= (x * 0x6c50b47cu);
-  x ^= x * 0xb82f1e52u;
-  x ^= x * 0xc7afe638u;
-  x ^= x * 0x8d22f6e6u;
+  uint seed = rng_hash;
 
-  return x;
-}
+  /* Use the same sample sequence seed for all pixels when using
+   * scrambling distance. */
+  if (kernel_data.integrator.scrambling_distance < 1.0f) {
+    seed = kernel_data.integrator.seed;
+  }
 
-ccl_device_inline uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed)
-{
-  x = reverse_integer_bits(x);
-  x = laine_karras_permutation(x, seed);
-  x = reverse_integer_bits(x);
+  /* Shuffle the pattern order and sample index to better decorrelate
+   * dimensions and make the most of the finite patterns we have.
+   * The funky sample mask stuff is to ensure that we only shuffle
+   * *within* the current sample pattern, which is necessary to avoid
+   * early repeat pattern use. */
+  const uint pattern_i = hash_shuffle_uint(dimension, NUM_PMJ_PATTERNS, seed);
+  /* NUM_PMJ_SAMPLES should be a power of two, so this results in a mask. */
+  const uint sample_mask = NUM_PMJ_SAMPLES - 1;
+  const uint sample_shuffled = nested_uniform_scramble(sample,
+                                                       hash_wang_seeded_uint(dimension, seed));
+  sample = (sample & ~sample_mask) | (sample_shuffled & sample_mask);
+
+  /* Fetch the sample. */
+  const uint index = ((pattern_i * NUM_PMJ_SAMPLES) + sample) %
+                     (NUM_PMJ_SAMPLES * NUM_PMJ_PATTERNS);
+  float x = kernel_data_fetch(sample_pattern_lut, index * 2);
+
+  /* Do limited Cranley-Patterson rotation when using scrambling distance. */
+  if (kernel_data.integrator.scrambling_distance < 1.0f) {
+    const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) *
+                           kernel_data.integrator.scrambling_distance;
+    x += jitter_x;
+    x -= floorf(x);
+  }
 
   return x;
 }
 
-ccl_device_inline uint cmj_hash(uint i, uint p)
+ccl_device float2 pmj_sample_2D(KernelGlobals kg,
+                                uint sample,
+                                const uint rng_hash,
+                                const uint dimension)
 {
-  i ^= p;
-  i ^= i >> 17;
-  i ^= i >> 10;
-  i *= 0xb36534e5;
-  i ^= i >> 12;
-  i ^= i >> 21;
-  i *= 0x93fc4795;
-  i ^= 0xdf6e307f;
-  i ^= i >> 17;
-  i *= 1 | p >> 18;
-
-  return i;
-}
-
-ccl_device_inline uint cmj_hash_simple(uint i, uint p)
-{
-  i = (i ^ 61) ^ p;
-  i += i << 3;
-  i ^= i >> 4;
-  i *= 0x27d4eb2d;
-  return i;
-}
-
-ccl_device_inline float cmj_randfloat(uint i, uint p)
-{
-  return cmj_hash(i, p) * (1.0f / 4294967808.0f);
-}
-
-ccl_device_inline float cmj_randfloat_simple(uint i, uint p)
-{
-  return cmj_hash_simple(i, p) * (1.0f / (float)0xFFFFFFFF);
-}
+  uint seed = rng_hash;
 
-ccl_device_inline float cmj_randfloat_simple_dist(uint i, uint p, float d)
-{
-  return cmj_hash_simple(i, p) * (d / (float)0xFFFFFFFF);
-}
-
-ccl_device float pmj_sample_1D(KernelGlobals kg, uint sample, uint rng_hash, uint dimension)
-{
-  uint hash = rng_hash;
-  float jitter_x = 0.0f;
+  /* Use the same sample sequence seed for all pixels when using
+   * scrambling distance. */
   if (kernel_data.integrator.scrambling_distance < 1.0f) {
-    hash = kernel_data.integrator.seed;
-
-    jitter_x = cmj_randfloat_simple_dist(
-        dimension, rng_hash, kernel_data.integrator.scrambling_distance);
+    seed = kernel_data.integrator.seed;
   }
 
-  /* Perform Owen shuffle of the sample number to reorder the samples. */
-#ifdef _SIMPLE_HASH_
-  const uint rv = cmj_hash_simple(dimension, hash);
-#else /* Use a _REGULAR_HASH_. */
-  const uint rv = cmj_hash(dimension, hash);
-#endif
-#ifdef _XOR_SHUFFLE_
-#  warning "Using XOR shuffle."
-  const uint s = sample ^ rv;
-#else /* Use _OWEN_SHUFFLE_ for reordering. */
-  const uint s = nested_uniform_scramble(sample, rv);
-#endif
-
-  /* Based on the sample number a sample pattern is selected and offset by the dimension. */
-  const uint sample_set = s / NUM_PMJ_SAMPLES;
-  const uint d = (dimension + sample_set);
-  const uint dim = d % NUM_PMJ_PATTERNS;
-
-  /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D
-   *  the x part is used for even dims and the y for odd. */
-  int index = 2 * ((dim >> 1) * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)) + (dim & 1);
-
-  float fx = kernel_tex_fetch(__sample_pattern_lut, index);
-
-#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
-  /* Use Cranley-Patterson rotation to displace the sample pattern. */
-#  ifdef _SIMPLE_HASH_
-  float dx = cmj_randfloat_simple(d, hash);
-#  else
-  float dx = cmj_randfloat(d, hash);
-#  endif
-  /* Jitter sample locations and map back into [0 1]. */
-  fx = fx + dx + jitter_x;
-  fx = fx - floorf(fx);
-#else
-#  warning "Not using Cranley-Patterson Rotation."
-#endif
-
-  return fx;
-}
-
-ccl_device void pmj_sample_2D(KernelGlobals kg,
-                              uint sample,
-                              uint rng_hash,
-                              uint dimension,
-                              ccl_private float *x,
-                              ccl_private float *y)
-{
-  uint hash = rng_hash;
-  float jitter_x = 0.0f;
-  float jitter_y = 0.0f;
+  /* Shuffle the pattern order and sample index to better decorrelate
+   * dimensions and make the most of the finite patterns we have.
+   * The funky sample mask stuff is to ensure that we only shuffle
+   * *within* the current sample pattern, which is necessary to avoid
+   * early repeat pattern use. */
+  const uint pattern_i = hash_shuffle_uint(dimension, NUM_PMJ_PATTERNS, seed);
+  /* NUM_PMJ_SAMPLES should be a power of two, so this results in a mask. */
+  const uint sample_mask = NUM_PMJ_SAMPLES - 1;
+  const uint sample_shuffled = nested_uniform_scramble(sample,
+                                                       hash_wang_seeded_uint(dimension, seed));
+  sample = (sample & ~sample_mask) | (sample_shuffled & sample_mask);
+
+  /* Fetch the sample. */
+  const uint index = ((pattern_i * NUM_PMJ_SAMPLES) + sample) %
+                     (NUM_PMJ_SAMPLES * NUM_PMJ_PATTERNS);
+  float x = kernel_data_fetch(sample_pattern_lut, index * 2);
+  float y = kernel_data_fetch(sample_pattern_lut, index * 2 + 1);
+
+  /* Do limited Cranley-Patterson rotation when using scrambling distance. */
   if (kernel_data.integrator.scrambling_distance < 1.0f) {
-    hash = kernel_data.integrator.seed;
-
-    jitter_x = cmj_randfloat_simple_dist(
-        dimension, rng_hash, kernel_data.integrator.scrambling_distance);
-    jitter_y = cmj_randfloat_simple_dist(
-        dimension + 1, rng_hash, kernel_data.integrator.scrambling_distance);
+    const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) *
+                           kernel_data.integrator.scrambling_distance;
+    const float jitter_y = hash_wang_seeded_float(dimension, rng_hash ^ 0xca0e1151) *
+                           kernel_data.integrator.scrambling_distance;
+    x += jitter_x;
+    y += jitter_y;
+    x -= floorf(x);
+    y -= floorf(y);
   }
 
-  /* Perform a shuffle on the sample number to reorder the samples. */
-#ifdef _SIMPLE_HASH_
-  const uint rv = cmj_hash_simple(dimension, hash);
-#else /* Use a _REGULAR_HASH_. */
-  const uint rv = cmj_hash(dimension, hash);
-#endif
-#ifdef _XOR_SHUFFLE_
-#  warning "Using XOR shuffle."
-  const uint s = sample ^ rv;
-#else /* Use _OWEN_SHUFFLE_ for reordering. */
-  const uint s = nested_uniform_scramble(sample, rv);
-#endif
-
-  /* Based on the sample number a sample pattern is selected and offset by the dimension. */
-  const uint sample_set = s / NUM_PMJ_SAMPLES;
-  const uint d = dimension + sample_set;
-  uint dim = d % NUM_PMJ_PATTERNS;
-  int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
-
-  float fx = kernel_tex_fetch(__sample_pattern_lut, index);
-  float fy = kernel_tex_fetch(__sample_pattern_lut, index + 1);
-
-#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
-  /* Use Cranley-Patterson rotation to displace the sample pattern. */
-#  ifdef _SIMPLE_HASH_
-  float dx = cmj_randfloat_simple(d, hash);
-  float dy = cmj_randfloat_simple(d + 1, hash);
-#  else
-  float dx = cmj_randfloat(d, hash);
-  float dy = cmj_randfloat(d + 1, hash);
-#  endif
-  /* Jitter sample locations and map back to the unit square [0 1]x[0 1]. */
-  float sx = fx + dx + jitter_x;
-  float sy = fy + dy + jitter_y;
-  sx = sx - floorf(sx);
-  sy = sy - floorf(sy);
-#else
-#  warning "Not using Cranley Patterson Rotation."
-#endif
-
-  (*x) = sx;
-  (*y) = sy;
+  return make_float2(x, y);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/sample/pattern.h b/intern/cycles/kernel/sample/pattern.h
index 1e66f39ede2..ebdecc1bff9 100644
--- a/intern/cycles/kernel/sample/pattern.h
+++ b/intern/cycles/kernel/sample/pattern.h
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "kernel/sample/jitter.h"
+#include "kernel/sample/sobol_burley.h"
 #include "util/hash.h"
 
 CCL_NAMESPACE_BEGIN
@@ -12,33 +13,6 @@ CCL_NAMESPACE_BEGIN
  * this single threaded on a CPU for repeatable results. */
 //#define __DEBUG_CORRELATION__
 
-/* High Dimensional Sobol.
- *
- * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal
- * to classic Van der Corput and Sobol sequences. */
-
-#ifdef __SOBOL__
-
-/* Skip initial numbers that for some dimensions have clear patterns that
- * don't cover the entire sample space. Ideally we would have a better
- * progressive pattern that doesn't suffer from this problem, because even
- * with this offset some dimensions are quite poor.
- */
-#  define SOBOL_SKIP 64
-
-ccl_device uint sobol_dimension(KernelGlobals kg, int index, int dimension)
-{
-  uint result = 0;
-  uint i = index + SOBOL_SKIP;
-  for (int j = 0, x; (x = find_first_set(i)); i >>= x) {
-    j += x;
-    result ^= __float_as_uint(kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1));
-  }
-  return result;
-}
-
-#endif /* __SOBOL__ */
-
 ccl_device_forceinline float path_rng_1D(KernelGlobals kg,
                                          uint rng_hash,
                                          int sample,
@@ -48,58 +22,29 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals kg,
   return (float)drand48();
 #endif
 
-#ifdef __SOBOL__
-  if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
-#endif
-  {
+  if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_SOBOL_BURLEY) {
+    return sobol_burley_sample_1D(sample, dimension, rng_hash);
+  }
+  else {
     return pmj_sample_1D(kg, sample, rng_hash, dimension);
   }
-
-#ifdef __SOBOL__
-  /* Sobol sequence value using direction vectors. */
-  uint result = sobol_dimension(kg, sample, dimension);
-  float r = (float)result * (1.0f / (float)0xFFFFFFFF);
-
-  /* Cranly-Patterson rotation using rng seed */
-  float shift;
-
-  /* Hash rng with dimension to solve correlation issues.
-   * See T38710, T50116.
-   */
-  uint tmp_rng = cmj_hash_simple(dimension, rng_hash);
-  shift = tmp_rng * (kernel_data.integrator.scrambling_distance / (float)0xFFFFFFFF);
-
-  return r + shift - floorf(r + shift);
-#endif
 }
 
-ccl_device_forceinline void path_rng_2D(KernelGlobals kg,
-                                        uint rng_hash,
-                                        int sample,
-                                        int dimension,
-                                        ccl_private float *fx,
-                                        ccl_private float *fy)
+ccl_device_forceinline float2 path_rng_2D(KernelGlobals kg,
+                                          uint rng_hash,
+                                          int sample,
+                                          int dimension)
 {
 #ifdef __DEBUG_CORRELATION__
-  *fx = (float)drand48();
-  *fy = (float)drand48();
-  return;
-#endif
-
-#ifdef __SOBOL__
-  if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+  return make_float2((float)drand48(), (float)drand48());
 #endif
-  {
-    pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy);
 
-    return;
+  if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_SOBOL_BURLEY) {
+    return sobol_burley_sample_2D(sample, dimension, rng_hash);
+  }
+  else {
+    return pmj_sample_2D(kg, sample, rng_hash, dimension);
   }
-
-#ifdef __SOBOL__
-  /* Sobol. */
-  *fx = path_rng_1D(kg, rng_hash, sample, dimension);
-  *fy = path_rng_1D(kg, rng_hash, sample, dimension + 1);
-#endif
 }
 
 /**
@@ -145,18 +90,33 @@ ccl_device_inline uint path_rng_hash_init(KernelGlobals kg,
   return rng_hash;
 }
 
-ccl_device_inline bool sample_is_even(int pattern, int sample)
+/**
+ * Splits samples into two different classes, A and B, which can be
+ * compared for variance estimation.
+ */
+ccl_device_inline bool sample_is_class_A(int pattern, int sample)
 {
-  if (pattern == SAMPLING_PATTERN_PMJ) {
-    /* See Section 10.2.1, "Progressive Multi-Jittered Sample Sequences", Christensen et al.
-     * We can use this to get divide sample sequence into two classes for easier variance
-     * estimation. */
-    return popcount(uint(sample) & 0xaaaaaaaa) & 1;
-  }
-  else {
-    /* TODO(Stefan): Are there reliable ways of dividing CMJ and Sobol into two classes? */
-    return sample & 0x1;
+#if 0
+  if (!(pattern == SAMPLING_PATTERN_PMJ || pattern == SAMPLING_PATTERN_SOBOL_BURLEY)) {
+    /* Fallback: assign samples randomly.
+     * This is guaranteed to work "okay" for any sampler, but isn't good.
+     * (Note: the seed constant is just a random number to guard against
+     * possible interactions with other uses of the hash. There's nothing
+     * special about it.)
+     */
+    return hash_hp_seeded_uint(sample, 0xa771f873) & 1;
   }
-}
+#else
+  (void)pattern;
+#endif
 
+  /* This follows the approach from section 10.2.1 of "Progressive
+   * Multi-Jittered Sample Sequences" by Christensen et al., but
+   * implemented with efficient bit-fiddling.
+   *
+   * This approach also turns out to work equally well with Sobol-Burley
+   * (see https://developer.blender.org/D15746#429471).
+   */
+  return popcount(uint(sample) & 0xaaaaaaaa) & 1;
+}
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/sample/sobol_burley.h b/intern/cycles/kernel/sample/sobol_burley.h
new file mode 100644
index 00000000000..47796ae7998
--- /dev/null
+++ b/intern/cycles/kernel/sample/sobol_burley.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+/*
+ * A shuffled, Owen-scrambled Sobol sampler, implemented with the
+ * techniques from the paper "Practical Hash-based Owen Scrambling"
+ * by Brent Burley, 2020, Journal of Computer Graphics Techniques.
+ *
+ * Note that unlike a standard high-dimensional Sobol sequence, this
+ * Sobol sampler uses padding to achieve higher dimensions, as described
+ * in Burley's paper.
+ */
+
+#pragma once
+
+#include "kernel/sample/util.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/*
+ * Computes a single dimension of a sample from an Owen-scrambled
+ * Sobol sequence.  This is used in the main sampling functions,
+ * sobol_burley_sample_#D(), below.
+ *
+ * - rev_bit_index: the sample index, with reversed order bits.
+ * - dimension:     the sample dimension.
+ * - scramble_seed: the Owen scrambling seed.
+ *
+ * Note that the seed must be well randomized before being
+ * passed to this function.
+ */
+ccl_device_forceinline float sobol_burley(uint rev_bit_index,
+                                          const uint dimension,
+                                          const uint scramble_seed)
+{
+  uint result = 0;
+
+  if (dimension == 0) {
+    /* Fast-path for dimension 0, which is just van der Corput.
+     * This makes a notable difference in performance since we reuse
+     * dimensions for padding, and dimension 0 is reused the most. */
+    result = reverse_integer_bits(rev_bit_index);
+  }
+  else {
+    uint i = 0;
+    while (rev_bit_index != 0) {
+      uint j = count_leading_zeros(rev_bit_index);
+      result ^= sobol_burley_table[dimension][i + j];
+      i += j + 1;
+
+      /* Shift in two steps: a single "<<= j + 1" can overflow the shift
+       * operator, which doesn't do what we need on at least x86. */
+      rev_bit_index <<= j;
+      rev_bit_index <<= 1;
+    }
+  }
+
+  /* Apply Owen scrambling. */
+  result = reverse_integer_bits(reversed_bit_owen(result, scramble_seed));
+
+  return uint_to_float_excl(result);
+}
+
+/*
+ * Computes a 1D Owen-scrambled and shuffled Sobol sample.
+ */
+ccl_device float sobol_burley_sample_1D(uint index, const uint dimension, uint seed)
+{
+  /* Fold the dimension into the seed: only Sobol dimension 0 is evaluated
+   * below, so dimensions are decorrelated purely via the seed. */
+  seed ^= hash_hp_uint(dimension);
+
+  /* Shuffle the sample index. */
+  index = reversed_bit_owen(reverse_integer_bits(index), seed ^ 0xbff95bfe);
+
+  return sobol_burley(index, 0, seed ^ 0x635c77bd);
+}
+
+/*
+ * Computes a 2D Owen-scrambled and shuffled Sobol sample.
+ */
+ccl_device float2 sobol_burley_sample_2D(uint index, const uint dimension_set, uint seed)
+{
+  /* Mix the dimension set into the seed so that shuffling gives each
+   * dimension set its own decorrelated sequence. */
+  seed ^= hash_hp_uint(dimension_set);
+
+  /* Shuffle the sample index. */
+  const uint shuffled = reversed_bit_owen(reverse_integer_bits(index), seed ^ 0xf8ade99a);
+
+  return make_float2(sobol_burley(shuffled, 0, seed ^ 0xe0aaaf76),
+                     sobol_burley(shuffled, 1, seed ^ 0x94964d4e));
+}
+
+/*
+ * Computes a 3D Owen-scrambled and shuffled Sobol sample.
+ */
+ccl_device float3 sobol_burley_sample_3D(uint index, const uint dimension_set, uint seed)
+{
+  /* Mix the dimension set into the seed so that shuffling gives each
+   * dimension set its own decorrelated sequence. */
+  seed ^= hash_hp_uint(dimension_set);
+
+  /* Shuffle the sample index. */
+  const uint shuffled = reversed_bit_owen(reverse_integer_bits(index), seed ^ 0xcaa726ac);
+
+  return make_float3(sobol_burley(shuffled, 0, seed ^ 0x9e78e391),
+                     sobol_burley(shuffled, 1, seed ^ 0x67c33241),
+                     sobol_burley(shuffled, 2, seed ^ 0x78c395c5));
+}
+
+/*
+ * Computes a 4D Owen-scrambled and shuffled Sobol sample.
+ */
+ccl_device float4 sobol_burley_sample_4D(uint index, const uint dimension_set, uint seed)
+{
+  /* Mix the dimension set into the seed so that shuffling gives each
+   * dimension set its own decorrelated sequence. */
+  seed ^= hash_hp_uint(dimension_set);
+
+  /* Shuffle the sample index. */
+  const uint shuffled = reversed_bit_owen(reverse_integer_bits(index), seed ^ 0xc2c1a055);
+
+  return make_float4(sobol_burley(shuffled, 0, seed ^ 0x39468210),
+                     sobol_burley(shuffled, 1, seed ^ 0xe9d8a845),
+                     sobol_burley(shuffled, 2, seed ^ 0x5f32b482),
+                     sobol_burley(shuffled, 3, seed ^ 0x1524cc56));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/sample/util.h b/intern/cycles/kernel/sample/util.h
new file mode 100644
index 00000000000..29cda179aa2
--- /dev/null
+++ b/intern/cycles/kernel/sample/util.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "util/types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/*
+ * Base-2 Owen scrambling of an unsigned integer given in reversed-bit order.
+ *
+ * Equivalent to the Laine-Karras permutation, but of much higher
+ * quality.  See https://psychopath.io/post/2021_01_30_building_a_better_lk_hash
+ */
+ccl_device_inline uint reversed_bit_owen(uint n, uint seed)
+{
+  n = n ^ (n * 0x3d20adea);
+  n = n + seed;
+  n = n * ((seed >> 16) | 1);
+  n = n ^ (n * 0x05526c56);
+  n = n ^ (n * 0x53a22864);
+
+  return n;
+}
+
+/* Base-2 Owen scrambling of an unsigned integer (normal bit order in and out). */
+ccl_device_inline uint nested_uniform_scramble(uint i, uint seed)
+{
+  const uint reversed = reverse_integer_bits(i);
+  const uint scrambled = reversed_bit_owen(reversed, seed);
+  return reverse_integer_bits(scrambled);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/ao.h b/intern/cycles/kernel/svm/ao.h
index b477855dca3..70f52de789b 100644
--- a/intern/cycles/kernel/svm/ao.h
+++ b/intern/cycles/kernel/svm/ao.h
@@ -31,7 +31,7 @@ ccl_device float svm_ao(
     return 1.0f;
   }
 
-  /* Can't raytrace from shaders like displacement, before BVH exists. */
+  /* Can't ray-trace from shaders like displacement, before BVH exists. */
   if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
     return 1.0f;
   }
@@ -49,17 +49,18 @@ ccl_device float svm_ao(
 
   int unoccluded = 0;
   for (int sample = 0; sample < num_samples; sample++) {
-    float disk_u, disk_v;
-    path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+    const float2 rand_disk = path_branched_rng_2D(
+        kg, &rng_state, sample, num_samples, PRNG_SURFACE_AO);
 
-    float2 d = concentric_sample_disk(disk_u, disk_v);
+    float2 d = concentric_sample_disk(rand_disk.x, rand_disk.y);
     float3 D = make_float3(d.x, d.y, safe_sqrtf(1.0f - dot(d, d)));
 
     /* Create ray. */
     Ray ray;
     ray.P = sd->P;
     ray.D = D.x * T + D.y * B + D.z * N;
-    ray.t = max_dist;
+    ray.tmin = 0.0f;
+    ray.tmax = max_dist;
     ray.time = sd->time;
     ray.self.object = sd->object;
     ray.self.prim = sd->prim;
diff --git a/intern/cycles/kernel/svm/aov.h b/intern/cycles/kernel/svm/aov.h
index 9b818f0e6f8..c574b28c078 100644
--- a/intern/cycles/kernel/svm/aov.h
+++ b/intern/cycles/kernel/svm/aov.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "kernel/film/write_passes.h"
+#include "kernel/film/aov_passes.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -27,12 +27,7 @@ ccl_device void svm_node_aov_color(KernelGlobals kg,
   IF_KERNEL_NODES_FEATURE(AOV)
   {
     const float3 val = stack_load_float3(stack, node.y);
-    const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
-    const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
-                                          kernel_data.film.pass_stride;
-    ccl_global float *buffer = render_buffer + render_buffer_offset +
-                               (kernel_data.film.pass_aov_color + node.z);
-    kernel_write_pass_float4(buffer, make_float4(val.x, val.y, val.z, 1.0f));
+    film_write_aov_pass_color(kg, state, render_buffer, node.z, val);
   }
 }
 
@@ -47,12 +42,7 @@ ccl_device void svm_node_aov_value(KernelGlobals kg,
   IF_KERNEL_NODES_FEATURE(AOV)
   {
     const float val = stack_load_float(stack, node.y);
-    const uint32_t render_pixel_index = INTEGRATOR_STATE(state, path, render_pixel_index);
-    const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
-                                          kernel_data.film.pass_stride;
-    ccl_global float *buffer = render_buffer + render_buffer_offset +
-                               (kernel_data.film.pass_aov_value + node.z);
-    kernel_write_pass_float(buffer, val);
+    film_write_aov_pass_value(kg, state, render_buffer, node.z, val);
   }
 }
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/attribute.h b/intern/cycles/kernel/svm/attribute.h
index a3609d8b4b0..5f0d1609f08 100644
--- a/intern/cycles/kernel/svm/attribute.h
+++ b/intern/cycles/kernel/svm/attribute.h
@@ -140,6 +140,16 @@ ccl_device_noinline void svm_node_attr(KernelGlobals kg,
   }
 }
 
+ccl_device_forceinline float3 svm_node_bump_P_dx(const ccl_private ShaderData *sd)
+{
+  return sd->P + differential_from_compact(sd->Ng, sd->dP).dx;
+}
+
+ccl_device_forceinline float3 svm_node_bump_P_dy(const ccl_private ShaderData *sd)
+{
+  return sd->P + differential_from_compact(sd->Ng, sd->dP).dy;
+}
+
 ccl_device_noinline void svm_node_attr_bump_dx(KernelGlobals kg,
                                                ccl_private ShaderData *sd,
                                                ccl_private float *stack,
@@ -167,7 +177,7 @@ ccl_device_noinline void svm_node_attr_bump_dx(KernelGlobals kg,
 
   if (node.y == ATTR_STD_GENERATED && desc.element == ATTR_ELEMENT_NONE) {
     /* No generated attribute, fall back to object coordinates. */
-    float3 f = sd->P + sd->dP.dx;
+    float3 f = svm_node_bump_P_dx(sd);
     if (sd->object != OBJECT_NONE) {
       object_inverse_position_transform(kg, sd, &f);
     }
@@ -265,7 +275,7 @@ ccl_device_noinline void svm_node_attr_bump_dy(KernelGlobals kg,
 
   if (node.y == ATTR_STD_GENERATED && desc.element == ATTR_ELEMENT_NONE) {
     /* No generated attribute, fall back to object coordinates. */
-    float3 f = sd->P + sd->dP.dy;
+    float3 f = svm_node_bump_P_dy(sd);
     if (sd->object != OBJECT_NONE) {
       object_inverse_position_transform(kg, sd, &f);
     }
diff --git a/intern/cycles/kernel/svm/bevel.h b/intern/cycles/kernel/svm/bevel.h
index 5abffe1c771..c1e227959f8 100644
--- a/intern/cycles/kernel/svm/bevel.h
+++ b/intern/cycles/kernel/svm/bevel.h
@@ -103,7 +103,7 @@ ccl_device float3 svm_bevel(
     return sd->N;
   }
 
-  /* Can't raytrace from shaders like displacement, before BVH exists. */
+  /* Can't ray-trace from shaders like displacement, before BVH exists. */
   if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
     return sd->N;
   }
@@ -128,8 +128,8 @@ ccl_device float3 svm_bevel(
   path_state_rng_load(state, &rng_state);
 
   for (int sample = 0; sample < num_samples; sample++) {
-    float disk_u, disk_v;
-    path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+    float2 rand_disk = path_branched_rng_2D(
+        kg, &rng_state, sample, num_samples, PRNG_SURFACE_BEVEL);
 
     /* Pick random axis in local frame and point on disk. */
     float3 disk_N, disk_T, disk_B;
@@ -138,13 +138,13 @@ ccl_device float3 svm_bevel(
     disk_N = sd->Ng;
     make_orthonormals(disk_N, &disk_T, &disk_B);
 
-    float axisu = disk_u;
+    float axisu = rand_disk.x;
 
     if (axisu < 0.5f) {
       pick_pdf_N = 0.5f;
       pick_pdf_T = 0.25f;
       pick_pdf_B = 0.25f;
-      disk_u *= 2.0f;
+      rand_disk.x *= 2.0f;
     }
     else if (axisu < 0.75f) {
       float3 tmp = disk_N;
@@ -153,7 +153,7 @@ ccl_device float3 svm_bevel(
       pick_pdf_N = 0.25f;
       pick_pdf_T = 0.5f;
       pick_pdf_B = 0.25f;
-      disk_u = (disk_u - 0.5f) * 4.0f;
+      rand_disk.x = (rand_disk.x - 0.5f) * 4.0f;
     }
     else {
       float3 tmp = disk_N;
@@ -162,12 +162,12 @@ ccl_device float3 svm_bevel(
       pick_pdf_N = 0.25f;
       pick_pdf_T = 0.25f;
       pick_pdf_B = 0.5f;
-      disk_u = (disk_u - 0.75f) * 4.0f;
+      rand_disk.x = (rand_disk.x - 0.75f) * 4.0f;
     }
 
     /* Sample point on disk. */
-    float phi = M_2PI_F * disk_u;
-    float disk_r = disk_v;
+    float phi = M_2PI_F * rand_disk.x;
+    float disk_r = rand_disk.y;
     float disk_height;
 
     /* Perhaps find something better than Cubic BSSRDF, but happens to work well. */
@@ -179,7 +179,8 @@ ccl_device float3 svm_bevel(
     Ray ray ccl_optional_struct_init;
     ray.P = sd->P + disk_N * disk_height + disk_P;
     ray.D = -disk_N;
-    ray.t = 2.0f * disk_height;
+    ray.tmin = 0.0f;
+    ray.tmax = 2.0f * disk_height;
     ray.dP = differential_zero_compact();
     ray.dD = differential_zero_compact();
     ray.time = sd->time;
@@ -222,7 +223,7 @@ ccl_device float3 svm_bevel(
       /* Get geometric normal. */
       float3 hit_Ng = isect.Ng[hit];
       int object = isect.hits[hit].object;
-      int object_flag = kernel_tex_fetch(__object_flag, object);
+      int object_flag = kernel_data_fetch(object_flag, object);
       if (object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
         hit_Ng = -hit_Ng;
       }
@@ -230,7 +231,7 @@ ccl_device float3 svm_bevel(
       /* Compute smooth normal. */
       float3 N = hit_Ng;
       int prim = isect.hits[hit].prim;
-      int shader = kernel_tex_fetch(__tri_shader, prim);
+      int shader = kernel_data_fetch(tri_shader, prim);
 
       if (shader & SHADER_SMOOTH_NORMAL) {
         float u = isect.hits[hit].u;
diff --git a/intern/cycles/kernel/svm/bump.h b/intern/cycles/kernel/svm/bump.h
index 566c45f5f25..1009a6a4241 100644
--- a/intern/cycles/kernel/svm/bump.h
+++ b/intern/cycles/kernel/svm/bump.h
@@ -14,23 +14,21 @@ ccl_device_noinline void svm_node_enter_bump_eval(KernelGlobals kg,
 {
   /* save state */
   stack_store_float3(stack, offset + 0, sd->P);
-  stack_store_float3(stack, offset + 3, sd->dP.dx);
-  stack_store_float3(stack, offset + 6, sd->dP.dy);
+  stack_store_float(stack, offset + 3, sd->dP);
 
   /* set state as if undisplaced */
   const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED);
 
   if (desc.offset != ATTR_STD_NOT_FOUND) {
-    float3 P, dPdx, dPdy;
-    P = primitive_surface_attribute_float3(kg, sd, desc, &dPdx, &dPdy);
+    differential3 dP;
+    float3 P = primitive_surface_attribute_float3(kg, sd, desc, &dP.dx, &dP.dy);
 
     object_position_transform(kg, sd, &P);
-    object_dir_transform(kg, sd, &dPdx);
-    object_dir_transform(kg, sd, &dPdy);
+    object_dir_transform(kg, sd, &dP.dx);
+    object_dir_transform(kg, sd, &dP.dy);
 
     sd->P = P;
-    sd->dP.dx = dPdx;
-    sd->dP.dy = dPdy;
+    sd->dP = differential_make_compact(dP);
   }
 }
 
@@ -41,8 +39,7 @@ ccl_device_noinline void svm_node_leave_bump_eval(KernelGlobals kg,
 {
   /* restore state */
   sd->P = stack_load_float3(stack, offset + 0);
-  sd->dP.dx = stack_load_float3(stack, offset + 3);
-  sd->dP.dy = stack_load_float3(stack, offset + 6);
+  sd->dP = stack_load_float(stack, offset + 3);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/closure.h b/intern/cycles/kernel/svm/closure.h
index 305bd404d27..2d91b014f60 100644
--- a/intern/cycles/kernel/svm/closure.h
+++ b/intern/cycles/kernel/svm/closure.h
@@ -3,6 +3,13 @@
 
 #pragma once
 
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/emissive.h"
+
+#include "kernel/util/color.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Closure Nodes */
@@ -104,7 +111,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
                                                 __uint_as_float(node.w);
 
   switch (type) {
-#ifdef __PRINCIPLED__
     case CLOSURE_BSDF_PRINCIPLED_ID: {
       uint specular_offset, roughness_offset, specular_tint_offset, anisotropic_offset,
           sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset,
@@ -183,7 +189,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       }
       float3 subsurface_radius = stack_valid(data_cn_ssr.y) ?
                                      stack_load_float3(stack, data_cn_ssr.y) :
-                                     make_float3(1.0f, 1.0f, 1.0f);
+                                     one_float3();
       float subsurface_ior = stack_valid(data_cn_ssr.z) ? stack_load_float(stack, data_cn_ssr.z) :
                                                           1.4f;
       float subsurface_anisotropy = stack_valid(data_cn_ssr.w) ?
@@ -198,12 +204,12 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
                                                 __uint_as_float(data_subsurface_color.z),
                                                 __uint_as_float(data_subsurface_color.w));
 
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
 
-#  ifdef __SUBSURFACE__
+#ifdef __SUBSURFACE__
       float3 mixed_ss_base_color = subsurface_color * subsurface +
                                    base_color * (1.0f - subsurface);
-      float3 subsurf_weight = weight * mixed_ss_base_color * diffuse_weight;
+      Spectrum subsurf_weight = weight * rgb_to_spectrum(mixed_ss_base_color) * diffuse_weight;
 
       /* disable in case of diffuse ancestor, can't see it well then and
        * adds considerably noise due to probabilities of continuing path
@@ -220,7 +226,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       /* diffuse */
       if (fabsf(average(mixed_ss_base_color)) > CLOSURE_WEIGHT_CUTOFF) {
         if (subsurface <= CLOSURE_WEIGHT_CUTOFF && diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
-          float3 diff_weight = weight * base_color * diffuse_weight;
+          Spectrum diff_weight = weight * rgb_to_spectrum(base_color) * diffuse_weight;
 
           ccl_private PrincipledDiffuseBsdf *bsdf = (ccl_private PrincipledDiffuseBsdf *)
               bsdf_alloc(sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
@@ -237,8 +243,8 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
           ccl_private Bssrdf *bssrdf = bssrdf_alloc(sd, subsurf_weight);
 
           if (bssrdf) {
-            bssrdf->radius = subsurface_radius * subsurface;
-            bssrdf->albedo = mixed_ss_base_color;
+            bssrdf->radius = rgb_to_spectrum(subsurface_radius * subsurface);
+            bssrdf->albedo = rgb_to_spectrum(mixed_ss_base_color);
             bssrdf->N = N;
             bssrdf->roughness = roughness;
 
@@ -251,10 +257,10 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
           }
         }
       }
-#  else
+#else
       /* diffuse */
       if (diffuse_weight > CLOSURE_WEIGHT_CUTOFF) {
-        float3 diff_weight = weight * base_color * diffuse_weight;
+        Spectrum diff_weight = weight * rgb_to_spectrum(base_color) * diffuse_weight;
 
         ccl_private PrincipledDiffuseBsdf *bsdf = (ccl_private PrincipledDiffuseBsdf *)bsdf_alloc(
             sd, sizeof(PrincipledDiffuseBsdf), diff_weight);
@@ -267,20 +273,18 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
           sd->flag |= bsdf_principled_diffuse_setup(bsdf, PRINCIPLED_DIFFUSE_FULL);
         }
       }
-#  endif
+#endif
 
       /* sheen */
       if (diffuse_weight > CLOSURE_WEIGHT_CUTOFF && sheen > CLOSURE_WEIGHT_CUTOFF) {
         float m_cdlum = linear_rgb_to_gray(kg, base_color);
-        float3 m_ctint = m_cdlum > 0.0f ?
-                             base_color / m_cdlum :
-                             make_float3(1.0f, 1.0f, 1.0f);  // normalize lum. to isolate hue+sat
+        float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum :
+                                          one_float3();  // normalize lum. to isolate hue+sat
 
         /* color of the sheen component */
-        float3 sheen_color = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - sheen_tint) +
-                             m_ctint * sheen_tint;
+        float3 sheen_color = make_float3(1.0f - sheen_tint) + m_ctint * sheen_tint;
 
-        float3 sheen_weight = weight * sheen * sheen_color * diffuse_weight;
+        Spectrum sheen_weight = weight * sheen * rgb_to_spectrum(sheen_color) * diffuse_weight;
 
         ccl_private PrincipledSheenBsdf *bsdf = (ccl_private PrincipledSheenBsdf *)bsdf_alloc(
             sd, sizeof(PrincipledSheenBsdf), sheen_weight);
@@ -294,12 +298,12 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       }
 
       /* specular reflection */
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
       if (kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
-#  endif
+#endif
         if (specular_weight > CLOSURE_WEIGHT_CUTOFF &&
             (specular > CLOSURE_WEIGHT_CUTOFF || metallic > CLOSURE_WEIGHT_CUTOFF)) {
-          float3 spec_weight = weight * specular_weight;
+          Spectrum spec_weight = weight * specular_weight;
 
           ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
               sd, sizeof(MicrofacetBsdf), spec_weight);
@@ -322,16 +326,13 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
 
             float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y +
                             0.1f * base_color.z;  // luminance approx.
-            float3 m_ctint = m_cdlum > 0.0f ?
-                                 base_color / m_cdlum :
-                                 make_float3(
-                                     1.0f, 1.0f, 1.0f);  // normalize lum. to isolate hue+sat
-            float3 tmp_col = make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint) +
-                             m_ctint * specular_tint;
-
-            bsdf->extra->cspec0 = (specular * 0.08f * tmp_col) * (1.0f - metallic) +
-                                  base_color * metallic;
-            bsdf->extra->color = base_color;
+            float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum :
+                                              one_float3();  // normalize lum. to isolate hue+sat
+            float3 tmp_col = make_float3(1.0f - specular_tint) + m_ctint * specular_tint;
+
+            bsdf->extra->cspec0 = rgb_to_spectrum(
+                (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic);
+            bsdf->extra->color = rgb_to_spectrum(base_color);
             bsdf->extra->clearcoat = 0.0f;
 
             /* setup bsdf */
@@ -342,28 +343,27 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
               sd->flag |= bsdf_microfacet_multi_ggx_fresnel_setup(bsdf, sd);
           }
         }
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
       }
-#  endif
+#endif
 
       /* BSDF */
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
       if (kernel_data.integrator.caustics_reflective ||
           kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0) {
-#  endif
+#endif
         if (final_transmission > CLOSURE_WEIGHT_CUTOFF) {
-          float3 glass_weight = weight * final_transmission;
-          float3 cspec0 = base_color * specular_tint +
-                          make_float3(1.0f, 1.0f, 1.0f) * (1.0f - specular_tint);
+          Spectrum glass_weight = weight * final_transmission;
+          float3 cspec0 = base_color * specular_tint + make_float3(1.0f - specular_tint);
 
           if (roughness <= 5e-2f ||
               distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID) { /* use single-scatter GGX */
             float refl_roughness = roughness;
 
             /* reflection */
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
             if (kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0)
-#  endif
+#endif
             {
               ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
                   sd, sizeof(MicrofacetBsdf), glass_weight * fresnel);
@@ -374,15 +374,15 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
 
               if (bsdf && extra) {
                 bsdf->N = N;
-                bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+                bsdf->T = zero_float3();
                 bsdf->extra = extra;
 
                 bsdf->alpha_x = refl_roughness * refl_roughness;
                 bsdf->alpha_y = refl_roughness * refl_roughness;
                 bsdf->ior = ior;
 
-                bsdf->extra->color = base_color;
-                bsdf->extra->cspec0 = cspec0;
+                bsdf->extra->color = rgb_to_spectrum(base_color);
+                bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
                 bsdf->extra->clearcoat = 0.0f;
 
                 /* setup bsdf */
@@ -391,17 +391,19 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
             }
 
             /* refraction */
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
             if (kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0)
-#  endif
+#endif
             {
-              /* This is to prevent mnee from receiving a null bsdf. */
+              /* This is to prevent MNEE from receiving a null BSDF. */
               float refraction_fresnel = fmaxf(0.0001f, 1.0f - fresnel);
               ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
-                  sd, sizeof(MicrofacetBsdf), base_color * glass_weight * refraction_fresnel);
+                  sd,
+                  sizeof(MicrofacetBsdf),
+                  rgb_to_spectrum(base_color) * glass_weight * refraction_fresnel);
               if (bsdf) {
                 bsdf->N = N;
-                bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+                bsdf->T = zero_float3();
                 bsdf->extra = NULL;
 
                 if (distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID)
@@ -430,14 +432,14 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
             if (bsdf && extra) {
               bsdf->N = N;
               bsdf->extra = extra;
-              bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+              bsdf->T = zero_float3();
 
               bsdf->alpha_x = roughness * roughness;
               bsdf->alpha_y = roughness * roughness;
               bsdf->ior = ior;
 
-              bsdf->extra->color = base_color;
-              bsdf->extra->cspec0 = cspec0;
+              bsdf->extra->color = rgb_to_spectrum(base_color);
+              bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
               bsdf->extra->clearcoat = 0.0f;
 
               /* setup bsdf */
@@ -445,14 +447,14 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
             }
           }
         }
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
       }
-#  endif
+#endif
 
       /* clearcoat */
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
       if (kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
-#  endif
+#endif
         if (clearcoat > CLOSURE_WEIGHT_CUTOFF) {
           ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
               sd, sizeof(MicrofacetBsdf), weight);
@@ -463,30 +465,29 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
 
           if (bsdf && extra) {
             bsdf->N = clearcoat_normal;
-            bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+            bsdf->T = zero_float3();
             bsdf->ior = 1.5f;
             bsdf->extra = extra;
 
             bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
             bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
 
-            bsdf->extra->color = make_float3(0.0f, 0.0f, 0.0f);
-            bsdf->extra->cspec0 = make_float3(0.04f, 0.04f, 0.04f);
+            bsdf->extra->color = zero_spectrum();
+            bsdf->extra->cspec0 = make_spectrum(0.04f);
             bsdf->extra->clearcoat = clearcoat;
 
             /* setup bsdf */
             sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
           }
         }
-#  ifdef __CAUSTICS_TRICKS__
+#ifdef __CAUSTICS_TRICKS__
       }
-#  endif
+#endif
 
       break;
     }
-#endif /* __PRINCIPLED__ */
     case CLOSURE_BSDF_DIFFUSE_ID: {
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private OrenNayarBsdf *bsdf = (ccl_private OrenNayarBsdf *)bsdf_alloc(
           sd, sizeof(OrenNayarBsdf), weight);
 
@@ -506,7 +507,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       break;
     }
     case CLOSURE_BSDF_TRANSLUCENT_ID: {
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private DiffuseBsdf *bsdf = (ccl_private DiffuseBsdf *)bsdf_alloc(
           sd, sizeof(DiffuseBsdf), weight);
 
@@ -517,7 +518,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       break;
     }
     case CLOSURE_BSDF_TRANSPARENT_ID: {
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       bsdf_transparent_setup(sd, weight, path_flag);
       break;
     }
@@ -530,7 +531,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       if (!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
         break;
 #endif
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
           sd, sizeof(MicrofacetBsdf), weight);
 
@@ -545,7 +546,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       bsdf->extra = NULL;
 
       if (data_node.y == SVM_STACK_INVALID) {
-        bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+        bsdf->T = zero_float3();
         bsdf->alpha_x = roughness;
         bsdf->alpha_y = roughness;
       }
@@ -581,8 +582,8 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
         bsdf->extra = (ccl_private MicrofacetExtra *)closure_alloc_extra(sd,
                                                                          sizeof(MicrofacetExtra));
         if (bsdf->extra) {
-          bsdf->extra->color = stack_load_float3(stack, data_node.w);
-          bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
+          bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.w));
+          bsdf->extra->cspec0 = zero_spectrum();
           bsdf->extra->clearcoat = 0.0f;
           sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
         }
@@ -600,13 +601,13 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       if (!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
         break;
 #endif
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
           sd, sizeof(MicrofacetBsdf), weight);
 
       if (bsdf) {
         bsdf->N = N;
-        bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+        bsdf->T = zero_float3();
         bsdf->extra = NULL;
 
         float eta = fmaxf(param2, 1e-5f);
@@ -644,7 +645,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
         break;
       }
 #endif
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
 
       /* index of refraction */
       float eta = fmaxf(param2, 1e-5f);
@@ -665,7 +666,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
 
         if (bsdf) {
           bsdf->N = N;
-          bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+          bsdf->T = zero_float3();
           bsdf->extra = NULL;
           svm_node_glass_setup(sd, bsdf, type, eta, roughness, false);
         }
@@ -676,14 +677,14 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       if (kernel_data.integrator.caustics_refractive || (path_flag & PATH_RAY_DIFFUSE) == 0)
 #endif
       {
-        /* This is to prevent mnee from receiving a null bsdf. */
+        /* This is to prevent MNEE from receiving a null BSDF. */
         float refraction_fresnel = fmaxf(0.0001f, 1.0f - fresnel);
         ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
             sd, sizeof(MicrofacetBsdf), weight * refraction_fresnel);
 
         if (bsdf) {
           bsdf->N = N;
-          bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+          bsdf->T = zero_float3();
           bsdf->extra = NULL;
           svm_node_glass_setup(sd, bsdf, type, eta, roughness, true);
         }
@@ -697,7 +698,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
           !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
         break;
 #endif
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
           sd, sizeof(MicrofacetBsdf), weight);
       if (!bsdf) {
@@ -712,7 +713,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
 
       bsdf->N = N;
       bsdf->extra = extra;
-      bsdf->T = make_float3(0.0f, 0.0f, 0.0f);
+      bsdf->T = zero_float3();
 
       float roughness = sqr(param1);
       bsdf->alpha_x = roughness;
@@ -721,8 +722,8 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       bsdf->ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta;
 
       kernel_assert(stack_valid(data_node.z));
-      bsdf->extra->color = stack_load_float3(stack, data_node.z);
-      bsdf->extra->cspec0 = make_float3(0.0f, 0.0f, 0.0f);
+      bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.z));
+      bsdf->extra->cspec0 = zero_spectrum();
       bsdf->extra->clearcoat = 0.0f;
 
       /* setup bsdf */
@@ -730,7 +731,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       break;
     }
     case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private VelvetBsdf *bsdf = (ccl_private VelvetBsdf *)bsdf_alloc(
           sd, sizeof(VelvetBsdf), weight);
 
@@ -749,7 +750,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       ATTR_FALLTHROUGH;
 #endif
     case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private ToonBsdf *bsdf = (ccl_private ToonBsdf *)bsdf_alloc(
           sd, sizeof(ToonBsdf), weight);
 
@@ -771,7 +772,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
       uint4 data_node3 = read_node(kg, &offset);
       uint4 data_node4 = read_node(kg, &offset);
 
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
 
       uint offset_ofs, ior_ofs, color_ofs, parametrization;
       svm_unpack_node_uchar4(data_node.y, &offset_ofs, &ior_ofs, &color_ofs, &parametrization);
@@ -829,7 +830,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
         switch (parametrization) {
           case NODE_PRINCIPLED_HAIR_DIRECT_ABSORPTION: {
             float3 absorption_coefficient = stack_load_float3(stack, absorption_coefficient_ofs);
-            bsdf->sigma = absorption_coefficient;
+            bsdf->sigma = rgb_to_spectrum(absorption_coefficient);
             break;
           }
           case NODE_PRINCIPLED_HAIR_PIGMENT_CONCENTRATION: {
@@ -849,20 +850,21 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
             /* Benedikt Bitterli's melanin ratio remapping. */
             float eumelanin = melanin * (1.0f - melanin_redness);
             float pheomelanin = melanin * melanin_redness;
-            float3 melanin_sigma = bsdf_principled_hair_sigma_from_concentration(eumelanin,
-                                                                                 pheomelanin);
+            Spectrum melanin_sigma = bsdf_principled_hair_sigma_from_concentration(eumelanin,
+                                                                                   pheomelanin);
 
             /* Optional tint. */
             float3 tint = stack_load_float3(stack, tint_ofs);
-            float3 tint_sigma = bsdf_principled_hair_sigma_from_reflectance(tint,
-                                                                            radial_roughness);
+            Spectrum tint_sigma = bsdf_principled_hair_sigma_from_reflectance(
+                rgb_to_spectrum(tint), radial_roughness);
 
             bsdf->sigma = melanin_sigma + tint_sigma;
             break;
           }
           case NODE_PRINCIPLED_HAIR_REFLECTANCE: {
             float3 color = stack_load_float3(stack, color_ofs);
-            bsdf->sigma = bsdf_principled_hair_sigma_from_reflectance(color, radial_roughness);
+            bsdf->sigma = bsdf_principled_hair_sigma_from_reflectance(rgb_to_spectrum(color),
+                                                                      radial_roughness);
             break;
           }
           default: {
@@ -879,7 +881,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
     }
     case CLOSURE_BSDF_HAIR_REFLECTION_ID:
     case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
 
       ccl_private HairBsdf *bsdf = (ccl_private HairBsdf *)bsdf_alloc(
           sd, sizeof(HairBsdf), weight);
@@ -916,7 +918,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
     case CLOSURE_BSSRDF_BURLEY_ID:
     case CLOSURE_BSSRDF_RANDOM_WALK_ID:
     case CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID: {
-      float3 weight = sd->svm_closure_weight * mix_weight;
+      Spectrum weight = sd->svm_closure_weight * mix_weight;
       ccl_private Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
 
       if (bssrdf) {
@@ -926,7 +928,7 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
         if (path_flag & PATH_RAY_DIFFUSE_ANCESTOR)
           param1 = 0.0f;
 
-        bssrdf->radius = stack_load_float3(stack, data_node.z) * param1;
+        bssrdf->radius = rgb_to_spectrum(stack_load_float3(stack, data_node.z) * param1);
         bssrdf->albedo = sd->svm_closure_weight;
         bssrdf->N = N;
         bssrdf->roughness = FLT_MAX;
@@ -976,10 +978,10 @@ ccl_device_noinline void svm_node_closure_volume(KernelGlobals kg,
   density = mix_weight * fmaxf(density, 0.0f);
 
   /* Compute scattering coefficient. */
-  float3 weight = sd->svm_closure_weight;
+  Spectrum weight = sd->svm_closure_weight;
 
   if (type == CLOSURE_VOLUME_ABSORPTION_ID) {
-    weight = make_float3(1.0f, 1.0f, 1.0f) - weight;
+    weight = one_spectrum() - weight;
   }
 
   weight *= density;
@@ -1047,11 +1049,11 @@ ccl_device_noinline int svm_node_principled_volume(KernelGlobals kg,
 
   if (density > CLOSURE_WEIGHT_CUTOFF) {
     /* Compute scattering color. */
-    float3 color = sd->svm_closure_weight;
+    Spectrum color = sd->svm_closure_weight;
 
     const AttributeDescriptor attr_color = find_attribute(kg, sd, attr_node.y);
     if (attr_color.offset != ATTR_STD_NOT_FOUND) {
-      color *= primitive_volume_attribute_float3(kg, sd, attr_color);
+      color *= rgb_to_spectrum(primitive_volume_attribute_float3(kg, sd, attr_color));
     }
 
     /* Add closure for volume scattering. */
@@ -1066,10 +1068,13 @@ ccl_device_noinline int svm_node_principled_volume(KernelGlobals kg,
     }
 
     /* Add extinction weight. */
-    float3 zero = make_float3(0.0f, 0.0f, 0.0f);
-    float3 one = make_float3(1.0f, 1.0f, 1.0f);
-    float3 absorption_color = max(sqrt(stack_load_float3(stack, absorption_color_offset)), zero);
-    float3 absorption = max(one - color, zero) * max(one - absorption_color, zero);
+    float3 absorption_color = max(sqrt(stack_load_float3(stack, absorption_color_offset)),
+                                  zero_float3());
+
+    Spectrum zero = zero_spectrum();
+    Spectrum one = one_spectrum();
+    Spectrum absorption = max(one - color, zero) *
+                          max(one - rgb_to_spectrum(absorption_color), zero);
     volume_extinction_setup(sd, (color + absorption) * density);
   }
 
@@ -1089,7 +1094,7 @@ ccl_device_noinline int svm_node_principled_volume(KernelGlobals kg,
 
   if (emission > CLOSURE_WEIGHT_CUTOFF) {
     float3 emission_color = stack_load_float3(stack, emission_color_offset);
-    emission_setup(sd, emission * emission_color);
+    emission_setup(sd, rgb_to_spectrum(emission * emission_color));
   }
 
   if (blackbody > CLOSURE_WEIGHT_CUTOFF) {
@@ -1113,7 +1118,7 @@ ccl_device_noinline int svm_node_principled_volume(KernelGlobals kg,
       float3 blackbody_tint = stack_load_float3(stack, node.w);
       float3 bb = blackbody_tint * intensity *
                   rec709_to_rgb(kg, svm_math_blackbody_color_rec709(T));
-      emission_setup(sd, bb);
+      emission_setup(sd, rgb_to_spectrum(bb));
     }
   }
 #endif
@@ -1125,7 +1130,7 @@ ccl_device_noinline void svm_node_closure_emission(ccl_private ShaderData *sd,
                                                    uint4 node)
 {
   uint mix_weight_offset = node.y;
-  float3 weight = sd->svm_closure_weight;
+  Spectrum weight = sd->svm_closure_weight;
 
   if (stack_valid(mix_weight_offset)) {
     float mix_weight = stack_load_float(stack, mix_weight_offset);
@@ -1144,7 +1149,7 @@ ccl_device_noinline void svm_node_closure_background(ccl_private ShaderData *sd,
                                                      uint4 node)
 {
   uint mix_weight_offset = node.y;
-  float3 weight = sd->svm_closure_weight;
+  Spectrum weight = sd->svm_closure_weight;
 
   if (stack_valid(mix_weight_offset)) {
     float mix_weight = stack_load_float(stack, mix_weight_offset);
@@ -1181,14 +1186,15 @@ ccl_device_noinline void svm_node_closure_holdout(ccl_private ShaderData *sd,
 
 /* Closure Nodes */
 
-ccl_device_inline void svm_node_closure_store_weight(ccl_private ShaderData *sd, float3 weight)
+ccl_device_inline void svm_node_closure_store_weight(ccl_private ShaderData *sd, Spectrum weight)
 {
   sd->svm_closure_weight = weight;
 }
 
 ccl_device void svm_node_closure_set_weight(ccl_private ShaderData *sd, uint r, uint g, uint b)
 {
-  float3 weight = make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b));
+  Spectrum weight = rgb_to_spectrum(
+      make_float3(__uint_as_float(r), __uint_as_float(g), __uint_as_float(b)));
   svm_node_closure_store_weight(sd, weight);
 }
 
@@ -1196,7 +1202,7 @@ ccl_device void svm_node_closure_weight(ccl_private ShaderData *sd,
                                         ccl_private float *stack,
                                         uint weight_offset)
 {
-  float3 weight = stack_load_float3(stack, weight_offset);
+  Spectrum weight = rgb_to_spectrum(stack_load_float3(stack, weight_offset));
   svm_node_closure_store_weight(sd, weight);
 }
 
@@ -1209,7 +1215,7 @@ ccl_device_noinline void svm_node_emission_weight(KernelGlobals kg,
   uint strength_offset = node.z;
 
   float strength = stack_load_float(stack, strength_offset);
-  float3 weight = stack_load_float3(stack, color_offset) * strength;
+  Spectrum weight = rgb_to_spectrum(stack_load_float3(stack, color_offset)) * strength;
 
   svm_node_closure_store_weight(sd, weight);
 }
diff --git a/intern/cycles/kernel/svm/color_util.h b/intern/cycles/kernel/svm/color_util.h
index fa22d4bc8c2..96adb6fd64c 100644
--- a/intern/cycles/kernel/svm/color_util.h
+++ b/intern/cycles/kernel/svm/color_util.h
@@ -244,13 +244,11 @@ ccl_device float3 svm_mix_linear(float t, float3 col1, float3 col2)
 
 ccl_device float3 svm_mix_clamp(float3 col)
 {
-  return saturate3(col);
+  return saturate(col);
 }
 
-ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float3 c2)
+ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float t, float3 c1, float3 c2)
 {
-  float t = saturatef(fac);
-
   switch (type) {
     case NODE_MIX_BLEND:
       return svm_mix_blend(t, c1, c2);
@@ -282,7 +280,7 @@ ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float
       return svm_mix_sat(t, c1, c2);
     case NODE_MIX_VAL:
       return svm_mix_val(t, c1, c2);
-    case NODE_MIX_COLOR:
+    case NODE_MIX_COL:
       return svm_mix_color(t, c1, c2);
     case NODE_MIX_SOFT:
       return svm_mix_soft(t, c1, c2);
@@ -295,6 +293,12 @@ ccl_device_noinline_cpu float3 svm_mix(NodeMix type, float fac, float3 c1, float
   return make_float3(0.0f, 0.0f, 0.0f);
 }
 
+ccl_device_noinline_cpu float3 svm_mix_clamped_factor(NodeMix type, float t, float3 c1, float3 c2)
+{
+  /* Saturate the incoming factor here; svm_mix() itself applies the factor as given. */
+  return svm_mix(type, saturatef(t), c1, c2);
+}
+
 ccl_device_inline float3 svm_brightness_contrast(float3 color, float brightness, float contrast)
 {
   float a = 1.0f + contrast;
diff --git a/intern/cycles/kernel/svm/displace.h b/intern/cycles/kernel/svm/displace.h
index 128023263fd..230f8c73820 100644
--- a/intern/cycles/kernel/svm/displace.h
+++ b/intern/cycles/kernel/svm/displace.h
@@ -24,18 +24,17 @@ ccl_device_noinline void svm_node_set_bump(KernelGlobals kg,
     float3 normal_in = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) :
                                                     sd->N;
 
-    float3 dPdx = sd->dP.dx;
-    float3 dPdy = sd->dP.dy;
+    differential3 dP = differential_from_compact(sd->Ng, sd->dP);
 
     if (use_object_space) {
       object_inverse_normal_transform(kg, sd, &normal_in);
-      object_inverse_dir_transform(kg, sd, &dPdx);
-      object_inverse_dir_transform(kg, sd, &dPdy);
+      object_inverse_dir_transform(kg, sd, &dP.dx);
+      object_inverse_dir_transform(kg, sd, &dP.dy);
     }
 
     /* get surface tangents from normal */
-    float3 Rx = cross(dPdy, normal_in);
-    float3 Ry = cross(normal_in, dPdx);
+    float3 Rx = cross(dP.dy, normal_in);
+    float3 Ry = cross(normal_in, dP.dx);
 
     /* get bump values */
     uint c_offset, x_offset, y_offset, strength_offset;
@@ -46,7 +45,7 @@ ccl_device_noinline void svm_node_set_bump(KernelGlobals kg,
     float h_y = stack_load_float(stack, y_offset);
 
     /* compute surface gradient and determinant */
-    float det = dot(dPdx, Rx);
+    float det = dot(dP.dx, Rx);
     float3 surfgrad = (h_x - h_c) * Rx + (h_y - h_c) * Ry;
 
     float absdet = fabsf(det);
diff --git a/intern/cycles/kernel/svm/geometry.h b/intern/cycles/kernel/svm/geometry.h
index 4b5368dd765..cbd87d84409 100644
--- a/intern/cycles/kernel/svm/geometry.h
+++ b/intern/cycles/kernel/svm/geometry.h
@@ -34,7 +34,7 @@ ccl_device_noinline void svm_node_geometry(KernelGlobals kg,
       data = sd->Ng;
       break;
     case NODE_GEOM_uv:
-      data = make_float3(sd->u, sd->v, 0.0f);
+      data = make_float3(1.0f - sd->u - sd->v, sd->u, 0.0f);
       break;
     default:
       data = make_float3(0.0f, 0.0f, 0.0f);
@@ -54,10 +54,10 @@ ccl_device_noinline void svm_node_geometry_bump_dx(KernelGlobals kg,
 
   switch (type) {
     case NODE_GEOM_P:
-      data = sd->P + sd->dP.dx;
+      data = svm_node_bump_P_dx(sd);
       break;
     case NODE_GEOM_uv:
-      data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f);
+      data = make_float3(1.0f - sd->u - sd->du.dx - sd->v - sd->dv.dx, sd->u + sd->du.dx, 0.0f);
       break;
     default:
       svm_node_geometry(kg, sd, stack, type, out_offset);
@@ -81,10 +81,10 @@ ccl_device_noinline void svm_node_geometry_bump_dy(KernelGlobals kg,
 
   switch (type) {
     case NODE_GEOM_P:
-      data = sd->P + sd->dP.dy;
+      data = svm_node_bump_P_dy(sd);
       break;
     case NODE_GEOM_uv:
-      data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f);
+      data = make_float3(1.0f - sd->u - sd->du.dy - sd->v - sd->dv.dy, sd->u + sd->du.dy, 0.0f);
       break;
     default:
       svm_node_geometry(kg, sd, stack, type, out_offset);
diff --git a/intern/cycles/kernel/svm/ies.h b/intern/cycles/kernel/svm/ies.h
index 201d88101cd..3648cb580d5 100644
--- a/intern/cycles/kernel/svm/ies.h
+++ b/intern/cycles/kernel/svm/ies.h
@@ -17,7 +17,7 @@ ccl_device_inline float interpolate_ies_vertical(
    * Therefore, the assumption is made that the light is going to be symmetrical, which means that
    * we can just take the corresponding value at the current horizontal coordinate. */
 
-#define IES_LOOKUP(v) kernel_tex_fetch(__ies, ofs + h * v_num + (v))
+#define IES_LOOKUP(v) kernel_data_fetch(ies, ofs + h * v_num + (v))
   /* If v is zero, assume symmetry and read at v=1 instead of v=-1. */
   float a = IES_LOOKUP((v == 0) ? 1 : v - 1);
   float b = IES_LOOKUP(v);
@@ -31,16 +31,16 @@ ccl_device_inline float interpolate_ies_vertical(
 ccl_device_inline float kernel_ies_interp(KernelGlobals kg, int slot, float h_angle, float v_angle)
 {
   /* Find offset of the IES data in the table. */
-  int ofs = __float_as_int(kernel_tex_fetch(__ies, slot));
+  int ofs = __float_as_int(kernel_data_fetch(ies, slot));
   if (ofs == -1) {
     return 100.0f;
   }
 
-  int h_num = __float_as_int(kernel_tex_fetch(__ies, ofs++));
-  int v_num = __float_as_int(kernel_tex_fetch(__ies, ofs++));
+  int h_num = __float_as_int(kernel_data_fetch(ies, ofs++));
+  int v_num = __float_as_int(kernel_data_fetch(ies, ofs++));
 
-#define IES_LOOKUP_ANGLE_H(h) kernel_tex_fetch(__ies, ofs + (h))
-#define IES_LOOKUP_ANGLE_V(v) kernel_tex_fetch(__ies, ofs + h_num + (v))
+#define IES_LOOKUP_ANGLE_H(h) kernel_data_fetch(ies, ofs + (h))
+#define IES_LOOKUP_ANGLE_V(v) kernel_data_fetch(ies, ofs + h_num + (v))
 
   /* Check whether the angle is within the bounds of the IES texture. */
   if (v_angle >= IES_LOOKUP_ANGLE_V(v_num - 1)) {
diff --git a/intern/cycles/kernel/svm/map_range.h b/intern/cycles/kernel/svm/map_range.h
index ff0e462041c..ea85bc43b74 100644
--- a/intern/cycles/kernel/svm/map_range.h
+++ b/intern/cycles/kernel/svm/map_range.h
@@ -112,10 +112,10 @@ ccl_device_noinline int svm_node_vector_map_range(KernelGlobals kg,
   switch (range_type_stack_offset) {
     default:
     case NODE_MAP_RANGE_LINEAR:
-      factor = safe_divide_float3_float3((value - from_min), (from_max - from_min));
+      factor = safe_divide((value - from_min), (from_max - from_min));
       break;
     case NODE_MAP_RANGE_STEPPED: {
-      factor = safe_divide_float3_float3((value - from_min), (from_max - from_min));
+      factor = safe_divide((value - from_min), (from_max - from_min));
       factor = make_float3((steps.x > 0.0f) ? floorf(factor.x * (steps.x + 1.0f)) / steps.x : 0.0f,
                            (steps.y > 0.0f) ? floorf(factor.y * (steps.y + 1.0f)) / steps.y : 0.0f,
                            (steps.z > 0.0f) ? floorf(factor.z * (steps.z + 1.0f)) / steps.z :
@@ -123,13 +123,13 @@ ccl_device_noinline int svm_node_vector_map_range(KernelGlobals kg,
       break;
     }
     case NODE_MAP_RANGE_SMOOTHSTEP: {
-      factor = safe_divide_float3_float3((value - from_min), (from_max - from_min));
+      factor = safe_divide((value - from_min), (from_max - from_min));
       factor = clamp(factor, zero_float3(), one_float3());
       factor = (make_float3(3.0f, 3.0f, 3.0f) - 2.0f * factor) * (factor * factor);
       break;
     }
     case NODE_MAP_RANGE_SMOOTHERSTEP: {
-      factor = safe_divide_float3_float3((value - from_min), (from_max - from_min));
+      factor = safe_divide((value - from_min), (from_max - from_min));
       factor = clamp(factor, zero_float3(), one_float3());
       factor = factor * factor * factor * (factor * (factor * 6.0f - 15.0f) + 10.0f);
       break;
diff --git a/intern/cycles/kernel/svm/mapping_util.h b/intern/cycles/kernel/svm/mapping_util.h
index c616d4018c4..13257c762e7 100644
--- a/intern/cycles/kernel/svm/mapping_util.h
+++ b/intern/cycles/kernel/svm/mapping_util.h
@@ -13,13 +13,12 @@ svm_mapping(NodeMappingType type, float3 vector, float3 location, float3 rotatio
     case NODE_MAPPING_TYPE_POINT:
       return transform_direction(&rotationTransform, (vector * scale)) + location;
     case NODE_MAPPING_TYPE_TEXTURE:
-      return safe_divide_float3_float3(
-          transform_direction_transposed(&rotationTransform, (vector - location)), scale);
+      return safe_divide(transform_direction_transposed(&rotationTransform, (vector - location)),
+                         scale);
     case NODE_MAPPING_TYPE_VECTOR:
       return transform_direction(&rotationTransform, (vector * scale));
     case NODE_MAPPING_TYPE_NORMAL:
-      return safe_normalize(
-          transform_direction(&rotationTransform, safe_divide_float3_float3(vector, scale)));
+      return safe_normalize(transform_direction(&rotationTransform, safe_divide(vector, scale)));
     default:
       return make_float3(0.0f, 0.0f, 0.0f);
   }
diff --git a/intern/cycles/kernel/svm/math_util.h b/intern/cycles/kernel/svm/math_util.h
index 89bd4a501a7..d90d4f0f794 100644
--- a/intern/cycles/kernel/svm/math_util.h
+++ b/intern/cycles/kernel/svm/math_util.h
@@ -24,7 +24,7 @@ ccl_device void svm_vector_math(ccl_private float *value,
       *vector = a * b;
       break;
     case NODE_VECTOR_MATH_DIVIDE:
-      *vector = safe_divide_float3_float3(a, b);
+      *vector = safe_divide(a, b);
       break;
     case NODE_VECTOR_MATH_CROSS_PRODUCT:
       *vector = cross(a, b);
@@ -60,7 +60,7 @@ ccl_device void svm_vector_math(ccl_private float *value,
       *vector = safe_normalize(a);
       break;
     case NODE_VECTOR_MATH_SNAP:
-      *vector = floor(safe_divide_float3_float3(a, b)) * b;
+      *vector = floor(safe_divide(a, b)) * b;
       break;
     case NODE_VECTOR_MATH_FLOOR:
       *vector = floor(a);
diff --git a/intern/cycles/kernel/svm/mix.h b/intern/cycles/kernel/svm/mix.h
index a9796096410..ead2fc44685 100644
--- a/intern/cycles/kernel/svm/mix.h
+++ b/intern/cycles/kernel/svm/mix.h
@@ -21,10 +21,94 @@ ccl_device_noinline int svm_node_mix(KernelGlobals kg,
   float fac = stack_load_float(stack, fac_offset);
   float3 c1 = stack_load_float3(stack, c1_offset);
   float3 c2 = stack_load_float3(stack, c2_offset);
-  float3 result = svm_mix((NodeMix)node1.y, fac, c1, c2);
+  float3 result = svm_mix_clamped_factor((NodeMix)node1.y, fac, c1, c2);
 
   stack_store_float3(stack, node1.z, result);
   return offset;
 }
 
+ccl_device_noinline void svm_node_mix_color(ccl_private ShaderData *sd,
+                                            ccl_private float *stack,
+                                            uint options,
+                                            uint input_offset,
+                                            uint result_offset)
+{
+  /* `options` packs the factor clamp flag, the blend type and the result clamp flag. */
+  uint clamp_factor, mix_type, clamp_result;
+  svm_unpack_node_uchar3(options, &clamp_factor, &mix_type, &clamp_result);
+
+  /* `input_offset` packs the stack offsets of the factor and the two colors. */
+  uint fac_offset, col1_offset, col2_offset;
+  svm_unpack_node_uchar3(input_offset, &fac_offset, &col1_offset, &col2_offset);
+
+  const float fac_in = stack_load_float(stack, fac_offset);
+  const float fac = (clamp_factor != 0) ? saturatef(fac_in) : fac_in;
+  const float3 col1 = stack_load_float3(stack, col1_offset);
+  const float3 col2 = stack_load_float3(stack, col2_offset);
+
+  const float3 mixed = svm_mix((NodeMix)mix_type, fac, col1, col2);
+
+  /* Optionally saturate the blended color before writing it to the stack. */
+  stack_store_float3(stack, result_offset, clamp_result ? saturate(mixed) : mixed);
+}
+
+ccl_device_noinline void svm_node_mix_float(ccl_private ShaderData *sd,
+                                            ccl_private float *stack,
+                                            uint use_clamp,
+                                            uint input_offset,
+                                            uint result_offset)
+{
+  /* `input_offset` packs the stack offsets of the factor and the two values to blend. */
+  uint fac_offset, value1_offset, value2_offset;
+  svm_unpack_node_uchar3(input_offset, &fac_offset, &value1_offset, &value2_offset);
+
+  const float fac_in = stack_load_float(stack, fac_offset);
+  const float fac = (use_clamp != 0) ? saturatef(fac_in) : fac_in;
+  const float value1 = stack_load_float(stack, value1_offset);
+  const float value2 = stack_load_float(stack, value2_offset);
+
+  /* Linear interpolation: `value1` at a factor of 0, `value2` at a factor of 1. */
+  const float mixed = value1 * (1.0f - fac) + value2 * fac;
+
+  stack_store_float(stack, result_offset, mixed);
+}
+
+ccl_device_noinline void svm_node_mix_vector(ccl_private ShaderData *sd,
+                                             ccl_private float *stack,
+                                             uint input_offset,
+                                             uint result_offset)
+{
+  /* `input_offset` packs the clamp flag and the stack offsets of factor and vectors. */
+  uint clamp_factor, fac_offset, vec1_offset, vec2_offset;
+  svm_unpack_node_uchar4(input_offset, &clamp_factor, &fac_offset, &vec1_offset, &vec2_offset);
+
+  const float fac_in = stack_load_float(stack, fac_offset);
+  const float fac = (clamp_factor != 0) ? saturatef(fac_in) : fac_in;
+  const float3 vec1 = stack_load_float3(stack, vec1_offset);
+  const float3 vec2 = stack_load_float3(stack, vec2_offset);
+
+  /* One scalar factor blends all components: `vec1` at 0, `vec2` at 1. */
+  const float3 mixed = vec1 * (one_float3() - fac) + vec2 * fac;
+  stack_store_float3(stack, result_offset, mixed);
+}
+
+ccl_device_noinline void svm_node_mix_vector_non_uniform(ccl_private ShaderData *sd,
+                                                         ccl_private float *stack,
+                                                         uint input_offset,
+                                                         uint result_offset)
+{
+  /* `input_offset` packs the clamp flag and the stack offsets of factor and vectors. */
+  uint clamp_factor, fac_offset, vec1_offset, vec2_offset;
+  svm_unpack_node_uchar4(input_offset, &clamp_factor, &fac_offset, &vec1_offset, &vec2_offset);
+
+  const float3 fac_in = stack_load_float3(stack, fac_offset);
+  const float3 fac = (clamp_factor != 0) ? saturate(fac_in) : fac_in;
+  const float3 vec1 = stack_load_float3(stack, vec1_offset);
+  const float3 vec2 = stack_load_float3(stack, vec2_offset);
+
+  /* The factor is a vector here, loaded as a float3 and applied through float3 operators. */
+  const float3 mixed = vec1 * (one_float3() - fac) + vec2 * fac;
+  stack_store_float3(stack, result_offset, mixed);
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/musgrave.h b/intern/cycles/kernel/svm/musgrave.h
index 521c96d9f37..8bf172f0981 100644
--- a/intern/cycles/kernel/svm/musgrave.h
+++ b/intern/cycles/kernel/svm/musgrave.h
@@ -119,13 +119,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_1d(
 {
   float p = co;
   float pwHL = powf(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = snoise_1d(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0f;
+  float value = 0.0f;
+  float weight = 1.0f;
 
-  for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+  for (int i = 0; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
     if (weight > 1.0f) {
       weight = 1.0f;
     }
@@ -138,8 +137,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_1d(
   }
 
   float rmd = octaves - floorf(octaves);
-  if (rmd != 0.0f) {
-    value += rmd * ((snoise_1d(p) + offset) * pwr);
+  if ((rmd != 0.0f) && (weight > 0.001f)) {
+    if (weight > 1.0f) {
+      weight = 1.0f;
+    }
+    float signal = (snoise_1d(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
@@ -290,13 +293,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_2d(
 {
   float2 p = co;
   float pwHL = powf(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = snoise_2d(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0f;
+  float value = 0.0f;
+  float weight = 1.0f;
 
-  for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+  for (int i = 0; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
     if (weight > 1.0f) {
       weight = 1.0f;
     }
@@ -309,8 +311,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_2d(
   }
 
   float rmd = octaves - floorf(octaves);
-  if (rmd != 0.0f) {
-    value += rmd * ((snoise_2d(p) + offset) * pwr);
+  if ((rmd != 0.0f) && (weight > 0.001f)) {
+    if (weight > 1.0f) {
+      weight = 1.0f;
+    }
+    float signal = (snoise_2d(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
@@ -461,13 +467,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_3d(
 {
   float3 p = co;
   float pwHL = powf(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = snoise_3d(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0f;
+  float value = 0.0f;
+  float weight = 1.0f;
 
-  for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+  for (int i = 0; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
     if (weight > 1.0f) {
       weight = 1.0f;
     }
@@ -480,8 +485,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_3d(
   }
 
   float rmd = octaves - floorf(octaves);
-  if (rmd != 0.0f) {
-    value += rmd * ((snoise_3d(p) + offset) * pwr);
+  if ((rmd != 0.0f) && (weight > 0.001f)) {
+    if (weight > 1.0f) {
+      weight = 1.0f;
+    }
+    float signal = (snoise_3d(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
@@ -632,13 +641,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_4d(
 {
   float4 p = co;
   float pwHL = powf(lacunarity, -H);
-  float pwr = pwHL;
 
-  float value = snoise_4d(p) + offset;
-  float weight = gain * value;
-  p *= lacunarity;
+  float pwr = 1.0f;
+  float value = 0.0f;
+  float weight = 1.0f;
 
-  for (int i = 1; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
+  for (int i = 0; (weight > 0.001f) && (i < float_to_int(octaves)); i++) {
     if (weight > 1.0f) {
       weight = 1.0f;
     }
@@ -651,8 +659,12 @@ ccl_device_noinline_cpu float noise_musgrave_hybrid_multi_fractal_4d(
   }
 
   float rmd = octaves - floorf(octaves);
-  if (rmd != 0.0f) {
-    value += rmd * ((snoise_4d(p) + offset) * pwr);
+  if ((rmd != 0.0f) && (weight > 0.001f)) {
+    if (weight > 1.0f) {
+      weight = 1.0f;
+    }
+    float signal = (snoise_4d(p) + offset) * pwr;
+    value += rmd * weight * signal;
   }
 
   return value;
diff --git a/intern/cycles/kernel/svm/node_types_template.h b/intern/cycles/kernel/svm/node_types_template.h
new file mode 100644
index 00000000000..aab9b9f1158
--- /dev/null
+++ b/intern/cycles/kernel/svm/node_types_template.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef SHADER_NODE_TYPE
+#  define SHADER_NODE_TYPE(name)
+#endif
+
+/* X-macro list: define SHADER_NODE_TYPE(name) before including; it is #undef'd at the end.
+ * NOTE: for best OpenCL performance, the item order must match the switch cases in `svm.h`. */
+
+SHADER_NODE_TYPE(NODE_END)
+SHADER_NODE_TYPE(NODE_SHADER_JUMP)
+SHADER_NODE_TYPE(NODE_CLOSURE_BSDF)
+SHADER_NODE_TYPE(NODE_CLOSURE_EMISSION)
+SHADER_NODE_TYPE(NODE_CLOSURE_BACKGROUND)
+SHADER_NODE_TYPE(NODE_CLOSURE_SET_WEIGHT)
+SHADER_NODE_TYPE(NODE_CLOSURE_WEIGHT)
+SHADER_NODE_TYPE(NODE_EMISSION_WEIGHT)
+SHADER_NODE_TYPE(NODE_MIX_CLOSURE)
+SHADER_NODE_TYPE(NODE_JUMP_IF_ZERO)
+SHADER_NODE_TYPE(NODE_JUMP_IF_ONE)
+SHADER_NODE_TYPE(NODE_GEOMETRY)
+SHADER_NODE_TYPE(NODE_CONVERT)
+SHADER_NODE_TYPE(NODE_TEX_COORD)
+SHADER_NODE_TYPE(NODE_VALUE_F)
+SHADER_NODE_TYPE(NODE_VALUE_V)
+SHADER_NODE_TYPE(NODE_ATTR)
+SHADER_NODE_TYPE(NODE_VERTEX_COLOR)
+SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DX)
+SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DY)
+SHADER_NODE_TYPE(NODE_SET_DISPLACEMENT)
+SHADER_NODE_TYPE(NODE_DISPLACEMENT)
+SHADER_NODE_TYPE(NODE_VECTOR_DISPLACEMENT)
+SHADER_NODE_TYPE(NODE_TEX_IMAGE)
+SHADER_NODE_TYPE(NODE_TEX_IMAGE_BOX)
+SHADER_NODE_TYPE(NODE_TEX_NOISE)
+SHADER_NODE_TYPE(NODE_SET_BUMP)
+SHADER_NODE_TYPE(NODE_ATTR_BUMP_DX)
+SHADER_NODE_TYPE(NODE_ATTR_BUMP_DY)
+SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DX)
+SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DY)
+SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DX)
+SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DY)
+SHADER_NODE_TYPE(NODE_CLOSURE_SET_NORMAL)
+SHADER_NODE_TYPE(NODE_ENTER_BUMP_EVAL)
+SHADER_NODE_TYPE(NODE_LEAVE_BUMP_EVAL)
+SHADER_NODE_TYPE(NODE_HSV)
+SHADER_NODE_TYPE(NODE_CLOSURE_HOLDOUT)
+SHADER_NODE_TYPE(NODE_FRESNEL)
+SHADER_NODE_TYPE(NODE_LAYER_WEIGHT)
+SHADER_NODE_TYPE(NODE_CLOSURE_VOLUME)
+SHADER_NODE_TYPE(NODE_PRINCIPLED_VOLUME)
+SHADER_NODE_TYPE(NODE_MATH)
+SHADER_NODE_TYPE(NODE_VECTOR_MATH)
+SHADER_NODE_TYPE(NODE_RGB_RAMP)
+SHADER_NODE_TYPE(NODE_GAMMA)
+SHADER_NODE_TYPE(NODE_BRIGHTCONTRAST)
+SHADER_NODE_TYPE(NODE_LIGHT_PATH)
+SHADER_NODE_TYPE(NODE_OBJECT_INFO)
+SHADER_NODE_TYPE(NODE_PARTICLE_INFO)
+SHADER_NODE_TYPE(NODE_HAIR_INFO)
+SHADER_NODE_TYPE(NODE_POINT_INFO)
+SHADER_NODE_TYPE(NODE_TEXTURE_MAPPING)
+SHADER_NODE_TYPE(NODE_MAPPING)
+SHADER_NODE_TYPE(NODE_MIN_MAX)
+SHADER_NODE_TYPE(NODE_CAMERA)
+SHADER_NODE_TYPE(NODE_TEX_ENVIRONMENT)
+SHADER_NODE_TYPE(NODE_TEX_SKY)
+SHADER_NODE_TYPE(NODE_TEX_GRADIENT)
+SHADER_NODE_TYPE(NODE_TEX_VORONOI)
+SHADER_NODE_TYPE(NODE_TEX_MUSGRAVE)
+SHADER_NODE_TYPE(NODE_TEX_WAVE)
+SHADER_NODE_TYPE(NODE_TEX_MAGIC)
+SHADER_NODE_TYPE(NODE_TEX_CHECKER)
+SHADER_NODE_TYPE(NODE_TEX_BRICK)
+SHADER_NODE_TYPE(NODE_TEX_WHITE_NOISE)
+SHADER_NODE_TYPE(NODE_NORMAL)
+SHADER_NODE_TYPE(NODE_LIGHT_FALLOFF)
+SHADER_NODE_TYPE(NODE_IES)
+SHADER_NODE_TYPE(NODE_CURVES)
+SHADER_NODE_TYPE(NODE_TANGENT)
+SHADER_NODE_TYPE(NODE_NORMAL_MAP)
+SHADER_NODE_TYPE(NODE_INVERT)
+SHADER_NODE_TYPE(NODE_MIX)
+SHADER_NODE_TYPE(NODE_SEPARATE_COLOR)
+SHADER_NODE_TYPE(NODE_COMBINE_COLOR)
+SHADER_NODE_TYPE(NODE_SEPARATE_VECTOR)
+SHADER_NODE_TYPE(NODE_COMBINE_VECTOR)
+SHADER_NODE_TYPE(NODE_SEPARATE_HSV)
+SHADER_NODE_TYPE(NODE_COMBINE_HSV)
+SHADER_NODE_TYPE(NODE_VECTOR_ROTATE)
+SHADER_NODE_TYPE(NODE_VECTOR_TRANSFORM)
+SHADER_NODE_TYPE(NODE_WIREFRAME)
+SHADER_NODE_TYPE(NODE_WAVELENGTH)
+SHADER_NODE_TYPE(NODE_BLACKBODY)
+SHADER_NODE_TYPE(NODE_MAP_RANGE)
+SHADER_NODE_TYPE(NODE_VECTOR_MAP_RANGE)
+SHADER_NODE_TYPE(NODE_CLAMP)
+SHADER_NODE_TYPE(NODE_BEVEL)
+SHADER_NODE_TYPE(NODE_AMBIENT_OCCLUSION)
+SHADER_NODE_TYPE(NODE_TEX_VOXEL)
+SHADER_NODE_TYPE(NODE_AOV_START)
+SHADER_NODE_TYPE(NODE_AOV_COLOR)
+SHADER_NODE_TYPE(NODE_AOV_VALUE)
+SHADER_NODE_TYPE(NODE_FLOAT_CURVE)
+SHADER_NODE_TYPE(NODE_MIX_COLOR)
+SHADER_NODE_TYPE(NODE_MIX_FLOAT)
+SHADER_NODE_TYPE(NODE_MIX_VECTOR)
+SHADER_NODE_TYPE(NODE_MIX_VECTOR_NON_UNIFORM)
+
+/* Padding for struct alignment. */
+SHADER_NODE_TYPE(NODE_PAD1)
+
+#undef SHADER_NODE_TYPE
diff --git a/intern/cycles/kernel/svm/ramp.h b/intern/cycles/kernel/svm/ramp.h
index 342b15da9ed..0df9268bd9c 100644
--- a/intern/cycles/kernel/svm/ramp.h
+++ b/intern/cycles/kernel/svm/ramp.h
@@ -9,7 +9,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline float fetch_float(KernelGlobals kg, int offset)
 {
-  uint4 node = kernel_tex_fetch(__svm_nodes, offset);
+  uint4 node = kernel_data_fetch(svm_nodes, offset);
   return __uint_as_float(node.x);
 }
 
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 5def943c87f..3ca632c5f0b 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -95,14 +95,14 @@ ccl_device_inline bool stack_valid(uint a)
 
 ccl_device_inline uint4 read_node(KernelGlobals kg, ccl_private int *offset)
 {
-  uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
+  uint4 node = kernel_data_fetch(svm_nodes, *offset);
   (*offset)++;
   return node;
 }
 
 ccl_device_inline float4 read_node_float(KernelGlobals kg, ccl_private int *offset)
 {
-  uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
+  uint4 node = kernel_data_fetch(svm_nodes, *offset);
   float4 f = make_float4(__uint_as_float(node.x),
                          __uint_as_float(node.y),
                          __uint_as_float(node.z),
@@ -113,7 +113,7 @@ ccl_device_inline float4 read_node_float(KernelGlobals kg, ccl_private int *offs
 
 ccl_device_inline float4 fetch_node_float(KernelGlobals kg, int offset)
 {
-  uint4 node = kernel_tex_fetch(__svm_nodes, offset);
+  uint4 node = kernel_data_fetch(svm_nodes, offset);
   return make_float4(__uint_as_float(node.x),
                      __uint_as_float(node.y),
                      __uint_as_float(node.z),
@@ -204,6 +204,15 @@ CCL_NAMESPACE_END
 
 CCL_NAMESPACE_BEGIN
 
+#ifdef __KERNEL_USE_DATA_CONSTANTS__ /* Break out early when the node's usage flag is false. */
+#  define SVM_CASE(node) \
+    case node: \
+      if (!kernel_data_svm_usage_##node) \
+        break;
+#else /* Without data constants, SVM_CASE is a plain case label. */
+#  define SVM_CASE(node) case node:
+#endif /* __KERNEL_USE_DATA_CONSTANTS__ */
+
 /* Main Interpreter Loop */
 template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
 ccl_device void svm_eval_nodes(KernelGlobals kg,
@@ -219,9 +228,10 @@ ccl_device void svm_eval_nodes(KernelGlobals kg,
     uint4 node = read_node(kg, &offset);
 
     switch (node.x) {
-      case NODE_END:
-        return;
-      case NODE_SHADER_JUMP: {
+      SVM_CASE(NODE_END)
+      return;
+      SVM_CASE(NODE_SHADER_JUMP)
+      {
         if (type == SHADER_TYPE_SURFACE)
           offset = node.y;
         else if (type == SHADER_TYPE_VOLUME)
@@ -232,351 +242,361 @@ ccl_device void svm_eval_nodes(KernelGlobals kg,
           return;
         break;
       }
-      case NODE_CLOSURE_BSDF:
-        offset = svm_node_closure_bsdf<node_feature_mask, type>(
-            kg, sd, stack, node, path_flag, offset);
-        break;
-      case NODE_CLOSURE_EMISSION:
-        IF_KERNEL_NODES_FEATURE(EMISSION)
-        {
-          svm_node_closure_emission(sd, stack, node);
-        }
-        break;
-      case NODE_CLOSURE_BACKGROUND:
-        IF_KERNEL_NODES_FEATURE(EMISSION)
-        {
-          svm_node_closure_background(sd, stack, node);
-        }
-        break;
-      case NODE_CLOSURE_SET_WEIGHT:
-        svm_node_closure_set_weight(sd, node.y, node.z, node.w);
-        break;
-      case NODE_CLOSURE_WEIGHT:
-        svm_node_closure_weight(sd, stack, node.y);
-        break;
-      case NODE_EMISSION_WEIGHT:
-        IF_KERNEL_NODES_FEATURE(EMISSION)
-        {
-          svm_node_emission_weight(kg, sd, stack, node);
-        }
-        break;
-      case NODE_MIX_CLOSURE:
-        svm_node_mix_closure(sd, stack, node);
-        break;
-      case NODE_JUMP_IF_ZERO:
-        if (stack_load_float(stack, node.z) == 0.0f)
-          offset += node.y;
-        break;
-      case NODE_JUMP_IF_ONE:
-        if (stack_load_float(stack, node.z) == 1.0f)
-          offset += node.y;
-        break;
-      case NODE_GEOMETRY:
-        svm_node_geometry(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_CONVERT:
-        svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_TEX_COORD:
-        offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
-        break;
-      case NODE_VALUE_F:
-        svm_node_value_f(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_VALUE_V:
-        offset = svm_node_value_v(kg, sd, stack, node.y, offset);
-        break;
-      case NODE_ATTR:
-        svm_node_attr<node_feature_mask>(kg, sd, stack, node);
-        break;
-      case NODE_VERTEX_COLOR:
-        svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_GEOMETRY_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
-        }
-        break;
-      case NODE_GEOMETRY_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
-        }
-        break;
-      case NODE_SET_DISPLACEMENT:
-        svm_node_set_displacement<node_feature_mask>(kg, sd, stack, node.y);
-        break;
-      case NODE_DISPLACEMENT:
-        svm_node_displacement<node_feature_mask>(kg, sd, stack, node);
-        break;
-      case NODE_VECTOR_DISPLACEMENT:
-        offset = svm_node_vector_displacement<node_feature_mask>(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_IMAGE:
-        offset = svm_node_tex_image(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_IMAGE_BOX:
-        svm_node_tex_image_box(kg, sd, stack, node);
-        break;
-      case NODE_TEX_NOISE:
-        offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_SET_BUMP:
-        svm_node_set_bump<node_feature_mask>(kg, sd, stack, node);
-        break;
-      case NODE_ATTR_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_attr_bump_dx(kg, sd, stack, node);
-        }
-        break;
-      case NODE_ATTR_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_attr_bump_dy(kg, sd, stack, node);
-        }
-        break;
-      case NODE_VERTEX_COLOR_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
-        }
-        break;
-      case NODE_VERTEX_COLOR_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
-        }
-        break;
-      case NODE_TEX_COORD_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
-        }
-        break;
-      case NODE_TEX_COORD_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
-        }
-        break;
-      case NODE_CLOSURE_SET_NORMAL:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_set_normal(kg, sd, stack, node.y, node.z);
-        }
-        break;
-      case NODE_ENTER_BUMP_EVAL:
-        IF_KERNEL_NODES_FEATURE(BUMP_STATE)
-        {
-          svm_node_enter_bump_eval(kg, sd, stack, node.y);
-        }
-        break;
-      case NODE_LEAVE_BUMP_EVAL:
-        IF_KERNEL_NODES_FEATURE(BUMP_STATE)
-        {
-          svm_node_leave_bump_eval(kg, sd, stack, node.y);
-        }
-        break;
-      case NODE_HSV:
-        svm_node_hsv(kg, sd, stack, node);
-        break;
-
-      case NODE_CLOSURE_HOLDOUT:
-        svm_node_closure_holdout(sd, stack, node);
-        break;
-      case NODE_FRESNEL:
-        svm_node_fresnel(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_LAYER_WEIGHT:
-        svm_node_layer_weight(sd, stack, node);
-        break;
-      case NODE_CLOSURE_VOLUME:
-        IF_KERNEL_NODES_FEATURE(VOLUME)
-        {
-          svm_node_closure_volume<type>(kg, sd, stack, node);
-        }
-        break;
-      case NODE_PRINCIPLED_VOLUME:
-        IF_KERNEL_NODES_FEATURE(VOLUME)
-        {
-          offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
-        }
-        break;
-      case NODE_MATH:
-        svm_node_math(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_VECTOR_MATH:
-        offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_RGB_RAMP:
-        offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
-        break;
-      case NODE_GAMMA:
-        svm_node_gamma(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_BRIGHTCONTRAST:
-        svm_node_brightness(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_LIGHT_PATH:
-        svm_node_light_path<node_feature_mask>(kg, state, sd, stack, node.y, node.z, path_flag);
-        break;
-      case NODE_OBJECT_INFO:
-        svm_node_object_info(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_PARTICLE_INFO:
-        svm_node_particle_info(kg, sd, stack, node.y, node.z);
-        break;
+      SVM_CASE(NODE_CLOSURE_BSDF)
+      offset = svm_node_closure_bsdf<node_feature_mask, type>(
+          kg, sd, stack, node, path_flag, offset);
+      break;
+      SVM_CASE(NODE_CLOSURE_EMISSION)
+      IF_KERNEL_NODES_FEATURE(EMISSION)
+      {
+        svm_node_closure_emission(sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_CLOSURE_BACKGROUND)
+      IF_KERNEL_NODES_FEATURE(EMISSION)
+      {
+        svm_node_closure_background(sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_CLOSURE_SET_WEIGHT)
+      svm_node_closure_set_weight(sd, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_CLOSURE_WEIGHT)
+      svm_node_closure_weight(sd, stack, node.y);
+      break;
+      SVM_CASE(NODE_EMISSION_WEIGHT)
+      IF_KERNEL_NODES_FEATURE(EMISSION)
+      {
+        svm_node_emission_weight(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_MIX_CLOSURE)
+      svm_node_mix_closure(sd, stack, node);
+      break;
+      SVM_CASE(NODE_JUMP_IF_ZERO)
+      if (stack_load_float(stack, node.z) <= 0.0f)
+        offset += node.y;
+      break;
+      SVM_CASE(NODE_JUMP_IF_ONE)
+      if (stack_load_float(stack, node.z) >= 1.0f)
+        offset += node.y;
+      break;
+      SVM_CASE(NODE_GEOMETRY)
+      svm_node_geometry(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_CONVERT)
+      svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_TEX_COORD)
+      offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+      break;
+      SVM_CASE(NODE_VALUE_F)
+      svm_node_value_f(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_VALUE_V)
+      offset = svm_node_value_v(kg, sd, stack, node.y, offset);
+      break;
+      SVM_CASE(NODE_ATTR)
+      svm_node_attr<node_feature_mask>(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_VERTEX_COLOR)
+      svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_GEOMETRY_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+      }
+      break;
+      SVM_CASE(NODE_GEOMETRY_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+      }
+      break;
+      SVM_CASE(NODE_SET_DISPLACEMENT)
+      svm_node_set_displacement<node_feature_mask>(kg, sd, stack, node.y);
+      break;
+      SVM_CASE(NODE_DISPLACEMENT)
+      svm_node_displacement<node_feature_mask>(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_VECTOR_DISPLACEMENT)
+      offset = svm_node_vector_displacement<node_feature_mask>(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_IMAGE)
+      offset = svm_node_tex_image(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_IMAGE_BOX)
+      svm_node_tex_image_box(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_NOISE)
+      offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_SET_BUMP)
+      svm_node_set_bump<node_feature_mask>(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_ATTR_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_attr_bump_dx(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_ATTR_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_attr_bump_dy(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_VERTEX_COLOR_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+      }
+      break;
+      SVM_CASE(NODE_VERTEX_COLOR_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+      }
+      break;
+      SVM_CASE(NODE_TEX_COORD_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
+      }
+      break;
+      SVM_CASE(NODE_TEX_COORD_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
+      }
+      break;
+      SVM_CASE(NODE_CLOSURE_SET_NORMAL)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_set_normal(kg, sd, stack, node.y, node.z);
+      }
+      break;
+      SVM_CASE(NODE_ENTER_BUMP_EVAL)
+      IF_KERNEL_NODES_FEATURE(BUMP_STATE)
+      {
+        svm_node_enter_bump_eval(kg, sd, stack, node.y);
+      }
+      break;
+      SVM_CASE(NODE_LEAVE_BUMP_EVAL)
+      IF_KERNEL_NODES_FEATURE(BUMP_STATE)
+      {
+        svm_node_leave_bump_eval(kg, sd, stack, node.y);
+      }
+      break;
+      SVM_CASE(NODE_HSV)
+      svm_node_hsv(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_CLOSURE_HOLDOUT)
+      svm_node_closure_holdout(sd, stack, node);
+      break;
+      SVM_CASE(NODE_FRESNEL)
+      svm_node_fresnel(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_LAYER_WEIGHT)
+      svm_node_layer_weight(sd, stack, node);
+      break;
+      SVM_CASE(NODE_CLOSURE_VOLUME)
+      IF_KERNEL_NODES_FEATURE(VOLUME)
+      {
+        svm_node_closure_volume<type>(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_PRINCIPLED_VOLUME)
+      IF_KERNEL_NODES_FEATURE(VOLUME)
+      {
+        offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
+      }
+      break;
+      SVM_CASE(NODE_MATH)
+      svm_node_math(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_VECTOR_MATH)
+      offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_RGB_RAMP)
+      offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_GAMMA)
+      svm_node_gamma(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_BRIGHTCONTRAST)
+      svm_node_brightness(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_LIGHT_PATH)
+      svm_node_light_path<node_feature_mask>(kg, state, sd, stack, node.y, node.z, path_flag);
+      break;
+      SVM_CASE(NODE_OBJECT_INFO)
+      svm_node_object_info(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_PARTICLE_INFO)
+      svm_node_particle_info(kg, sd, stack, node.y, node.z);
+      break;
 #if defined(__HAIR__)
-      case NODE_HAIR_INFO:
-        svm_node_hair_info(kg, sd, stack, node.y, node.z);
-        break;
+      SVM_CASE(NODE_HAIR_INFO)
+      svm_node_hair_info(kg, sd, stack, node.y, node.z);
+      break;
 #endif
 #if defined(__POINTCLOUD__)
-      case NODE_POINT_INFO:
-        svm_node_point_info(kg, sd, stack, node.y, node.z);
-        break;
+      SVM_CASE(NODE_POINT_INFO)
+      svm_node_point_info(kg, sd, stack, node.y, node.z);
+      break;
 #endif
-      case NODE_TEXTURE_MAPPING:
-        offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
-        break;
-      case NODE_MAPPING:
-        svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_MIN_MAX:
-        offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
-        break;
-      case NODE_CAMERA:
-        svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_TEX_ENVIRONMENT:
-        svm_node_tex_environment(kg, sd, stack, node);
-        break;
-      case NODE_TEX_SKY:
-        offset = svm_node_tex_sky(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_GRADIENT:
-        svm_node_tex_gradient(sd, stack, node);
-        break;
-      case NODE_TEX_VORONOI:
-        offset = svm_node_tex_voronoi<node_feature_mask>(
-            kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_TEX_MUSGRAVE:
-        offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_TEX_WAVE:
-        offset = svm_node_tex_wave(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_MAGIC:
-        offset = svm_node_tex_magic(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_CHECKER:
-        svm_node_tex_checker(kg, sd, stack, node);
-        break;
-      case NODE_TEX_BRICK:
-        offset = svm_node_tex_brick(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_WHITE_NOISE:
-        svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_NORMAL:
-        offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_LIGHT_FALLOFF:
-        svm_node_light_falloff(sd, stack, node);
-        break;
-      case NODE_IES:
-        svm_node_ies(kg, sd, stack, node);
-        break;
-      case NODE_RGB_CURVES:
-      case NODE_VECTOR_CURVES:
-        offset = svm_node_curves(kg, sd, stack, node, offset);
-        break;
-      case NODE_FLOAT_CURVE:
-        offset = svm_node_curve(kg, sd, stack, node, offset);
-        break;
-      case NODE_TANGENT:
-        svm_node_tangent(kg, sd, stack, node);
-        break;
-      case NODE_NORMAL_MAP:
-        svm_node_normal_map(kg, sd, stack, node);
-        break;
-      case NODE_INVERT:
-        svm_node_invert(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_MIX:
-        offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_SEPARATE_COLOR:
-        svm_node_separate_color(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_COMBINE_COLOR:
-        svm_node_combine_color(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_SEPARATE_VECTOR:
-        svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_COMBINE_VECTOR:
-        svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_SEPARATE_HSV:
-        offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_COMBINE_HSV:
-        offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_VECTOR_ROTATE:
-        svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_VECTOR_TRANSFORM:
-        svm_node_vector_transform(kg, sd, stack, node);
-        break;
-      case NODE_WIREFRAME:
-        svm_node_wireframe(kg, sd, stack, node);
-        break;
-      case NODE_WAVELENGTH:
-        svm_node_wavelength(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_BLACKBODY:
-        svm_node_blackbody(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_MAP_RANGE:
-        offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_VECTOR_MAP_RANGE:
-        offset = svm_node_vector_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_CLAMP:
-        offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
+      SVM_CASE(NODE_TEXTURE_MAPPING)
+      offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
+      break;
+      SVM_CASE(NODE_MAPPING)
+      svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_MIN_MAX)
+      offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
+      break;
+      SVM_CASE(NODE_CAMERA)
+      svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_TEX_ENVIRONMENT)
+      svm_node_tex_environment(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_SKY)
+      offset = svm_node_tex_sky(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_GRADIENT)
+      svm_node_tex_gradient(sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_VORONOI)
+      offset = svm_node_tex_voronoi<node_feature_mask>(
+          kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_TEX_MUSGRAVE)
+      offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_TEX_WAVE)
+      offset = svm_node_tex_wave(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_MAGIC)
+      offset = svm_node_tex_magic(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_CHECKER)
+      svm_node_tex_checker(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_BRICK)
+      offset = svm_node_tex_brick(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_WHITE_NOISE)
+      svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_NORMAL)
+      offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_LIGHT_FALLOFF)
+      svm_node_light_falloff(sd, stack, node);
+      break;
+      SVM_CASE(NODE_IES)
+      svm_node_ies(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_CURVES)
+      offset = svm_node_curves(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_FLOAT_CURVE)
+      offset = svm_node_curve(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TANGENT)
+      svm_node_tangent(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_NORMAL_MAP)
+      svm_node_normal_map(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_INVERT)
+      svm_node_invert(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_MIX)
+      offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_SEPARATE_COLOR)
+      svm_node_separate_color(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_COMBINE_COLOR)
+      svm_node_combine_color(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_SEPARATE_VECTOR)
+      svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_COMBINE_VECTOR)
+      svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_SEPARATE_HSV)
+      offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_COMBINE_HSV)
+      offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_VECTOR_ROTATE)
+      svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_VECTOR_TRANSFORM)
+      svm_node_vector_transform(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_WIREFRAME)
+      svm_node_wireframe(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_WAVELENGTH)
+      svm_node_wavelength(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_BLACKBODY)
+      svm_node_blackbody(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_MAP_RANGE)
+      offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_VECTOR_MAP_RANGE)
+      offset = svm_node_vector_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_CLAMP)
+      offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
 #ifdef __SHADER_RAYTRACE__
-      case NODE_BEVEL:
-        svm_node_bevel<node_feature_mask>(kg, state, sd, stack, node);
-        break;
-      case NODE_AMBIENT_OCCLUSION:
-        svm_node_ao<node_feature_mask>(kg, state, sd, stack, node);
-        break;
+      SVM_CASE(NODE_BEVEL)
+      svm_node_bevel<node_feature_mask>(kg, state, sd, stack, node);
+      break;
+      SVM_CASE(NODE_AMBIENT_OCCLUSION)
+      svm_node_ao<node_feature_mask>(kg, state, sd, stack, node);
+      break;
 #endif
 
-      case NODE_TEX_VOXEL:
-        IF_KERNEL_NODES_FEATURE(VOLUME)
-        {
-          offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
-        }
-        break;
-      case NODE_AOV_START:
-        if (!svm_node_aov_check(path_flag, render_buffer)) {
-          return;
-        }
-        break;
-      case NODE_AOV_COLOR:
-        svm_node_aov_color<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
-        break;
-      case NODE_AOV_VALUE:
-        svm_node_aov_value<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
-        break;
+      SVM_CASE(NODE_TEX_VOXEL)
+      IF_KERNEL_NODES_FEATURE(VOLUME)
+      {
+        offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
+      }
+      break;
+      SVM_CASE(NODE_AOV_START)
+      if (!svm_node_aov_check(path_flag, render_buffer)) {
+        return;
+      }
+      break;
+      SVM_CASE(NODE_AOV_COLOR)
+      svm_node_aov_color<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
+      break;
+      SVM_CASE(NODE_AOV_VALUE)
+      svm_node_aov_value<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
+      break;
+      SVM_CASE(NODE_MIX_COLOR)
+      svm_node_mix_color(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_MIX_FLOAT)
+      svm_node_mix_float(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_MIX_VECTOR)
+      svm_node_mix_vector(sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_MIX_VECTOR_NON_UNIFORM)
+      svm_node_mix_vector_non_uniform(sd, stack, node.y, node.z);
+      break;
       default:
         kernel_assert(!"Unknown node type was passed to the SVM machine");
         return;
diff --git a/intern/cycles/kernel/svm/tex_coord.h b/intern/cycles/kernel/svm/tex_coord.h
index d9138796c45..8154c542e6f 100644
--- a/intern/cycles/kernel/svm/tex_coord.h
+++ b/intern/cycles/kernel/svm/tex_coord.h
@@ -106,7 +106,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dx(KernelGlobals kg,
 
   switch (type) {
     case NODE_TEXCO_OBJECT: {
-      data = sd->P + sd->dP.dx;
+      data = svm_node_bump_P_dx(sd);
       if (node.w == 0) {
         if (sd->object != OBJECT_NONE) {
           object_inverse_position_transform(kg, sd, &data);
@@ -130,17 +130,17 @@ ccl_device_noinline int svm_node_tex_coord_bump_dx(KernelGlobals kg,
       Transform tfm = kernel_data.cam.worldtocamera;
 
       if (sd->object != OBJECT_NONE)
-        data = transform_point(&tfm, sd->P + sd->dP.dx);
+        data = transform_point(&tfm, svm_node_bump_P_dx(sd));
       else
-        data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
+        data = transform_point(&tfm, svm_node_bump_P_dx(sd) + camera_position(kg));
       break;
     }
     case NODE_TEXCO_WINDOW: {
       if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
           kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-        data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f));
+        data = camera_world_to_ndc(kg, sd, sd->ray_P);
       else
-        data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
+        data = camera_world_to_ndc(kg, sd, svm_node_bump_P_dx(sd));
       data.z = 0.0f;
       break;
     }
@@ -160,7 +160,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dx(KernelGlobals kg,
       break;
     }
     case NODE_TEXCO_VOLUME_GENERATED: {
-      data = sd->P + sd->dP.dx;
+      data = svm_node_bump_P_dx(sd);
 
 #  ifdef __VOLUME__
       if (sd->object != OBJECT_NONE)
@@ -191,7 +191,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dy(KernelGlobals kg,
 
   switch (type) {
     case NODE_TEXCO_OBJECT: {
-      data = sd->P + sd->dP.dy;
+      data = svm_node_bump_P_dy(sd);
       if (node.w == 0) {
         if (sd->object != OBJECT_NONE) {
           object_inverse_position_transform(kg, sd, &data);
@@ -215,17 +215,17 @@ ccl_device_noinline int svm_node_tex_coord_bump_dy(KernelGlobals kg,
       Transform tfm = kernel_data.cam.worldtocamera;
 
       if (sd->object != OBJECT_NONE)
-        data = transform_point(&tfm, sd->P + sd->dP.dy);
+        data = transform_point(&tfm, svm_node_bump_P_dy(sd));
       else
-        data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
+        data = transform_point(&tfm, svm_node_bump_P_dy(sd) + camera_position(kg));
       break;
     }
     case NODE_TEXCO_WINDOW: {
       if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
           kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-        data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f));
+        data = camera_world_to_ndc(kg, sd, sd->ray_P);
       else
-        data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
+        data = camera_world_to_ndc(kg, sd, svm_node_bump_P_dy(sd));
       data.z = 0.0f;
       break;
     }
@@ -245,7 +245,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dy(KernelGlobals kg,
       break;
     }
     case NODE_TEXCO_VOLUME_GENERATED: {
-      data = sd->P + sd->dP.dy;
+      data = svm_node_bump_P_dy(sd);
 
 #  ifdef __VOLUME__
       if (sd->object != OBJECT_NONE)
diff --git a/intern/cycles/kernel/svm/types.h b/intern/cycles/kernel/svm/types.h
index 82109ec4c4f..9dd8f196e0f 100644
--- a/intern/cycles/kernel/svm/types.h
+++ b/intern/cycles/kernel/svm/types.h
@@ -12,109 +12,14 @@ CCL_NAMESPACE_BEGIN
 /* SVM stack offsets with this value indicate that it's not on the stack */
 #define SVM_STACK_INVALID 255
 
-#define SVM_BUMP_EVAL_STATE_SIZE 9
+#define SVM_BUMP_EVAL_STATE_SIZE 4
 
 /* Nodes */
 
 typedef enum ShaderNodeType {
-  NODE_END = 0,
-  NODE_SHADER_JUMP,
-  NODE_CLOSURE_BSDF,
-  NODE_CLOSURE_EMISSION,
-  NODE_CLOSURE_BACKGROUND,
-  NODE_CLOSURE_SET_WEIGHT,
-  NODE_CLOSURE_WEIGHT,
-  NODE_EMISSION_WEIGHT,
-  NODE_MIX_CLOSURE,
-  NODE_JUMP_IF_ZERO,
-  NODE_JUMP_IF_ONE,
-  NODE_GEOMETRY,
-  NODE_CONVERT,
-  NODE_TEX_COORD,
-  NODE_VALUE_F,
-  NODE_VALUE_V,
-  NODE_ATTR,
-  NODE_VERTEX_COLOR,
-  NODE_GEOMETRY_BUMP_DX,
-  NODE_GEOMETRY_BUMP_DY,
-  NODE_SET_DISPLACEMENT,
-  NODE_DISPLACEMENT,
-  NODE_VECTOR_DISPLACEMENT,
-  NODE_TEX_IMAGE,
-  NODE_TEX_IMAGE_BOX,
-  NODE_TEX_NOISE,
-  NODE_SET_BUMP,
-  NODE_ATTR_BUMP_DX,
-  NODE_ATTR_BUMP_DY,
-  NODE_VERTEX_COLOR_BUMP_DX,
-  NODE_VERTEX_COLOR_BUMP_DY,
-  NODE_TEX_COORD_BUMP_DX,
-  NODE_TEX_COORD_BUMP_DY,
-  NODE_CLOSURE_SET_NORMAL,
-  NODE_ENTER_BUMP_EVAL,
-  NODE_LEAVE_BUMP_EVAL,
-  NODE_HSV,
-  NODE_CLOSURE_HOLDOUT,
-  NODE_FRESNEL,
-  NODE_LAYER_WEIGHT,
-  NODE_CLOSURE_VOLUME,
-  NODE_PRINCIPLED_VOLUME,
-  NODE_MATH,
-  NODE_VECTOR_MATH,
-  NODE_RGB_RAMP,
-  NODE_GAMMA,
-  NODE_BRIGHTCONTRAST,
-  NODE_LIGHT_PATH,
-  NODE_OBJECT_INFO,
-  NODE_PARTICLE_INFO,
-  NODE_HAIR_INFO,
-  NODE_POINT_INFO,
-  NODE_TEXTURE_MAPPING,
-  NODE_MAPPING,
-  NODE_MIN_MAX,
-  NODE_CAMERA,
-  NODE_TEX_ENVIRONMENT,
-  NODE_TEX_SKY,
-  NODE_TEX_GRADIENT,
-  NODE_TEX_VORONOI,
-  NODE_TEX_MUSGRAVE,
-  NODE_TEX_WAVE,
-  NODE_TEX_MAGIC,
-  NODE_TEX_CHECKER,
-  NODE_TEX_BRICK,
-  NODE_TEX_WHITE_NOISE,
-  NODE_NORMAL,
-  NODE_LIGHT_FALLOFF,
-  NODE_IES,
-  NODE_RGB_CURVES,
-  NODE_VECTOR_CURVES,
-  NODE_TANGENT,
-  NODE_NORMAL_MAP,
-  NODE_INVERT,
-  NODE_MIX,
-  NODE_SEPARATE_COLOR,
-  NODE_COMBINE_COLOR,
-  NODE_SEPARATE_VECTOR,
-  NODE_COMBINE_VECTOR,
-  NODE_SEPARATE_HSV,
-  NODE_COMBINE_HSV,
-  NODE_VECTOR_ROTATE,
-  NODE_VECTOR_TRANSFORM,
-  NODE_WIREFRAME,
-  NODE_WAVELENGTH,
-  NODE_BLACKBODY,
-  NODE_MAP_RANGE,
-  NODE_VECTOR_MAP_RANGE,
-  NODE_CLAMP,
-  NODE_BEVEL,
-  NODE_AMBIENT_OCCLUSION,
-  NODE_TEX_VOXEL,
-  NODE_AOV_START,
-  NODE_AOV_COLOR,
-  NODE_AOV_VALUE,
-  NODE_FLOAT_CURVE,
-  /* NOTE: for best OpenCL performance, item definition in the enum must
-   * match the switch case order in `svm.h`. */
+#define SHADER_NODE_TYPE(name) name,
+#include "node_types_template.h"
+  NODE_NUM /* Sentinel after all template entries; presumably the node type count. */
 } ShaderNodeType;
 
 typedef enum NodeAttributeOutputType {
@@ -228,7 +133,7 @@ typedef enum NodeMix {
   NODE_MIX_HUE,
   NODE_MIX_SAT,
   NODE_MIX_VAL,
-  NODE_MIX_COLOR,
+  NODE_MIX_COL,
   NODE_MIX_SOFT,
   NODE_MIX_LINEAR,
   NODE_MIX_CLAMP /* used for the clamp UI option */
diff --git a/intern/cycles/kernel/svm/voronoi.h b/intern/cycles/kernel/svm/voronoi.h
index 4ff1047aab7..53c1bda0904 100644
--- a/intern/cycles/kernel/svm/voronoi.h
+++ b/intern/cycles/kernel/svm/voronoi.h
@@ -1079,7 +1079,7 @@ ccl_device_noinline int svm_node_tex_voronoi(KernelGlobals kg,
         default:
           kernel_assert(0);
       }
-      position_out = safe_divide_float3_float(position_out, scale);
+      position_out = safe_divide(position_out, scale);
       break;
     }
 
@@ -1126,7 +1126,7 @@ ccl_device_noinline int svm_node_tex_voronoi(KernelGlobals kg,
           default:
             kernel_assert(0);
         }
-        position_out_4d = safe_divide_float4_float(position_out_4d, scale);
+        position_out_4d = safe_divide(position_out_4d, scale);
         position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
         w_out = position_out_4d.w;
       }
diff --git a/intern/cycles/kernel/svm/wireframe.h b/intern/cycles/kernel/svm/wireframe.h
index e5fe08e5d04..91fadf4cfc4 100644
--- a/intern/cycles/kernel/svm/wireframe.h
+++ b/intern/cycles/kernel/svm/wireframe.h
@@ -14,6 +14,7 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline float wireframe(KernelGlobals kg,
                                   ccl_private ShaderData *sd,
+                                  const differential3 dP,
                                   float size,
                                   int pixel_size,
                                   ccl_private float3 *P)
@@ -46,8 +47,8 @@ ccl_device_inline float wireframe(KernelGlobals kg,
     if (pixel_size) {
       // Project the derivatives of P to the viewing plane defined
       // by I so we have a measure of how big is a pixel at this point
-      float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
-      float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
+      float pixelwidth_x = len(dP.dx - dot(dP.dx, sd->I) * sd->I);
+      float pixelwidth_y = len(dP.dy - dot(dP.dy, sd->I) * sd->I);
       // Take the average of both axis' length
       pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
     }
@@ -86,16 +87,17 @@ ccl_device_noinline void svm_node_wireframe(KernelGlobals kg,
   int pixel_size = (int)use_pixel_size;
 
   /* Calculate wireframe */
-  float f = wireframe(kg, sd, size, pixel_size, &sd->P);
+  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+  float f = wireframe(kg, sd, dP, size, pixel_size, &sd->P);
 
   /* TODO(sergey): Think of faster way to calculate derivatives. */
   if (bump_offset == NODE_BUMP_OFFSET_DX) {
-    float3 Px = sd->P - sd->dP.dx;
-    f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
+    float3 Px = sd->P - dP.dx;
+    f += (f - wireframe(kg, sd, dP, size, pixel_size, &Px)) / len(dP.dx);
   }
   else if (bump_offset == NODE_BUMP_OFFSET_DY) {
-    float3 Py = sd->P - sd->dP.dy;
-    f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
+    float3 Py = sd->P - dP.dy;
+    f += (f - wireframe(kg, sd, dP, size, pixel_size, &Py)) / len(dP.dy);
   }
 
   if (stack_valid(out_fac))
diff --git a/intern/cycles/kernel/tables.h b/intern/cycles/kernel/tables.h
index c1fdbba3fa7..399eea1e2b1 100644
--- a/intern/cycles/kernel/tables.h
+++ b/intern/cycles/kernel/tables.h
@@ -63,4 +63,57 @@ ccl_inline_constant float cie_colour_match[][3] = {
   {0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}
 };
 
+/*
+ * Direction vectors for the first four dimensions of the Sobol sequence,
+ * one row of 32 entries per dimension, stored with reversed-order bits.
+ *
+ * This is used in the Sobol-Burley sampler implementation. We don't
+ * need more than four dimensions because we achieve higher dimensions
+ * with padding. They're stored with reversed bits because we need
+ * them reversed for the fast hash-based Owen scrambling anyway, and
+ * this avoids doing that at run time.
+ */
+ccl_inline_constant unsigned int sobol_burley_table[4][32] = {
+  { /* Dimension 0. */
+    0x00000001, 0x00000002, 0x00000004, 0x00000008,
+    0x00000010, 0x00000020, 0x00000040, 0x00000080,
+    0x00000100, 0x00000200, 0x00000400, 0x00000800,
+    0x00001000, 0x00002000, 0x00004000, 0x00008000,
+    0x00010000, 0x00020000, 0x00040000, 0x00080000,
+    0x00100000, 0x00200000, 0x00400000, 0x00800000,
+    0x01000000, 0x02000000, 0x04000000, 0x08000000,
+    0x10000000, 0x20000000, 0x40000000, 0x80000000,
+  },
+  { /* Dimension 1. */
+    0x00000001, 0x00000003, 0x00000005, 0x0000000f,
+    0x00000011, 0x00000033, 0x00000055, 0x000000ff,
+    0x00000101, 0x00000303, 0x00000505, 0x00000f0f,
+    0x00001111, 0x00003333, 0x00005555, 0x0000ffff,
+    0x00010001, 0x00030003, 0x00050005, 0x000f000f,
+    0x00110011, 0x00330033, 0x00550055, 0x00ff00ff,
+    0x01010101, 0x03030303, 0x05050505, 0x0f0f0f0f,
+    0x11111111, 0x33333333, 0x55555555, 0xffffffff,
+  },
+  { /* Dimension 2. */
+    0x00000001, 0x00000003, 0x00000006, 0x00000009,
+    0x00000017, 0x0000003a, 0x00000071, 0x000000a3,
+    0x00000116, 0x00000339, 0x00000677, 0x000009aa,
+    0x00001601, 0x00003903, 0x00007706, 0x0000aa09,
+    0x00010117, 0x0003033a, 0x00060671, 0x000909a3,
+    0x00171616, 0x003a3939, 0x00717777, 0x00a3aaaa,
+    0x01170001, 0x033a0003, 0x06710006, 0x09a30009,
+    0x16160017, 0x3939003a, 0x77770071, 0xaaaa00a3,
+  },
+  { /* Dimension 3. */
+    0x00000001, 0x00000003, 0x00000004, 0x0000000a,
+    0x0000001f, 0x0000002e, 0x00000045, 0x000000c9,
+    0x0000011b, 0x000002a4, 0x0000079a, 0x00000b67,
+    0x0000101e, 0x0000302d, 0x00004041, 0x0000a0c3,
+    0x0001f104, 0x0002e28a, 0x000457df, 0x000c9bae,
+    0x0011a105, 0x002a7289, 0x0079e7db, 0x00b6dba4,
+    0x0100011a, 0x030002a7, 0x0400079e, 0x0a000b6d,
+    0x1f001001, 0x2e003003, 0x45004004, 0xc900a00a,
+  },
+};
+
 /* clang-format on */
diff --git a/intern/cycles/kernel/textures.h b/intern/cycles/kernel/textures.h
deleted file mode 100644
index d8ac9cbe51f..00000000000
--- a/intern/cycles/kernel/textures.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#ifndef KERNEL_TEX
-#  define KERNEL_TEX(type, name)
-#endif
-
-/* BVH2, not used for OptiX or Embree. */
-KERNEL_TEX(float4, __bvh_nodes)
-KERNEL_TEX(float4, __bvh_leaf_nodes)
-KERNEL_TEX(uint, __prim_type)
-KERNEL_TEX(uint, __prim_visibility)
-KERNEL_TEX(uint, __prim_index)
-KERNEL_TEX(uint, __prim_object)
-KERNEL_TEX(uint, __object_node)
-KERNEL_TEX(float2, __prim_time)
-
-/* objects */
-KERNEL_TEX(KernelObject, __objects)
-KERNEL_TEX(Transform, __object_motion_pass)
-KERNEL_TEX(DecomposedTransform, __object_motion)
-KERNEL_TEX(uint, __object_flag)
-KERNEL_TEX(float, __object_volume_step)
-KERNEL_TEX(uint, __object_prim_offset)
-
-/* cameras */
-KERNEL_TEX(DecomposedTransform, __camera_motion)
-
-/* triangles */
-KERNEL_TEX(uint, __tri_shader)
-KERNEL_TEX(packed_float3, __tri_vnormal)
-KERNEL_TEX(uint4, __tri_vindex)
-KERNEL_TEX(uint, __tri_patch)
-KERNEL_TEX(float2, __tri_patch_uv)
-KERNEL_TEX(packed_float3, __tri_verts)
-
-/* curves */
-KERNEL_TEX(KernelCurve, __curves)
-KERNEL_TEX(float4, __curve_keys)
-KERNEL_TEX(KernelCurveSegment, __curve_segments)
-
-/* patches */
-KERNEL_TEX(uint, __patches)
-
-/* pointclouds */
-KERNEL_TEX(float4, __points)
-KERNEL_TEX(uint, __points_shader)
-
-/* attributes */
-KERNEL_TEX(AttributeMap, __attributes_map)
-KERNEL_TEX(float, __attributes_float)
-KERNEL_TEX(float2, __attributes_float2)
-KERNEL_TEX(packed_float3, __attributes_float3)
-KERNEL_TEX(float4, __attributes_float4)
-KERNEL_TEX(uchar4, __attributes_uchar4)
-
-/* lights */
-KERNEL_TEX(KernelLightDistribution, __light_distribution)
-KERNEL_TEX(KernelLight, __lights)
-KERNEL_TEX(float2, __light_background_marginal_cdf)
-KERNEL_TEX(float2, __light_background_conditional_cdf)
-
-/* particles */
-KERNEL_TEX(KernelParticle, __particles)
-
-/* shaders */
-KERNEL_TEX(uint4, __svm_nodes)
-KERNEL_TEX(KernelShader, __shaders)
-
-/* lookup tables */
-KERNEL_TEX(float, __lookup_table)
-
-/* sobol */
-KERNEL_TEX(float, __sample_pattern_lut)
-
-/* image textures */
-KERNEL_TEX(TextureInfo, __texture_info)
-
-/* ies lights */
-KERNEL_TEX(float, __ies)
-
-#undef KERNEL_TEX
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index ad022716207..bd3791594e0 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -19,10 +19,6 @@
 
 #include "kernel/svm/types.h"
 
-#ifndef __KERNEL_GPU__
-#  define __KERNEL_CPU__
-#endif
-
 CCL_NAMESPACE_BEGIN
 
 /* Constants */
@@ -51,57 +47,40 @@ CCL_NAMESPACE_BEGIN
 #define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024U
 #define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4U
 
-#ifdef __KERNEL_CPU__
-#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU
-#else
+#ifdef __KERNEL_GPU__
 #  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_GPU
+#else
+#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU
 #endif
 
 /* Kernel features */
-#define __SOBOL__
-#define __DPDU__
-#define __BACKGROUND__
+#define __AO__
 #define __CAUSTICS_TRICKS__
-#define __VISIBILITY_FLAG__
-#define __RAY_DIFFERENTIALS__
-#define __CAMERA_CLIPPING__
-#define __INTERSECTION_REFINE__
 #define __CLAMP_SAMPLE__
-#define __PATCH_EVAL__
-#define __SHADOW_CATCHER__
 #define __DENOISING_FEATURES__
-#define __SHADER_RAYTRACE__
-#define __AO__
-#define __PASSES__
+#define __DPDU__
 #define __HAIR__
+#define __OBJECT_MOTION__
+#define __PASSES__
+#define __PATCH_EVAL__
 #define __POINTCLOUD__
+#define __RAY_DIFFERENTIALS__
+#define __SHADER_RAYTRACE__
+#define __SHADOW_CATCHER__
+#define __SHADOW_RECORD_ALL__
+#define __SUBSURFACE__
 #define __SVM__
-#define __EMISSION__
-#define __HOLDOUT__
 #define __TRANSPARENT_SHADOWS__
-#define __BACKGROUND_MIS__
-#define __LAMP_MIS__
-#define __CAMERA_MOTION__
-#define __OBJECT_MOTION__
-#define __BAKING__
-#define __PRINCIPLED__
-#define __SUBSURFACE__
+#define __VISIBILITY_FLAG__
 #define __VOLUME__
-#define __CMJ__
-#define __SHADOW_RECORD_ALL__
-#define __BRANCHED_PATH__
 
 /* Device specific features */
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
 #  define __VOLUME_RECORD_ALL__
-#endif /* __KERNEL_CPU__ */
-
-#ifdef __KERNEL_GPU_RAYTRACING__
-#  undef __BAKING__
-#endif /* __KERNEL_GPU_RAYTRACING__ */
+#endif /* !__KERNEL_GPU__ */
 
 /* MNEE currently causes "Compute function exceeds available temporary registers"
  * on Metal, disabled for now. */
@@ -111,9 +90,6 @@ CCL_NAMESPACE_BEGIN
 
 /* Scene-based selective features compilation. */
 #ifdef __KERNEL_FEATURES__
-#  if !(__KERNEL_FEATURES & KERNEL_FEATURE_CAMERA_MOTION)
-#    undef __CAMERA_MOTION__
-#  endif
 #  if !(__KERNEL_FEATURES & KERNEL_FEATURE_OBJECT_MOTION)
 #    undef __OBJECT_MOTION__
 #  endif
@@ -129,9 +105,6 @@ CCL_NAMESPACE_BEGIN
 #  if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE)
 #    undef __SUBSURFACE__
 #  endif
-#  if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING)
-#    undef __BAKING__
-#  endif
 #  if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION)
 #    undef __PATCH_EVAL__
 #  endif
@@ -141,9 +114,6 @@ CCL_NAMESPACE_BEGIN
 #  if !(__KERNEL_FEATURES & KERNEL_FEATURE_SHADOW_CATCHER)
 #    undef __SHADOW_CATCHER__
 #  endif
-#  if !(__KERNEL_FEATURES & KERNEL_FEATURE_PRINCIPLED)
-#    undef __PRINCIPLED__
-#  endif
 #  if !(__KERNEL_FEATURES & KERNEL_FEATURE_DENOISING)
 #    undef __DENOISING_FEATURES__
 #  endif
@@ -159,36 +129,48 @@ CCL_NAMESPACE_BEGIN
 #  define __BVH_LOCAL__
 #endif
 
-/* Path Tracing
- * note we need to keep the u/v pairs at even values */
+/* Sampling Patterns */
 
+/* Unique numbers for sampling patterns in each bounce. Numbers restart at 0 per bounce kind. */
 enum PathTraceDimension {
-  PRNG_FILTER_U = 0,
-  PRNG_FILTER_V = 1,
-  PRNG_LENS_U = 2,
-  PRNG_LENS_V = 3,
-  PRNG_TIME = 4,
-  PRNG_UNUSED_0 = 5,
-  PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */
-  PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */
-  PRNG_BASE_NUM = 10,
-
-  PRNG_BSDF_U = 0,
-  PRNG_BSDF_V = 1,
-  PRNG_LIGHT_U = 2,
-  PRNG_LIGHT_V = 3,
-  PRNG_LIGHT_TERMINATE = 4,
-  PRNG_TERMINATE = 5,
-  PRNG_PHASE_CHANNEL = 6,
-  PRNG_SCATTER_DISTANCE = 7,
-  PRNG_BOUNCE_NUM = 8,
-
-  PRNG_BEVEL_U = 6, /* reuse volume dimension, correlation won't harm */
-  PRNG_BEVEL_V = 7,
+  /* Init bounce */
+  PRNG_FILTER = 0,
+  PRNG_LENS = 1,
+  PRNG_TIME = 2,
+
+  /* Shade bounce */
+  PRNG_TERMINATE = 0,
+  PRNG_LIGHT = 1,
+  PRNG_LIGHT_TERMINATE = 2,
+  /* Surface (continues after the shared shade bounce dimensions). */
+  PRNG_SURFACE_BSDF = 3,
+  PRNG_SURFACE_AO = 4,
+  PRNG_SURFACE_BEVEL = 5,
+  /* Volume (restarts at 3, overlapping the Surface numbers). */
+  PRNG_VOLUME_PHASE = 3,
+  PRNG_VOLUME_PHASE_CHANNEL = 4,
+  PRNG_VOLUME_SCATTER_DISTANCE = 5,
+  PRNG_VOLUME_OFFSET = 6,
+  PRNG_VOLUME_SHADE_OFFSET = 7,
+
+  /* Subsurface random walk bounces */
+  PRNG_SUBSURFACE_BSDF = 0,
+  PRNG_SUBSURFACE_PHASE_CHANNEL = 1,
+  PRNG_SUBSURFACE_SCATTER_DISTANCE = 2,
+  PRNG_SUBSURFACE_GUIDE_STRATEGY = 3,
+  PRNG_SUBSURFACE_GUIDE_DIRECTION = 4,
+
+  /* Subsurface disk bounce */
+  PRNG_SUBSURFACE_DISK = 0,
+  PRNG_SUBSURFACE_DISK_RESAMPLE = 1,
+
+  /* High enough number so we don't need to change it when adding new dimensions,
+   * low enough so there is no uint16_t overflow with many bounces. */
+  PRNG_BOUNCE_NUM = 16,
 };
 
 enum SamplingPattern {
-  SAMPLING_PATTERN_SOBOL = 0,
+  SAMPLING_PATTERN_SOBOL_BURLEY = 0,
   SAMPLING_PATTERN_PMJ = 1,
 
   SAMPLING_NUM_PATTERNS,
@@ -425,9 +407,9 @@ typedef enum CryptomatteType {
 } CryptomatteType;
 
 typedef struct BsdfEval {
-  float3 diffuse;
-  float3 glossy;
-  float3 sum;
+  Spectrum diffuse;
+  Spectrum glossy;
+  Spectrum sum;
 } BsdfEval;
 
 /* Closure Filter */
@@ -535,7 +517,8 @@ typedef struct RaySelfPrimitives {
 typedef struct Ray {
   float3 P;   /* origin */
   float3 D;   /* direction */
-  float t;    /* length of the ray */
+  float tmin; /* start distance */
+  float tmax; /* end distance */
   float time; /* time (for motion blur) */
 
   RaySelfPrimitives self;
@@ -672,12 +655,11 @@ typedef struct AttributeDescriptor {
 
 /* For looking up attributes on objects and geometry. */
 typedef struct AttributeMap {
-  uint id;       /* Global unique identifier. */
-  uint element;  /* AttributeElement. */
-  int offset;    /* Offset into __attributes global arrays. */
-  uint8_t type;  /* NodeAttributeType. */
-  uint8_t flags; /* AttributeFlag. */
-  uint8_t pad[2];
+  uint64_t id;      /* Global unique identifier. */
+  int offset;       /* Offset into __attributes global arrays. */
+  uint16_t element; /* AttributeElement. */
+  uint8_t type;     /* NodeAttributeType. */
+  uint8_t flags;    /* AttributeFlag. */
 } AttributeMap;
 
 /* Closure data */
@@ -720,7 +702,7 @@ typedef struct AttributeMap {
  * padded to be 16 bytes, while it's only 12 bytes on the GPU. */
 
 #define SHADER_CLOSURE_BASE \
-  float3 weight; \
+  Spectrum weight; \
   ClosureType type; \
   float sample_weight; \
   float3 N
@@ -729,10 +711,9 @@ typedef struct ccl_align(16) ShaderClosure
 {
   SHADER_CLOSURE_BASE;
 
-#ifdef __KERNEL_CPU__
-  float pad[2];
-#endif
-  float data[10];
+  /* Extra space for closures to store data, somewhat arbitrary but closures
+   * assert that their size fits. */
+  char pad[sizeof(Spectrum) * 2 + sizeof(float) * 4];
 }
 ShaderClosure;
 
@@ -885,10 +866,10 @@ typedef struct ccl_align(16) ShaderData
   float ray_length;
 
 #ifdef __RAY_DIFFERENTIALS__
-  /* differential of P. these are orthogonal to Ng, not N */
-  differential3 dP;
-  /* differential of I */
-  differential3 dI;
+  /* Radius of differential of P. */
+  float dP;
+  /* Radius of differential of I. */
+  float dI;
   /* differential of u, v */
   differential du;
   differential dv;
@@ -923,12 +904,12 @@ typedef struct ccl_align(16) ShaderData
   /* Closure data, we store a fixed array of closures */
   int num_closure;
   int num_closure_left;
-  float3 svm_closure_weight;
+  Spectrum svm_closure_weight;
 
   /* Closure weights summed directly, so we can evaluate
    * emission and shadow transparency with MAX_CLOSURE 0. */
-  float3 closure_emission_background;
-  float3 closure_transparent_extinction;
+  Spectrum closure_emission_background;
+  Spectrum closure_transparent_extinction;
 
   /* At the end so we can adjust size in ShaderDataTinyStorage. */
   struct ShaderClosure closure[MAX_CLOSURE];
@@ -959,7 +940,7 @@ ShaderDataCausticsStorage;
  * Used for decoupled direct/indirect light closure storage. */
 
 typedef struct ShaderVolumeClosure {
-  float3 weight;
+  Spectrum weight;
   float sample_weight;
   float g;
 } ShaderVolumeClosure;
@@ -1072,94 +1053,6 @@ typedef struct KernelCamera {
 } KernelCamera;
 static_assert_align(KernelCamera, 16);
 
-typedef struct KernelFilm {
-  float exposure;
-  int pass_flag;
-
-  int light_pass_flag;
-  int pass_stride;
-
-  int pass_combined;
-  int pass_depth;
-  int pass_position;
-  int pass_normal;
-  int pass_roughness;
-  int pass_motion;
-
-  int pass_motion_weight;
-  int pass_uv;
-  int pass_object_id;
-  int pass_material_id;
-
-  int pass_diffuse_color;
-  int pass_glossy_color;
-  int pass_transmission_color;
-
-  int pass_diffuse_indirect;
-  int pass_glossy_indirect;
-  int pass_transmission_indirect;
-  int pass_volume_indirect;
-
-  int pass_diffuse_direct;
-  int pass_glossy_direct;
-  int pass_transmission_direct;
-  int pass_volume_direct;
-
-  int pass_emission;
-  int pass_background;
-  int pass_ao;
-  float pass_alpha_threshold;
-
-  int pass_shadow;
-  float pass_shadow_scale;
-
-  int pass_shadow_catcher;
-  int pass_shadow_catcher_sample_count;
-  int pass_shadow_catcher_matte;
-
-  int filter_table_offset;
-
-  int cryptomatte_passes;
-  int cryptomatte_depth;
-  int pass_cryptomatte;
-
-  int pass_adaptive_aux_buffer;
-  int pass_sample_count;
-
-  int pass_mist;
-  float mist_start;
-  float mist_inv_depth;
-  float mist_falloff;
-
-  int pass_denoising_normal;
-  int pass_denoising_albedo;
-  int pass_denoising_depth;
-
-  int pass_aov_color;
-  int pass_aov_value;
-  int pass_lightgroup;
-
-  /* XYZ to rendering color space transform. float4 instead of float3 to
-   * ensure consistent padding/alignment across devices. */
-  float4 xyz_to_r;
-  float4 xyz_to_g;
-  float4 xyz_to_b;
-  float4 rgb_to_y;
-  /* Rec709 to rendering color space. */
-  float4 rec709_to_r;
-  float4 rec709_to_g;
-  float4 rec709_to_b;
-  int is_rec709;
-
-  int pass_bake_primitive;
-  int pass_bake_differential;
-
-  int use_approximate_shadow_catcher;
-
-  int pad1;
-} KernelFilm;
-static_assert_align(KernelFilm, 16);
-
 typedef struct KernelFilmConvert {
   int pass_offset;
   int pass_stride;
@@ -1201,108 +1094,6 @@ typedef struct KernelFilmConvert {
 } KernelFilmConvert;
 static_assert_align(KernelFilmConvert, 16);
 
-typedef struct KernelBackground {
-  /* only shader index */
-  int surface_shader;
-  int volume_shader;
-  float volume_step_size;
-  int transparent;
-  float transparent_roughness_squared_threshold;
-
-  /* portal sampling */
-  float portal_weight;
-  int num_portals;
-  int portal_offset;
-
-  /* sun sampling */
-  float sun_weight;
-  /* xyz store direction, w the angle. float4 instead of float3 is used
-   * to ensure consistent padding/alignment across devices. */
-  float4 sun;
-
-  /* map sampling */
-  float map_weight;
-  int map_res_x;
-  int map_res_y;
-
-  int use_mis;
-
-  int lightgroup;
-
-  /* Padding */
-  int pad1, pad2;
-} KernelBackground;
-static_assert_align(KernelBackground, 16);
-
-typedef struct KernelIntegrator {
-  /* emission */
-  int use_direct_light;
-  int num_distribution;
-  int num_all_lights;
-  float pdf_triangles;
-  float pdf_lights;
-  float light_inv_rr_threshold;
-
-  /* bounces */
-  int min_bounce;
-  int max_bounce;
-
-  int max_diffuse_bounce;
-  int max_glossy_bounce;
-  int max_transmission_bounce;
-  int max_volume_bounce;
-
-  /* AO bounces */
-  int ao_bounces;
-  float ao_bounces_distance;
-  float ao_bounces_factor;
-  float ao_additive_factor;
-
-  /* transparent */
-  int transparent_min_bounce;
-  int transparent_max_bounce;
-  int transparent_shadows;
-
-  /* caustics */
-  int caustics_reflective;
-  int caustics_refractive;
-  float filter_glossy;
-
-  /* seed */
-  int seed;
-
-  /* clamp */
-  float sample_clamp_direct;
-  float sample_clamp_indirect;
-
-  /* mis */
-  int use_lamp_mis;
-
-  /* caustics */
-  int use_caustics;
-
-  /* sampler */
-  int sampling_pattern;
-
-  /* volume render */
-  int use_volumes;
-  int volume_max_steps;
-  float volume_step_rate;
-
-  int has_shadow_catcher;
-  float scrambling_distance;
-
-  /* Closure filter. */
-  int filter_closures;
-
-  /* MIS debugging. */
-  int direct_light_sampling_type;
-
-  /* padding */
-  int pad1;
-} KernelIntegrator;
-static_assert_align(KernelIntegrator, 16);
-
 typedef enum KernelBVHLayout {
   BVH_LAYOUT_NONE = 0,
 
@@ -1320,36 +1111,25 @@ typedef enum KernelBVHLayout {
   BVH_LAYOUT_ALL = BVH_LAYOUT_BVH2 | BVH_LAYOUT_EMBREE | BVH_LAYOUT_OPTIX | BVH_LAYOUT_METAL,
 } KernelBVHLayout;
 
-typedef struct KernelBVH {
-  /* Own BVH */
-  int root;
-  int have_motion;
-  int have_curves;
-  int bvh_layout;
-  int use_bvh_steps;
-  int curve_subdivisions;
+/* Specialized structs from data_template.h that can become constants in dynamic compilation. */
+#define KERNEL_STRUCT_BEGIN(name, parent) struct name {
+#define KERNEL_STRUCT_END(name) \
+  } \
+  ; \
+  static_assert_align(name, 16);
 
-  /* Custom BVH */
-#ifdef __KERNEL_OPTIX__
-  OptixTraversableHandle scene;
-#elif defined __METALRT__
-  metalrt_as_type scene;
+#ifdef __KERNEL_USE_DATA_CONSTANTS__
+#  define KERNEL_STRUCT_MEMBER(parent, type, name) type __unused_##name;
 #else
-#  ifdef __EMBREE__
-  RTCScene scene;
-#    ifndef __KERNEL_64_BIT__
-  int pad2;
-#    endif
-#  else
-  int scene, pad2;
-#  endif
+#  define KERNEL_STRUCT_MEMBER(parent, type, name) type name;
 #endif
-} KernelBVH;
-static_assert_align(KernelBVH, 16);
+
+#include "kernel/data_template.h"
 
 typedef struct KernelTables {
   int beckmann_offset;
-  int pad1, pad2, pad3;
+  int filter_table_offset;
+  int pad1, pad2;
 } KernelTables;
 static_assert_align(KernelTables, 16);
 
@@ -1362,18 +1142,37 @@ typedef struct KernelBake {
 static_assert_align(KernelBake, 16);
 
 typedef struct KernelData {
+  /* Features and limits. */
   uint kernel_features;
   uint max_closures;
   uint max_shaders;
   uint volume_stack_size;
 
+  /* Always dynamic data members. */
   KernelCamera cam;
-  KernelFilm film;
-  KernelBackground background;
-  KernelIntegrator integrator;
-  KernelBVH bvh;
-  KernelTables tables;
   KernelBake bake;
+  KernelTables tables;
+
+  /* Potentially specialized data members: each KERNEL_STRUCT_BEGIN expands to `name parent;`. */
+#define KERNEL_STRUCT_BEGIN(name, parent) name parent;
+#include "kernel/data_template.h"
+
+  /* Device specific BVH. */
+#ifdef __KERNEL_OPTIX__
+  OptixTraversableHandle device_bvh;
+#elif defined __METALRT__
+  metalrt_as_type device_bvh;
+#else
+#  ifdef __EMBREE__
+  RTCScene device_bvh;
+#    ifndef __KERNEL_64_BIT__
+  int pad1;
+#    endif
+#  else
+  int device_bvh, pad1;
+#  endif
+#endif
+  int pad2, pad3; /* Padding; see static_assert_align(KernelData, 16) below. */
 } KernelData;
 static_assert_align(KernelData, 16);
 
@@ -1557,10 +1356,14 @@ typedef struct KernelShaderEvalInput {
 } KernelShaderEvalInput;
 static_assert_align(KernelShaderEvalInput, 16);
 
-/* Pre-computed sample table sizes for PMJ02 sampler. */
+/* Pre-computed sample table sizes for PMJ02 sampler.
+ *
+ * NOTE: divisions *must* be a power of two, and patterns
+ * ideally should be as well.
+ */
 #define NUM_PMJ_DIVISIONS 32
 #define NUM_PMJ_SAMPLES ((NUM_PMJ_DIVISIONS) * (NUM_PMJ_DIVISIONS))
-#define NUM_PMJ_PATTERNS 1
+#define NUM_PMJ_PATTERNS 64
 
 /* Device kernels.
  *
@@ -1571,7 +1374,7 @@ static_assert_align(KernelShaderEvalInput, 16);
  * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified.
  * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. */
 
-typedef enum DeviceKernel {
+typedef enum DeviceKernel : int {
   DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0,
   DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE,
   DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
@@ -1667,42 +1470,38 @@ enum KernelFeatureFlag : uint32_t {
   KERNEL_FEATURE_HAIR = (1U << 12U),
   KERNEL_FEATURE_HAIR_THICK = (1U << 13U),
   KERNEL_FEATURE_OBJECT_MOTION = (1U << 14U),
-  KERNEL_FEATURE_CAMERA_MOTION = (1U << 15U),
 
   /* Denotes whether baking functionality is needed. */
-  KERNEL_FEATURE_BAKING = (1U << 16U),
+  KERNEL_FEATURE_BAKING = (1U << 15U),
 
   /* Use subsurface scattering materials. */
-  KERNEL_FEATURE_SUBSURFACE = (1U << 17U),
+  KERNEL_FEATURE_SUBSURFACE = (1U << 16U),
 
   /* Use volume materials. */
-  KERNEL_FEATURE_VOLUME = (1U << 18U),
+  KERNEL_FEATURE_VOLUME = (1U << 17U),
 
   /* Use OpenSubdiv patch evaluation */
-  KERNEL_FEATURE_PATCH_EVALUATION = (1U << 19U),
+  KERNEL_FEATURE_PATCH_EVALUATION = (1U << 18U),
 
   /* Use Transparent shadows */
-  KERNEL_FEATURE_TRANSPARENT = (1U << 20U),
+  KERNEL_FEATURE_TRANSPARENT = (1U << 19U),
 
   /* Use shadow catcher. */
-  KERNEL_FEATURE_SHADOW_CATCHER = (1U << 21U),
-
-  /* Per-uber shader usage flags. */
-  KERNEL_FEATURE_PRINCIPLED = (1U << 22U),
+  KERNEL_FEATURE_SHADOW_CATCHER = (1U << 29U), /* NOTE(review): bit 20 skipped; is 29 intended? */
 
   /* Light render passes. */
-  KERNEL_FEATURE_LIGHT_PASSES = (1U << 23U),
+  KERNEL_FEATURE_LIGHT_PASSES = (1U << 21U),
 
   /* Shadow render pass. */
-  KERNEL_FEATURE_SHADOW_PASS = (1U << 24U),
+  KERNEL_FEATURE_SHADOW_PASS = (1U << 22U),
 
   /* AO. */
-  KERNEL_FEATURE_AO_PASS = (1U << 25U),
-  KERNEL_FEATURE_AO_ADDITIVE = (1U << 26U),
+  KERNEL_FEATURE_AO_PASS = (1U << 23U),
+  KERNEL_FEATURE_AO_ADDITIVE = (1U << 24U),
   KERNEL_FEATURE_AO = (KERNEL_FEATURE_AO_PASS | KERNEL_FEATURE_AO_ADDITIVE),
 
   /* MNEE. */
-  KERNEL_FEATURE_MNEE = (1U << 27U),
+  KERNEL_FEATURE_MNEE = (1U << 25U),
 };
 
 /* Shader node feature mask, to specialize shader evaluation for kernels. */
@@ -1729,15 +1528,15 @@ enum KernelFeatureFlag : uint32_t {
 /* Must be constexpr on the CPU to avoid compile errors because the state types
  * are different depending on the main, shadow or null path. For GPU we don't have
  * C++17 everywhere so can't use it. */
-#ifdef __KERNEL_CPU__
+#ifdef __KERNEL_GPU__
+#  define IF_KERNEL_FEATURE(feature) if ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
+#  define IF_KERNEL_NODES_FEATURE(feature) \
+    if ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
+#else
 #  define IF_KERNEL_FEATURE(feature) \
     if constexpr ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
 #  define IF_KERNEL_NODES_FEATURE(feature) \
     if constexpr ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
-#else
-#  define IF_KERNEL_FEATURE(feature) if ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
-#  define IF_KERNEL_NODES_FEATURE(feature) \
-    if ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
 #endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/util/color.h b/intern/cycles/kernel/util/color.h
index c85ef262d88..4983b9048d4 100644
--- a/intern/cycles/kernel/util/color.h
+++ b/intern/cycles/kernel/util/color.h
@@ -33,4 +33,19 @@ ccl_device float linear_rgb_to_gray(KernelGlobals kg, float3 c)
   return dot(c, float4_to_float3(kernel_data.film.rgb_to_y));
 }
 
+ccl_device_inline Spectrum rgb_to_spectrum(float3 rgb)
+{
+  return rgb; /* Pass-through: the float3 is returned as the Spectrum unchanged. */
+}
+
+ccl_device_inline float3 spectrum_to_rgb(Spectrum s)
+{
+  return s; /* Pass-through: the Spectrum is returned as the float3 unchanged. */
+}
+
+ccl_device float spectrum_to_gray(KernelGlobals kg, Spectrum c)
+{
+  return linear_rgb_to_gray(kg, spectrum_to_rgb(c)); /* Gray value is computed on the RGB form. */
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/util/differential.h b/intern/cycles/kernel/util/differential.h
index 3682e91ea66..aad9bb6bb22 100644
--- a/intern/cycles/kernel/util/differential.h
+++ b/intern/cycles/kernel/util/differential.h
@@ -101,53 +101,59 @@ ccl_device differential3 differential3_zero()
   return d;
 }
 
-/* Compact ray differentials that are just a scale to reduce memory usage and
- * access cost in GPU.
+/* Compact ray differentials that are just a radius to reduce memory usage and access cost
+ * on GPUs, basically cone tracing.
  *
- * See above for more accurate reference implementations.
- *
- * TODO: also store the more compact version in ShaderData and recompute where
- * needed? */
+ * See above for more accurate reference implementations of ray differentials. */
 
 ccl_device_forceinline float differential_zero_compact()
 {
   return 0.0f;
 }
 
-ccl_device_forceinline float differential_make_compact(const differential3 D)
+ccl_device_forceinline float differential_make_compact(const float dD)
 {
-  return 0.5f * (len(D.dx) + len(D.dy));
+  return dD;
 }
 
-ccl_device_forceinline void differential_transfer_compact(ccl_private differential3 *surface_dP,
-                                                          const float ray_dP,
-                                                          const float3 /* ray_D */,
-                                                          const float ray_dD,
-                                                          const float3 surface_Ng,
-                                                          const float ray_t)
+ccl_device_forceinline float differential_make_compact(const differential3 dD)
 {
-  /* ray differential transfer through homogeneous medium, to
-   * compute dPdx/dy at a shading point from the incoming ray */
-  float scale = ray_dP + ray_t * ray_dD;
+  return 0.5f * (len(dD.dx) + len(dD.dy));
+}
 
-  float3 dx, dy;
-  make_orthonormals(surface_Ng, &dx, &dy);
-  surface_dP->dx = dx * scale;
-  surface_dP->dy = dy * scale;
+ccl_device_forceinline float differential_incoming_compact(const float dD)
+{
+  return dD;
 }
 
-ccl_device_forceinline void differential_incoming_compact(ccl_private differential3 *dI,
-                                                          const float3 D,
-                                                          const float dD)
+ccl_device_forceinline float differential_transfer_compact(const float ray_dP,
+                                                           const float3 /* ray_D */,
+                                                           const float ray_dD,
+                                                           const float ray_t)
 {
-  /* compute dIdx/dy at a shading point, we just need to negate the
-   * differential of the ray direction */
+  return ray_dP + ray_t * ray_dD; /* Radius at distance ray_t: start radius plus linear spread. */
+}
 
+ccl_device_forceinline differential3 differential_from_compact(const float3 D, const float dD)
+{
   float3 dx, dy;
   make_orthonormals(D, &dx, &dy);
 
-  dI->dx = dD * dx;
-  dI->dy = dD * dy;
+  differential3 d; /* dx/dy from make_orthonormals(D), each scaled by the radius dD. */
+  d.dx = dD * dx;
+  d.dy = dD * dy;
+  return d;
+}
+
+ccl_device void differential_dudv_compact(ccl_private differential *du,
+                                          ccl_private differential *dv,
+                                          float3 dPdu,
+                                          float3 dPdv,
+                                          float dP,
+                                          float3 Ng)
+{
+  /* Expands dP around Ng and defers to differential_dudv(). TODO: can we speed this up? */
+  differential_dudv(du, dv, dPdu, dPdv, differential_from_compact(Ng, dP), Ng);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/util/lookup_table.h b/intern/cycles/kernel/util/lookup_table.h
index e19e2ce5bd1..4db4dadab0e 100644
--- a/intern/cycles/kernel/util/lookup_table.h
+++ b/intern/cycles/kernel/util/lookup_table.h
@@ -15,11 +15,11 @@ ccl_device float lookup_table_read(KernelGlobals kg, float x, int offset, int si
   int nindex = min(index + 1, size - 1);
   float t = x - index;
 
-  float data0 = kernel_tex_fetch(__lookup_table, index + offset);
+  float data0 = kernel_data_fetch(lookup_table, index + offset);
   if (t == 0.0f)
     return data0;
 
-  float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
+  float data1 = kernel_data_fetch(lookup_table, nindex + offset);
   return (1.0f - t) * data0 + t * data1;
 }
 
diff --git a/intern/cycles/kernel/util/profiling.h b/intern/cycles/kernel/util/profiling.h
index 39cabd35967..b8afaf1166d 100644
--- a/intern/cycles/kernel/util/profiling.h
+++ b/intern/cycles/kernel/util/profiling.h
@@ -3,13 +3,13 @@
 
 #pragma once
 
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  include "util/profiling.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  define PROFILING_INIT(kg, event) \
     ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event)
 #  define PROFILING_EVENT(event) profiling_helper.set_event(event)
@@ -22,6 +22,6 @@ CCL_NAMESPACE_BEGIN
 #  define PROFILING_EVENT(event)
 #  define PROFILING_INIT_FOR_SHADER(kg, event)
 #  define PROFILING_SHADER(object, shader)
-#endif /* __KERNEL_CPU__ */
+#endif /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END