Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBrecht Van Lommel <brecht@blender.org>2021-09-20 18:59:20 +0300
committerBrecht Van Lommel <brecht@blender.org>2021-09-21 15:55:54 +0300
commit08031197250aeecbaca3803254e6f25b8c7b7b37 (patch)
tree6fe7ab045f0dc0a423d6557c4073f34309ef4740 /intern/cycles/kernel
parentfa6b1007bad065440950cd67deb16a04f368856f (diff)
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity, new shadow catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume sampling, improved PMJ sampling pattern, and more. Some features have also been removed or changed, breaking backwards compatibility. Including the removal of the OpenCL backend, for which alternatives are under development. Release notes and code docs: https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles https://wiki.blender.org/wiki/Source/Render/Cycles Credits: * Sergey Sharybin * Brecht Van Lommel * Patrick Mours (OptiX backend) * Christophe Hery (subsurface scattering anisotropy) * William Leeson (PMJ sampling pattern) * Alaska (various fixes and tweaks) * Thomas Dinges (various fixes) For the full commit history, see the cycles-x branch. This squashes together all the changes since intermediate changes would often fail building or tests. Ref T87839, T87837, T87836 Fixes T90734, T89353, T80267, T80267, T77185, T69800
Diffstat (limited to 'intern/cycles/kernel')
-rw-r--r--intern/cycles/kernel/CMakeLists.txt314
-rw-r--r--intern/cycles/kernel/bvh/bvh.h32
-rw-r--r--intern/cycles/kernel/bvh/bvh_embree.h21
-rw-r--r--intern/cycles/kernel/bvh/bvh_local.h8
-rw-r--r--intern/cycles/kernel/bvh/bvh_nodes.h10
-rw-r--r--intern/cycles/kernel/bvh/bvh_shadow_all.h105
-rw-r--r--intern/cycles/kernel/bvh/bvh_traversal.h26
-rw-r--r--intern/cycles/kernel/bvh/bvh_types.h5
-rw-r--r--intern/cycles/kernel/bvh/bvh_util.h110
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume.h13
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume_all.h14
-rw-r--r--intern/cycles/kernel/closure/alloc.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf.h91
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h25
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h15
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse.h13
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse_ramp.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_hair.h14
-rw-r--r--intern/cycles/kernel/closure/bsdf_hair_principled.h25
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h31
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h6
-rw-r--r--intern/cycles/kernel/closure/bsdf_oren_nayar.h13
-rw-r--r--intern/cycles/kernel/closure/bsdf_phong_ramp.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_diffuse.h15
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_sheen.h7
-rw-r--r--intern/cycles/kernel/closure/bsdf_reflection.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_refraction.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_toon.h14
-rw-r--r--intern/cycles/kernel/closure/bsdf_transparent.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_util.h5
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h406
-rw-r--r--intern/cycles/kernel/closure/emissive.h2
-rw-r--r--intern/cycles/kernel/closure/volume.h109
-rw-r--r--intern/cycles/kernel/device/cpu/compat.h (renamed from intern/cycles/kernel/kernel_compat_cpu.h)59
-rw-r--r--intern/cycles/kernel/device/cpu/globals.h61
-rw-r--r--intern/cycles/kernel/device/cpu/image.h (renamed from intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h)9
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.h (renamed from intern/cycles/kernel/kernel.h)25
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch.h113
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch_impl.h235
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_avx.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_avx2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse3.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse41.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp)4
-rw-r--r--intern/cycles/kernel/device/cuda/compat.h (renamed from intern/cycles/kernel/kernel_compat_cuda.h)139
-rw-r--r--intern/cycles/kernel/device/cuda/config.h114
-rw-r--r--intern/cycles/kernel/device/cuda/globals.h48
-rw-r--r--intern/cycles/kernel/device/cuda/kernel.cu (renamed from intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl)18
-rw-r--r--intern/cycles/kernel/device/gpu/image.h (renamed from intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h)55
-rw-r--r--intern/cycles/kernel/device/gpu/kernel.h843
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_active_index.h83
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_prefix_sum.h46
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_reduce.h83
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_sorted_index.h49
-rw-r--r--intern/cycles/kernel/device/optix/compat.h (renamed from intern/cycles/kernel/kernel_compat_optix.h)90
-rw-r--r--intern/cycles/kernel/device/optix/globals.h59
-rw-r--r--intern/cycles/kernel/device/optix/kernel.cu (renamed from intern/cycles/kernel/kernels/optix/kernel_optix.cu)168
-rw-r--r--intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu29
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h72
-rw-r--r--intern/cycles/kernel/filter/filter_features.h156
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h118
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h254
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h255
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h303
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h140
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h120
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h129
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h129
-rw-r--r--intern/cycles/kernel/geom/geom.h3
-rw-r--r--intern/cycles/kernel/geom/geom_attribute.h12
-rw-r--r--intern/cycles/kernel/geom/geom_curve.h21
-rw-r--r--intern/cycles/kernel/geom/geom_curve_intersect.h68
-rw-r--r--intern/cycles/kernel/geom/geom_motion_curve.h12
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle.h12
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle_intersect.h76
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle_shader.h16
-rw-r--r--intern/cycles/kernel/geom/geom_object.h243
-rw-r--r--intern/cycles/kernel/geom/geom_patch.h20
-rw-r--r--intern/cycles/kernel/geom/geom_primitive.h39
-rw-r--r--intern/cycles/kernel/geom/geom_shader_data.h373
-rw-r--r--intern/cycles/kernel/geom/geom_subd_triangle.h29
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h37
-rw-r--r--intern/cycles/kernel/geom/geom_triangle_intersect.h81
-rw-r--r--intern/cycles/kernel/geom/geom_volume.h6
-rw-r--r--intern/cycles/kernel/integrator/integrator_init_from_bake.h181
-rw-r--r--intern/cycles/kernel/integrator/integrator_init_from_camera.h120
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_closest.h248
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_shadow.h144
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_subsurface.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl)27
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h198
-rw-r--r--intern/cycles/kernel/integrator/integrator_megakernel.h93
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_background.h215
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_light.h126
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_shadow.h182
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_surface.h502
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_volume.h1015
-rw-r--r--intern/cycles/kernel/integrator/integrator_state.h185
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_flow.h144
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_template.h163
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_util.h273
-rw-r--r--intern/cycles/kernel/integrator/integrator_subsurface.h623
-rw-r--r--intern/cycles/kernel/integrator/integrator_volume_stack.h223
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h972
-rw-r--r--intern/cycles/kernel/kernel_adaptive_sampling.h274
-rw-r--r--intern/cycles/kernel/kernel_bake.h514
-rw-r--r--intern/cycles/kernel/kernel_camera.h72
-rw-r--r--intern/cycles/kernel/kernel_color.h9
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h177
-rw-r--r--intern/cycles/kernel/kernel_differential.h73
-rw-r--r--intern/cycles/kernel/kernel_emission.h374
-rw-r--r--intern/cycles/kernel/kernel_film.h567
-rw-r--r--intern/cycles/kernel/kernel_globals.h248
-rw-r--r--intern/cycles/kernel/kernel_id_passes.h35
-rw-r--r--intern/cycles/kernel/kernel_jitter.h252
-rw-r--r--intern/cycles/kernel/kernel_light.h406
-rw-r--r--intern/cycles/kernel/kernel_light_background.h25
-rw-r--r--intern/cycles/kernel/kernel_light_common.h6
-rw-r--r--intern/cycles/kernel/kernel_lookup_table.h56
-rw-r--r--intern/cycles/kernel/kernel_math.h5
-rw-r--r--intern/cycles/kernel/kernel_montecarlo.h5
-rw-r--r--intern/cycles/kernel/kernel_passes.h414
-rw-r--r--intern/cycles/kernel/kernel_path.h709
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h556
-rw-r--r--intern/cycles/kernel/kernel_path_common.h48
-rw-r--r--intern/cycles/kernel/kernel_path_state.h383
-rw-r--r--intern/cycles/kernel/kernel_path_subsurface.h139
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h360
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h260
-rw-r--r--intern/cycles/kernel/kernel_profiling.h24
-rw-r--r--intern/cycles/kernel/kernel_projection.h5
-rw-r--r--intern/cycles/kernel/kernel_queues.h147
-rw-r--r--intern/cycles/kernel/kernel_random.h228
-rw-r--r--intern/cycles/kernel/kernel_shader.h1043
-rw-r--r--intern/cycles/kernel/kernel_shadow.h466
-rw-r--r--intern/cycles/kernel/kernel_shadow_catcher.h116
-rw-r--r--intern/cycles/kernel/kernel_subsurface.h724
-rw-r--r--intern/cycles/kernel/kernel_textures.h2
-rw-r--r--intern/cycles/kernel/kernel_types.h1030
-rw-r--r--intern/cycles/kernel/kernel_volume.h1440
-rw-r--r--intern/cycles/kernel/kernel_work_stealing.h87
-rw-r--r--intern/cycles/kernel/kernel_write_passes.h53
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h143
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h331
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp38
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h100
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h232
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split.cpp62
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp41
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp42
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp38
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu413
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel.cu232
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_config.h121
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu156
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl321
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_background.cl35
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_bake.cl36
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_base.cl88
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_data_init.cl53
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_displace.cl36
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h358
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_path_init.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl27
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl34
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_function.h67
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl24
-rw-r--r--intern/cycles/kernel/osl/background.cpp2
-rw-r--r--intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp2
-rw-r--r--intern/cycles/kernel/osl/bsdf_phong_ramp.cpp2
-rw-r--r--intern/cycles/kernel/osl/emissive.cpp2
-rw-r--r--intern/cycles/kernel/osl/osl_bssrdf.cpp40
-rw-r--r--intern/cycles/kernel/osl/osl_closures.cpp8
-rw-r--r--intern/cycles/kernel/osl/osl_services.cpp158
-rw-r--r--intern/cycles/kernel/osl/osl_services.h16
-rw-r--r--intern/cycles/kernel/osl/osl_shader.cpp40
-rw-r--r--intern/cycles/kernel/osl/osl_shader.h26
-rw-r--r--intern/cycles/kernel/shaders/node_principled_bsdf.osl31
-rw-r--r--intern/cycles/kernel/shaders/node_subsurface_scattering.osl25
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h43
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_x.h30
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_y.h29
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_stopping.h37
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h231
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h154
-rw-r--r--intern/cycles/kernel/split/kernel_data_init.h115
-rw-r--r--intern/cycles/kernel/split/kernel_direct_lighting.h152
-rw-r--r--intern/cycles/kernel/split/kernel_do_volume.h227
-rw-r--r--intern/cycles/kernel/split/kernel_enqueue_inactive.h46
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h149
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_background.h69
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_subsurface.h67
-rw-r--r--intern/cycles/kernel/split/kernel_lamp_emission.h67
-rw-r--r--intern/cycles/kernel/split/kernel_next_iteration_setup.h258
-rw-r--r--intern/cycles/kernel/split/kernel_path_init.h78
-rw-r--r--intern/cycles/kernel/split/kernel_queue_enqueue.h87
-rw-r--r--intern/cycles/kernel/split/kernel_scene_intersect.h83
-rw-r--r--intern/cycles/kernel/split/kernel_shader_eval.h69
-rw-r--r--intern/cycles/kernel/split/kernel_shader_setup.h74
-rw-r--r--intern/cycles/kernel/split/kernel_shader_sort.h97
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_ao.h59
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h98
-rw-r--r--intern/cycles/kernel/split/kernel_split_common.h106
-rw-r--r--intern/cycles/kernel/split/kernel_split_data.h77
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h180
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h264
-rw-r--r--intern/cycles/kernel/svm/svm.h227
-rw-r--r--intern/cycles/kernel/svm/svm_ao.h53
-rw-r--r--intern/cycles/kernel/svm/svm_aov.h42
-rw-r--r--intern/cycles/kernel/svm/svm_attribute.h57
-rw-r--r--intern/cycles/kernel/svm/svm_bevel.h143
-rw-r--r--intern/cycles/kernel/svm/svm_blackbody.h7
-rw-r--r--intern/cycles/kernel/svm/svm_brick.h11
-rw-r--r--intern/cycles/kernel/svm/svm_brightness.h2
-rw-r--r--intern/cycles/kernel/svm/svm_bump.h16
-rw-r--r--intern/cycles/kernel/svm/svm_camera.h12
-rw-r--r--intern/cycles/kernel/svm/svm_checker.h5
-rw-r--r--intern/cycles/kernel/svm/svm_clamp.h17
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h121
-rw-r--r--intern/cycles/kernel/svm/svm_convert.h4
-rw-r--r--intern/cycles/kernel/svm/svm_displace.h21
-rw-r--r--intern/cycles/kernel/svm/svm_fresnel.h4
-rw-r--r--intern/cycles/kernel/svm/svm_gamma.h2
-rw-r--r--intern/cycles/kernel/svm/svm_geometry.h24
-rw-r--r--intern/cycles/kernel/svm/svm_gradient.h2
-rw-r--r--intern/cycles/kernel/svm/svm_hsv.h6
-rw-r--r--intern/cycles/kernel/svm/svm_ies.h10
-rw-r--r--intern/cycles/kernel/svm/svm_image.h26
-rw-r--r--intern/cycles/kernel/svm/svm_invert.h2
-rw-r--r--intern/cycles/kernel/svm/svm_light_path.h50
-rw-r--r--intern/cycles/kernel/svm/svm_magic.h7
-rw-r--r--intern/cycles/kernel/svm/svm_map_range.h19
-rw-r--r--intern/cycles/kernel/svm/svm_mapping.h41
-rw-r--r--intern/cycles/kernel/svm/svm_math.h30
-rw-r--r--intern/cycles/kernel/svm/svm_mix.h17
-rw-r--r--intern/cycles/kernel/svm/svm_musgrave.h19
-rw-r--r--intern/cycles/kernel/svm/svm_noise.h10
-rw-r--r--intern/cycles/kernel/svm/svm_noisetex.h19
-rw-r--r--intern/cycles/kernel/svm/svm_normal.h17
-rw-r--r--intern/cycles/kernel/svm/svm_ramp.h34
-rw-r--r--intern/cycles/kernel/svm/svm_sepcomb_hsv.h34
-rw-r--r--intern/cycles/kernel/svm/svm_sky.h33
-rw-r--r--intern/cycles/kernel/svm/svm_tex_coord.h55
-rw-r--r--intern/cycles/kernel/svm/svm_types.h43
-rw-r--r--intern/cycles/kernel/svm/svm_value.h9
-rw-r--r--intern/cycles/kernel/svm/svm_vector_rotate.h10
-rw-r--r--intern/cycles/kernel/svm/svm_vector_transform.h8
-rw-r--r--intern/cycles/kernel/svm/svm_vertex_color.h48
-rw-r--r--intern/cycles/kernel/svm/svm_voronoi.h148
-rw-r--r--intern/cycles/kernel/svm/svm_voxel.h11
-rw-r--r--intern/cycles/kernel/svm/svm_wave.h9
-rw-r--r--intern/cycles/kernel/svm/svm_wavelength.h4
-rw-r--r--intern/cycles/kernel/svm/svm_white_noise.h13
-rw-r--r--intern/cycles/kernel/svm/svm_wireframe.h18
281 files changed, 11938 insertions, 20277 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 0ce33c51778..4196539a9b1 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -22,68 +22,22 @@ set(INC_SYS
)
-set(SRC_CPU_KERNELS
- kernels/cpu/kernel.cpp
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- kernels/cpu/kernel_split_avx.cpp
- kernels/cpu/kernel_split_avx2.cpp
- kernels/cpu/filter.cpp
- kernels/cpu/filter_sse2.cpp
- kernels/cpu/filter_sse3.cpp
- kernels/cpu/filter_sse41.cpp
- kernels/cpu/filter_avx.cpp
- kernels/cpu/filter_avx2.cpp
+set(SRC_DEVICE_CPU
+ device/cpu/kernel.cpp
+ device/cpu/kernel_sse2.cpp
+ device/cpu/kernel_sse3.cpp
+ device/cpu/kernel_sse41.cpp
+ device/cpu/kernel_avx.cpp
+ device/cpu/kernel_avx2.cpp
)
-set(SRC_CUDA_KERNELS
- kernels/cuda/kernel.cu
- kernels/cuda/kernel_split.cu
- kernels/cuda/filter.cu
+set(SRC_DEVICE_CUDA
+ device/cuda/kernel.cu
)
-set(SRC_OPENCL_KERNELS
- kernels/opencl/kernel_adaptive_stopping.cl
- kernels/opencl/kernel_adaptive_filter_x.cl
- kernels/opencl/kernel_adaptive_filter_y.cl
- kernels/opencl/kernel_adaptive_adjust_samples.cl
- kernels/opencl/kernel_bake.cl
- kernels/opencl/kernel_base.cl
- kernels/opencl/kernel_displace.cl
- kernels/opencl/kernel_background.cl
- kernels/opencl/kernel_state_buffer_size.cl
- kernels/opencl/kernel_split_bundle.cl
- kernels/opencl/kernel_data_init.cl
- kernels/opencl/kernel_path_init.cl
- kernels/opencl/kernel_queue_enqueue.cl
- kernels/opencl/kernel_scene_intersect.cl
- kernels/opencl/kernel_lamp_emission.cl
- kernels/opencl/kernel_do_volume.cl
- kernels/opencl/kernel_indirect_background.cl
- kernels/opencl/kernel_shader_setup.cl
- kernels/opencl/kernel_shader_sort.cl
- kernels/opencl/kernel_shader_eval.cl
- kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
- kernels/opencl/kernel_subsurface_scatter.cl
- kernels/opencl/kernel_direct_lighting.cl
- kernels/opencl/kernel_shadow_blocked_ao.cl
- kernels/opencl/kernel_shadow_blocked_dl.cl
- kernels/opencl/kernel_enqueue_inactive.cl
- kernels/opencl/kernel_next_iteration_setup.cl
- kernels/opencl/kernel_indirect_subsurface.cl
- kernels/opencl/kernel_buffer_update.cl
- kernels/opencl/filter.cl
-)
-
-set(SRC_OPTIX_KERNELS
- kernels/optix/kernel_optix.cu
+set(SRC_DEVICE_OPTIX
+ device/optix/kernel.cu
+ device/optix/kernel_shader_raytrace.cu
)
set(SRC_BVH_HEADERS
@@ -105,63 +59,56 @@ set(SRC_HEADERS
kernel_bake.h
kernel_camera.h
kernel_color.h
- kernel_compat_cpu.h
- kernel_compat_cuda.h
- kernel_compat_optix.h
- kernel_compat_opencl.h
kernel_differential.h
kernel_emission.h
kernel_film.h
- kernel_globals.h
kernel_id_passes.h
kernel_jitter.h
kernel_light.h
kernel_light_background.h
kernel_light_common.h
+ kernel_lookup_table.h
kernel_math.h
kernel_montecarlo.h
kernel_passes.h
- kernel_path.h
- kernel_path_branched.h
- kernel_path_common.h
kernel_path_state.h
- kernel_path_surface.h
- kernel_path_subsurface.h
- kernel_path_volume.h
kernel_profiling.h
kernel_projection.h
- kernel_queues.h
kernel_random.h
kernel_shader.h
- kernel_shadow.h
- kernel_subsurface.h
+ kernel_shadow_catcher.h
kernel_textures.h
kernel_types.h
- kernel_volume.h
kernel_work_stealing.h
kernel_write_passes.h
)
-set(SRC_KERNELS_CPU_HEADERS
- kernel.h
- kernels/cpu/kernel_cpu.h
- kernels/cpu/kernel_cpu_impl.h
- kernels/cpu/kernel_cpu_image.h
- kernels/cpu/filter_cpu.h
- kernels/cpu/filter_cpu_impl.h
+set(SRC_DEVICE_CPU_HEADERS
+ device/cpu/compat.h
+ device/cpu/image.h
+ device/cpu/globals.h
+ device/cpu/kernel.h
+ device/cpu/kernel_arch.h
+ device/cpu/kernel_arch_impl.h
)
-
-set(SRC_KERNELS_CUDA_HEADERS
- kernels/cuda/kernel_config.h
- kernels/cuda/kernel_cuda_image.h
+set(SRC_DEVICE_GPU_HEADERS
+ device/gpu/image.h
+ device/gpu/kernel.h
+ device/gpu/parallel_active_index.h
+ device/gpu/parallel_prefix_sum.h
+ device/gpu/parallel_reduce.h
+ device/gpu/parallel_sorted_index.h
)
-set(SRC_KERNELS_OPTIX_HEADERS
+set(SRC_DEVICE_CUDA_HEADERS
+ device/cuda/compat.h
+ device/cuda/config.h
+ device/cuda/globals.h
)
-set(SRC_KERNELS_OPENCL_HEADERS
- kernels/opencl/kernel_split_function.h
- kernels/opencl/kernel_opencl_image.h
+set(SRC_DEVICE_OPTIX_HEADERS
+ device/optix/compat.h
+ device/optix/globals.h
)
set(SRC_CLOSURE_HEADERS
@@ -259,25 +206,32 @@ set(SRC_GEOM_HEADERS
geom/geom_object.h
geom/geom_patch.h
geom/geom_primitive.h
+ geom/geom_shader_data.h
geom/geom_subd_triangle.h
geom/geom_triangle.h
geom/geom_triangle_intersect.h
geom/geom_volume.h
)
-set(SRC_FILTER_HEADERS
- filter/filter.h
- filter/filter_defines.h
- filter/filter_features.h
- filter/filter_features_sse.h
- filter/filter_kernel.h
- filter/filter_nlm_cpu.h
- filter/filter_nlm_gpu.h
- filter/filter_prefilter.h
- filter/filter_reconstruction.h
- filter/filter_transform.h
- filter/filter_transform_gpu.h
- filter/filter_transform_sse.h
+set(SRC_INTEGRATOR_HEADERS
+ integrator/integrator_init_from_bake.h
+ integrator/integrator_init_from_camera.h
+ integrator/integrator_intersect_closest.h
+ integrator/integrator_intersect_shadow.h
+ integrator/integrator_intersect_subsurface.h
+ integrator/integrator_intersect_volume_stack.h
+ integrator/integrator_megakernel.h
+ integrator/integrator_shade_background.h
+ integrator/integrator_shade_light.h
+ integrator/integrator_shade_shadow.h
+ integrator/integrator_shade_surface.h
+ integrator/integrator_shade_volume.h
+ integrator/integrator_state.h
+ integrator/integrator_state_flow.h
+ integrator/integrator_state_template.h
+ integrator/integrator_state_util.h
+ integrator/integrator_subsurface.h
+ integrator/integrator_volume_stack.h
)
set(SRC_UTIL_HEADERS
@@ -333,36 +287,6 @@ set(SRC_UTIL_HEADERS
../util/util_types_vector3_impl.h
)
-set(SRC_SPLIT_HEADERS
- split/kernel_adaptive_adjust_samples.h
- split/kernel_adaptive_filter_x.h
- split/kernel_adaptive_filter_y.h
- split/kernel_adaptive_stopping.h
- split/kernel_branched.h
- split/kernel_buffer_update.h
- split/kernel_data_init.h
- split/kernel_direct_lighting.h
- split/kernel_do_volume.h
- split/kernel_enqueue_inactive.h
- split/kernel_holdout_emission_blurring_pathtermination_ao.h
- split/kernel_indirect_background.h
- split/kernel_indirect_subsurface.h
- split/kernel_lamp_emission.h
- split/kernel_next_iteration_setup.h
- split/kernel_path_init.h
- split/kernel_queue_enqueue.h
- split/kernel_scene_intersect.h
- split/kernel_shader_setup.h
- split/kernel_shader_sort.h
- split/kernel_shader_eval.h
- split/kernel_shadow_blocked_ao.h
- split/kernel_shadow_blocked_dl.h
- split/kernel_split_common.h
- split/kernel_split_data.h
- split/kernel_split_data_types.h
- split/kernel_subsurface_scatter.h
-)
-
set(LIB
)
@@ -393,21 +317,17 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
# build for each arch
- set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
+ set(cuda_sources device/cuda/kernel.cu
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
- set(cuda_filter_sources kernels/cuda/filter.cu
- ${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_FILTER_HEADERS}
- ${SRC_UTIL_HEADERS}
- )
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental)
@@ -427,7 +347,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
endif()
- set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+ set(cuda_kernel_src "/device/cuda/${name}.cu")
set(cuda_flags ${flags}
-D CCL_NAMESPACE_BEGIN=
@@ -435,7 +355,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
-D NVCC
-m ${CUDA_BITS}
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
- -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
+ -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
--use_fast_math
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
@@ -523,14 +443,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
if(DEFINED cuda_nvcc_executable AND DEFINED cuda_toolkit_root_dir)
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} filter "" "${cuda_filter_sources}" FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${cuda_sources}" FALSE)
- if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
- # Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel_split "-D __SPLIT__" "${cuda_sources}" FALSE)
- endif()
-
if(WITH_CYCLES_CUDA_BUILD_SERIAL)
set(prev_arch ${arch})
endif()
@@ -547,15 +461,15 @@ endif()
# OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
- macro(CYCLES_OPTIX_KERNEL_ADD name flags)
- set(input "kernels/optix/kernel_optix.cu")
+ macro(CYCLES_OPTIX_KERNEL_ADD name input flags)
set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
set(cuda_flags ${flags}
-I "${OPTIX_INCLUDE_DIR}"
-I "${CMAKE_CURRENT_SOURCE_DIR}/.."
- -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda"
--use_fast_math
+ -Wno-deprecated-gpu-targets
-o ${output})
if(WITH_NANOVDB)
@@ -580,11 +494,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND ${CUBIN_CC_ENV}
@@ -603,11 +519,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND
@@ -624,8 +542,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
endmacro()
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__")
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix
+ "device/optix/kernel.cu"
+ "")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix_shader_raytrace
+ "device/optix/kernel_shader_raytrace.cu"
+ "--keep-device-functions")
add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
cycles_set_solution_folder(cycles_kernel_optix)
@@ -659,62 +583,47 @@ if(WITH_COMPILER_ASAN)
endif()
endif()
-set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
cycles_add_library(cycles_kernel "${LIB}"
- ${SRC_CPU_KERNELS}
- ${SRC_CUDA_KERNELS}
- ${SRC_OPTIX_KERNELS}
- ${SRC_OPENCL_KERNELS}
+ ${SRC_DEVICE_CPU}
+ ${SRC_DEVICE_CUDA}
+ ${SRC_DEVICE_OPTIX}
${SRC_HEADERS}
- ${SRC_KERNELS_CPU_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
- ${SRC_KERNELS_OPENCL_HEADERS}
+ ${SRC_DEVICE_CPU_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
- ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
- ${SRC_SPLIT_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
)
source_group("bvh" FILES ${SRC_BVH_HEADERS})
source_group("closure" FILES ${SRC_CLOSURE_HEADERS})
-source_group("filter" FILES ${SRC_FILTER_HEADERS})
source_group("geom" FILES ${SRC_GEOM_HEADERS})
+source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_HEADERS})
-source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS})
-source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS})
-source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS})
-source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS})
-source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS})
+source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
+source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
+source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
source_group("svm" FILES ${SRC_SVM_HEADERS})
if(WITH_CYCLES_CUDA)
@@ -724,31 +633,20 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
add_dependencies(cycles_kernel cycles_kernel_optix)
endif()
-# OpenCL kernel
-
-# set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
-# add_custom_command(
-# OUTPUT ${KERNEL_PREPROCESSED}
-# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
-# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
-# add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
-# delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
+# Install kernel source for runtime compilation
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
-
if(WITH_NANOVDB)
set(SRC_NANOVDB_HEADERS
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index acf29cf1baf..539e9fd05fb 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -25,6 +25,8 @@
* the code has been extended and modified to support more primitives and work
* with CPU/CUDA/OpenCL. */
+#pragma once
+
#ifdef __EMBREE__
# include "kernel/bvh/bvh_embree.h"
#endif
@@ -152,13 +154,11 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray)
return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
}
-ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect(const KernelGlobals *kg,
const Ray *ray,
const uint visibility,
Intersection *isect)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT);
-
#ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -238,15 +238,13 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
}
#ifdef __BVH_LOCAL__
-ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
uint *lcg_state,
int max_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF;
uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF;
@@ -313,8 +311,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
float3 dir = ray->D;
float3 idir = ray->D;
Transform ob_itfm;
- rtc_ray.tfar = bvh_instance_motion_push(
- kg, local_object, ray, &P, &dir, &idir, ray->t, &ob_itfm);
+ rtc_ray.tfar = ray->t *
+ bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
/* bvh_instance_motion_push() returns the inverse transform but
* it's not needed here. */
(void)ob_itfm;
@@ -353,15 +351,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
#endif
#ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
uint visibility,
uint max_hits,
uint *num_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)isect) & 0xFFFFFFFF;
uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF;
@@ -401,17 +397,13 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
ctx.isect_s = isect;
ctx.max_hits = max_hits;
- ctx.num_hits = 0;
IntersectContext rtc_ctx(&ctx);
RTCRay rtc_ray;
kernel_embree_setup_ray(*ray, rtc_ray, visibility);
rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
- if (ctx.num_hits > max_hits) {
- return true;
- }
*num_hits = ctx.num_hits;
- return rtc_ray.tfar == -INFINITY;
+ return ctx.opaque_hit;
}
# endif /* __EMBREE__ */
@@ -439,13 +431,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
#endif /* __SHADOW_RECORD_ALL__ */
#ifdef __VOLUME__
-ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
-
# ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -498,14 +488,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
#endif /* __VOLUME__ */
#ifdef __VOLUME_RECORD_ALL__
-ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+ccl_device_intersect uint scene_intersect_volume_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint max_hits,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL);
-
if (!scene_intersect_valid(ray)) {
return false;
}
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
index 4605c3ea51d..092d770dcac 100644
--- a/intern/cycles/kernel/bvh/bvh_embree.h
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -14,14 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
#include <embree3/rtcore_ray.h>
#include <embree3/rtcore_scene.h>
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-// clang-format on
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
#include "util/util_vector.h"
@@ -36,25 +35,29 @@ struct CCLIntersectContext {
RAY_VOLUME_ALL = 4,
} RayType;
- KernelGlobals *kg;
+ const KernelGlobals *kg;
RayType type;
/* for shadow rays */
Intersection *isect_s;
int max_hits;
int num_hits;
+ float max_t;
+ bool opaque_hit;
/* for SSS Rays: */
LocalIntersection *local_isect;
int local_object_id;
uint *lcg_state;
- CCLIntersectContext(KernelGlobals *kg_, RayType type_)
+ CCLIntersectContext(const KernelGlobals *kg_, RayType type_)
{
kg = kg_;
type = type_;
max_hits = 1;
num_hits = 0;
+ max_t = FLT_MAX;
+ opaque_hit = false;
isect_s = NULL;
local_isect = NULL;
local_object_id = -1;
@@ -98,7 +101,7 @@ ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID;
}
-ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect)
@@ -123,7 +126,7 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
isect->type = kernel_tex_fetch(__prim_type, isect->prim);
}
-ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_sss_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect,
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 4006c9c1632..90b9f410b29 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
@@ -74,9 +74,9 @@ ccl_device_inline
if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
- isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
#endif
object = local_object;
}
@@ -196,7 +196,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 5367bdb633c..15cd0f22213 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -16,7 +16,7 @@
// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
// 3-vector which might be faster.
-ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(const KernelGlobals *kg,
int node_addr,
int child)
{
@@ -28,7 +28,7 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
return space;
}
-ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -76,7 +76,7 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
#endif
}
-ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float t,
@@ -102,7 +102,7 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg
return tnear <= tfar;
}
-ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -134,7 +134,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
return mask;
}
-ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 2e94b1d7c37..0ae36fccf9b 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
@@ -68,10 +68,10 @@ ccl_device_inline
Transform ob_itfm;
#endif
- int num_hits_in_instance = 0;
+ float t_world_to_instance = 1.0f;
*num_hits = 0;
- isect_array->t = tmax;
+ Intersection *isect = isect_array;
/* traversal loop */
do {
@@ -147,13 +147,14 @@ ccl_device_inline
switch (p_type) {
case PRIMITIVE_TRIANGLE: {
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect, P, dir, isect_t, visibility, object, prim_addr);
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect_t, ray->time, visibility, object, prim_addr);
break;
}
#endif
@@ -163,8 +164,16 @@ ccl_device_inline
case PRIMITIVE_CURVE_RIBBON:
case PRIMITIVE_MOTION_CURVE_RIBBON: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- hit = curve_intersect(
- kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect_t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
break;
}
#endif
@@ -176,27 +185,49 @@ ccl_device_inline
/* shadow ray early termination */
if (hit) {
+ /* Convert intersection distance to world space. */
+ isect->t /= t_world_to_instance;
+
/* detect if this surface has a shader with transparent shadows */
/* todo: optimize so primitive visibility flag indicates if
* the primitive has a transparent shadow shader? */
- const int flags = intersection_get_shader_flags(kg, isect_array);
+ const int flags = intersection_get_shader_flags(kg, isect);
- /* if no transparent shadows, all light is blocked */
- if (!(flags & SD_HAS_TRANSPARENT_SHADOW)) {
- return true;
- }
- /* if maximum number of hits reached, block all light */
- else if (*num_hits == max_hits) {
+ if (!(flags & SD_HAS_TRANSPARENT_SHADOW) || max_hits == 0) {
+ /* If no transparent shadows, all light is blocked and we can
+ * stop immediately. */
return true;
}
- /* move on to next entry in intersections array */
- isect_array++;
+ /* Increase the number of hits, possibly beyond max_hits, we will
+ * simply not record those and only keep the max_hits closest. */
(*num_hits)++;
- num_hits_in_instance++;
- isect_array->t = isect_t;
+ if (*num_hits >= max_hits) {
+ /* If maximum number of hits reached, find the intersection with
+ * the largest distance to potentially replace when another hit
+ * is found. */
+ const int num_recorded_hits = min(max_hits, *num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ isect = isect_array + max_recorded_hit;
+
+ /* Limit the ray distance and stop counting hits beyond this. */
+ isect_t = max_recorded_t * t_world_to_instance;
+ }
+ else {
+ /* Still have space for intersection, use next hit. */
+ isect = isect + 1;
+ }
}
prim_addr++;
@@ -207,13 +238,14 @@ ccl_device_inline
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ t_world_to_instance = bvh_instance_motion_push(
+ kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
+ /* Convert intersection to object space. */
+ isect_t *= t_world_to_instance;
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -228,32 +260,19 @@ ccl_device_inline
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-
#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
#else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
#endif
- /* scale isect->t to adjust for instancing */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
+ /* Restore world space ray length. If max number of hits exceeded this
+ * distance is reduced to recorded only the closest hits. If not use
+ * the original ray length. */
+ isect_t = (max_hits && *num_hits > max_hits) ? isect->t : tmax;
object = OBJECT_NONE;
+ t_world_to_instance = 1.0f;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
@@ -262,7 +281,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 89250a8d60a..a26d8c514f3 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -31,7 +31,7 @@
* BVH_MOTION: motion blur rendering
*/
-ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -136,7 +136,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
case PRIMITIVE_TRIANGLE: {
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
+ if (triangle_intersect(
+ kg, isect, P, dir, isect->t, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -149,7 +150,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if (motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -166,8 +167,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
- const bool hit = curve_intersect(
- kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ const bool hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect->t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
if (hit) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -184,10 +193,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -218,7 +226,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index 98e6ec25d15..6039e707fc3 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BVH_TYPES__
-#define __BVH_TYPES__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -43,5 +42,3 @@ CCL_NAMESPACE_BEGIN
#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
CCL_NAMESPACE_END
-
-#endif /* __BVH_TYPES__ */
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
index b1faebce957..21384457b16 100644
--- a/intern/cycles/kernel/bvh/bvh_util.h
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -71,86 +71,6 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
#endif
}
-/* This function should be used to compute a modified ray start position for
- * rays leaving from a surface. The algorithm slightly distorts flat surface
- * of a triangle. Surface is lifted by amount h along normal n in the incident
- * point. */
-
-ccl_device_inline float3 smooth_surface_offset(KernelGlobals *kg, ShaderData *sd, float3 Ng)
-{
- float3 V[3], N[3];
- triangle_vertices_and_normals(kg, sd->prim, V, N);
-
- const float u = sd->u, v = sd->v;
- const float w = 1 - u - v;
- float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
- float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
-
- object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
-
- /* Parabolic approximation */
- float a = dot(N[2] - N[0], V[0] - V[2]);
- float b = dot(N[2] - N[1], V[1] - V[2]);
- float c = dot(N[1] - N[0], V[1] - V[0]);
- float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
-
- /* Check flipped normals */
- if (dot(n, Ng) > 0) {
- /* Local linear envelope */
- float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
- float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
- float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
- h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
- h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
- h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
- h = max(min(min(h0, h1), h2), h * 0.5f);
- }
- else {
- float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
- float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
- float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
- h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
- h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
- h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
- h = min(-min(min(h0, h1), h2), h * 0.5f);
- }
-
- return n * h;
-}
-
-/* Ray offset to avoid shadow terminator artifact. */
-
-ccl_device_inline float3 ray_offset_shadow(KernelGlobals *kg, ShaderData *sd, float3 L)
-{
- float NL = dot(sd->N, L);
- bool transmit = (NL < 0.0f);
- float3 Ng = (transmit ? -sd->Ng : sd->Ng);
- float3 P = ray_offset(sd->P, Ng);
-
- if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
- const float offset_cutoff =
- kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
- /* Do ray offset (heavy stuff) only for close to be terminated triangles:
- * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also
- * make a smooth transition near the threshold. */
- if (offset_cutoff > 0.0f) {
- float NgL = dot(Ng, L);
- float offset_amount = 0.0f;
- if (NL < offset_cutoff) {
- offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
- }
- else {
- offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
- }
- if (offset_amount > 0.0f) {
- P += smooth_surface_offset(kg, sd, Ng) * offset_amount;
- }
- }
- }
-
- return P;
-}
-
#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
/* ToDo: Move to another file? */
ccl_device int intersections_compare(const void *a, const void *b)
@@ -193,10 +113,10 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
}
#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
-/* Utility to quickly get a shader flags from an intersection. */
+/* Utility to quickly get flags from an intersection. */
-ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
{
const int prim = kernel_tex_fetch(__prim_index, isect->prim);
int shader = 0;
@@ -217,14 +137,14 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_rest
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
}
-ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_from_isect_prim(
+ const KernelGlobals *ccl_restrict kg, const int isect_prim)
{
- const int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ const int prim = kernel_tex_fetch(__prim_index, isect_prim);
int shader = 0;
#ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE)
+ if (kernel_tex_fetch(__prim_type, isect_prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
shader = kernel_tex_fetch(__tri_shader, prim);
@@ -239,7 +159,13 @@ ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict k
return shader & SHADER_MASK;
}
-ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict kg,
+ccl_device_forceinline int intersection_get_shader(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ return intersection_get_shader_from_isect_prim(kg, isect->prim);
+}
+
+ccl_device_forceinline int intersection_get_object(const KernelGlobals *ccl_restrict kg,
const Intersection *ccl_restrict isect)
{
if (isect->object != OBJECT_NONE) {
@@ -249,4 +175,12 @@ ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict k
return kernel_tex_fetch(__prim_object, isect->prim);
}
+ccl_device_forceinline int intersection_get_object_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ const int object = intersection_get_object(kg, isect);
+
+ return kernel_tex_fetch(__object_flag, object);
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 1f2ea47269b..0411d9c522d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -147,7 +147,7 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
+ triangle_intersect(kg, isect, P, dir, isect->t, visibility, object, prim_addr);
}
break;
}
@@ -165,7 +165,7 @@ ccl_device_inline
continue;
}
motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr);
}
break;
}
@@ -181,10 +181,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -222,7 +221,7 @@ ccl_device_inline
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index a8664cc4331..4874270f15d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ uint BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
@@ -150,7 +150,8 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect_array, P, dir, isect_t, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -190,7 +191,7 @@ ccl_device_inline
continue;
}
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect_array, P, dir, isect_t, ray->time, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -228,10 +229,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
num_hits_in_instance = 0;
@@ -289,7 +289,7 @@ ccl_device_inline
return num_hits;
}
-ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline uint BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index 99a5a675976..72a8c2ba090 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 6f2f2ebb202..4eb8bcae997 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/closure/bsdf_ashikhmin_velvet.h"
#include "kernel/closure/bsdf_diffuse.h"
@@ -109,7 +111,7 @@ ccl_device_inline float shift_cos_in(float cos_in, const float frequency_multipl
return val;
}
-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_inline int bsdf_sample(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -429,21 +431,6 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
-#endif
default:
label = LABEL_NONE;
break;
@@ -482,15 +469,16 @@ ccl_device
ccl_device_inline
#endif
float3
- bsdf_eval(KernelGlobals *kg,
+ bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
+ const bool is_transmission,
float *pdf)
{
- float3 eval;
+ float3 eval = zero_float3();
- if (dot(sd->N, omega_in) >= 0.0f) {
+ if (!is_transmission) {
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
@@ -570,13 +558,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -663,13 +645,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -682,7 +658,7 @@ ccl_device_inline
return eval;
}
-ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
+ccl_device void bsdf_blur(const KernelGlobals *kg, ShaderClosure *sc, float roughness)
{
/* ToDo: do we want to blur volume closures? */
#ifdef __SVM__
@@ -715,55 +691,4 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
#endif
}
-ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
-{
-#ifdef __SVM__
- switch (a->type) {
- case CLOSURE_BSDF_TRANSPARENT_ID:
- return true;
- case CLOSURE_BSDF_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_ID:
- case CLOSURE_BSDF_TRANSLUCENT_ID:
- return bsdf_diffuse_merge(a, b);
- case CLOSURE_BSDF_OREN_NAYAR_ID:
- return bsdf_oren_nayar_merge(a, b);
- case CLOSURE_BSDF_REFLECTION_ID:
- case CLOSURE_BSDF_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- return bsdf_microfacet_merge(a, b);
- case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- return bsdf_ashikhmin_velvet_merge(a, b);
- case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- case CLOSURE_BSDF_GLOSSY_TOON_ID:
- return bsdf_toon_merge(a, b);
- case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- return bsdf_hair_merge(a, b);
-# ifdef __PRINCIPLED__
- case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
- return bsdf_principled_diffuse_merge(a, b);
-# endif
-# ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- return volume_henyey_greenstein_merge(a, b);
-# endif
- default:
- return false;
- }
-#else
- return false;
-#endif
-}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 9814a7cf5c9..be6383e521a 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -14,20 +14,19 @@
* limitations under the License.
*/
-#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
-#define __BSDF_ASHIKHMIN_SHIRLEY_H__
-
/*
-ASHIKHMIN SHIRLEY BSDF
-
-Implementation of
-Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
-
-The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
-the case with all other microfacet-based BSDF implementations in Cycles.
+ * ASHIKHMIN SHIRLEY BSDF
+ *
+ * Implementation of
+ * Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
+ *
+ * The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
+ * the case with all other microfacet-based BSDF implementations in Cycles.
+ *
+ * Other than that, the implementation directly follows the paper.
+ */
-Other than that, the implementation directly follows the paper.
-*/
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -240,5 +239,3 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 3d3f20edab3..f51027f5701 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -30,8 +30,9 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_ASHIKHMIN_VELVET_H__
-#define __BSDF_ASHIKHMIN_VELVET_H__
+#pragma once
+
+#include "kernel/kernel_montecarlo.h"
CCL_NAMESPACE_BEGIN
@@ -54,14 +55,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_ashikhmin_velvet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const VelvetBsdf *bsdf_a = (const VelvetBsdf *)a;
- const VelvetBsdf *bsdf_b = (const VelvetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->sigma == bsdf_b->sigma);
-}
-
ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -175,5 +168,3 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index ea604ed0311..1555aa30304 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_H__
-#define __BSDF_DIFFUSE_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -49,14 +48,6 @@ ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const DiffuseBsdf *bsdf_a = (const DiffuseBsdf *)a;
- const DiffuseBsdf *bsdf_b = (const DiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N));
-}
-
ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -174,5 +165,3 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index aa62c1c7ceb..b06dd196b9e 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_RAMP_H__
-#define __BSDF_DIFFUSE_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -125,5 +124,3 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 7ca9424b815..f56f78aa1f0 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_HAIR_H__
-#define __BSDF_HAIR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,15 +61,6 @@ ccl_device int bsdf_hair_transmission_setup(HairBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_hair_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HairBsdf *bsdf_a = (const HairBsdf *)a;
- const HairBsdf *bsdf_b = (const HairBsdf *)b;
-
- return (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->roughness1 == bsdf_b->roughness1) &&
- (bsdf_a->roughness2 == bsdf_b->roughness2) && (bsdf_a->offset == bsdf_b->offset);
-}
-
ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -309,5 +299,3 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index f12661b3095..bfe56e5ab0e 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -14,15 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
#ifdef __KERNEL_CPU__
# include <fenv.h>
#endif
#include "kernel/kernel_color.h"
-#ifndef __BSDF_HAIR_PRINCIPLED_H__
-# define __BSDF_HAIR_PRINCIPLED_H__
-
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledHairExtra {
@@ -181,12 +180,12 @@ ccl_device_inline float longitudinal_scattering(
}
/* Combine the three values using their luminances. */
-ccl_device_inline float4 combine_with_energy(KernelGlobals *kg, float3 c)
+ccl_device_inline float4 combine_with_energy(const KernelGlobals *kg, float3 c)
{
return make_float4(c.x, c.y, c.z, linear_rgb_to_gray(kg, c));
}
-# ifdef __HAIR__
+#ifdef __HAIR__
/* Set up the hair closure. */
ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bsdf)
{
@@ -226,10 +225,10 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}
-# endif /* __HAIR__ */
+#endif /* __HAIR__ */
/* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */
-ccl_device_inline void hair_attenuation(KernelGlobals *kg, float f, float3 T, float4 *Ap)
+ccl_device_inline void hair_attenuation(const KernelGlobals *kg, float f, float3 T, float4 *Ap)
{
/* Primary specular (R). */
Ap[0] = make_float4(f, f, f, f);
@@ -278,7 +277,7 @@ ccl_device_inline void hair_alpha_angles(float sin_theta_i,
}
/* Evaluation function for our shader. */
-ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
+ccl_device float3 bsdf_principled_hair_eval(const KernelGlobals *kg,
const ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
@@ -356,7 +355,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
}
/* Sampling function for the hair shader. */
-ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
+ccl_device int bsdf_principled_hair_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
ShaderData *sd,
float randu,
@@ -473,11 +472,11 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
*omega_in = X * sin_theta_i + Y * cos_theta_i * cosf(phi_i) + Z * cos_theta_i * sinf(phi_i);
-# ifdef __RAY_DIFFERENTIALS__
+#ifdef __RAY_DIFFERENTIALS__
float3 N = safe_normalize(sd->I + *omega_in);
*domega_in_dx = (2 * dot(N, sd->dI.dx)) * N - sd->dI.dx;
*domega_in_dy = (2 * dot(N, sd->dI.dy)) * N - sd->dI.dy;
-# endif
+#endif
return LABEL_GLOSSY | ((p == 0) ? LABEL_REFLECT : LABEL_TRANSMIT);
}
@@ -501,7 +500,7 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
}
-ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc)
+ccl_device float3 bsdf_principled_hair_albedo(const ShaderClosure *sc)
{
PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc;
return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
@@ -523,5 +522,3 @@ ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const flo
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_PRINCIPLED_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index af03bab39f7..227cb448b47 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -30,8 +30,10 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_MICROFACET_H__
-#define __BSDF_MICROFACET_H__
+#pragma once
+
+#include "kernel/kernel_lookup_table.h"
+#include "kernel/kernel_random.h"
CCL_NAMESPACE_BEGIN
@@ -53,7 +55,7 @@ static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf i
/* Beckmann and GGX microfacet importance sampling. */
-ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals *kg,
+ccl_device_inline void microfacet_beckmann_sample_slopes(const KernelGlobals *kg,
const float cos_theta_i,
const float sin_theta_i,
float randu,
@@ -193,7 +195,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i,
*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x));
}
-ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals *kg,
+ccl_device_forceinline float3 microfacet_sample_stretched(const KernelGlobals *kg,
const float3 omega_i,
const float alpha_x,
const float alpha_y,
@@ -352,21 +354,6 @@ ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const S
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf *)a;
- const MicrofacetBsdf *bsdf_b = (const MicrofacetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->alpha_x == bsdf_b->alpha_x) &&
- (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
- (bsdf_a->ior == bsdf_b->ior) &&
- ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
- ((bsdf_a->extra && bsdf_b->extra) &&
- (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) &&
- (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) &&
- (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat)));
-}
-
ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
{
bsdf->extra = NULL;
@@ -558,7 +545,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc,
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -986,7 +973,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_beckmann_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -1175,5 +1162,3 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_MICROFACET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 9795c8da065..68d5071dbce 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Most of the code is based on the supplemental implementations from
@@ -466,7 +468,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -628,7 +630,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_glass_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 41e5736bf49..be12d47f0ea 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BSDF_OREN_NAYAR_H__
-#define __BSDF_OREN_NAYAR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -61,14 +60,6 @@ ccl_device int bsdf_oren_nayar_setup(OrenNayarBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_oren_nayar_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const OrenNayarBsdf *bsdf_a = (const OrenNayarBsdf *)a;
- const OrenNayarBsdf *bsdf_b = (const OrenNayarBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -127,5 +118,3 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_OREN_NAYAR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index cf5484383f2..43f8cf71c59 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_PHONG_RAMP_H__
-#define __BSDF_PHONG_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -153,5 +152,3 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PHONG_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index d5d012068ff..a72af519482 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
-#define __BSDF_PRINCIPLED_DIFFUSE_H__
+#pragma once
/* DISNEY PRINCIPLED DIFFUSE BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledDiffuseBsdf {
@@ -61,14 +62,6 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf *)a;
- const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -136,5 +129,3 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 3707de29d73..60ce7e4eb75 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_SHEEN_H__
-#define __BSDF_PRINCIPLED_SHEEN_H__
+#pragma once
/* DISNEY PRINCIPLED SHEEN BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
@@ -137,5 +138,3 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index c24ba170915..31283971d5a 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFLECTION_H__
-#define __BSDF_REFLECTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -93,5 +92,3 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFLECTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index d4fbe86dac0..cfedb5dfe2c 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFRACTION_H__
-#define __BSDF_REFRACTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -111,5 +110,3 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFRACTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index cc5de21ed0e..acdafe0f735 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TOON_H__
-#define __BSDF_TOON_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -55,15 +54,6 @@ ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_toon_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const ToonBsdf *bsdf_a = (const ToonBsdf *)a;
- const ToonBsdf *bsdf_b = (const ToonBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->size == bsdf_b->size) &&
- (bsdf_a->smooth == bsdf_b->smooth);
-}
-
ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
{
float is;
@@ -248,5 +238,3 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TOON_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 4e5513499e8..f1dc7efb345 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TRANSPARENT_H__
-#define __BSDF_TRANSPARENT_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -123,5 +122,3 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TRANSPARENT_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index a73dee1b045..beec5f768a1 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_UTIL_H__
-#define __BSDF_UTIL_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -150,5 +149,3 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 562daf1286d..0f9278bba89 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_BSSRDF_H__
-#define __KERNEL_BSSRDF_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,310 +23,71 @@ typedef ccl_addr_space struct Bssrdf {
float3 radius;
float3 albedo;
- float sharpness;
- float texture_blur;
float roughness;
- float channels;
+ float anisotropy;
} Bssrdf;
static_assert(sizeof(ShaderClosure) >= sizeof(Bssrdf), "Bssrdf is too large!");
-/* Planar Truncated Gaussian
- *
- * Note how this is different from the typical gaussian, this one integrates
- * to 1 over the plane (where you get an extra 2*pi*x factor). We are lucky
- * that integrating x*exp(-x) gives a nice closed form solution. */
-
-/* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */
-#define GAUSS_TRUNCATE 12.46f
-
-ccl_device float bssrdf_gaussian_eval(const float radius, float r)
-{
- /* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm
- * = 1 - exp(-Rm*Rm/(2*v)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- if (r >= Rm)
- return 0.0f;
-
- return expf(-r * r / (2.0f * v)) / (2.0f * M_PI_F * v);
-}
-
-ccl_device float bssrdf_gaussian_pdf(const float radius, float r)
+ccl_device float bssrdf_dipole_compute_Rd(float alpha_prime, float fourthirdA)
{
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- return bssrdf_gaussian_eval(radius, r) * (1.0f / (area_truncated));
+ float s = sqrtf(3.0f * (1.0f - alpha_prime));
+ return 0.5f * alpha_prime * (1.0f + expf(-fourthirdA * s)) * expf(-s);
}
-ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h)
+ccl_device float bssrdf_dipole_compute_alpha_prime(float rd, float fourthirdA)
{
- /* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v))
- * r = sqrt(-2*v*logf(xi)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- /* r(xi) */
- const float r_squared = -2.0f * v * logf(1.0f - xi * area_truncated);
- *r = sqrtf(r_squared);
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_squared);
-}
-
-/* Planar Cubic BSSRDF falloff
- *
- * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
- * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
- * far as I can tell has no closed form solution. So we get an iterative solution
- * instead with newton-raphson. */
-
-ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r)
-{
- if (sharpness == 0.0f) {
- const float Rm = radius;
-
- if (r >= Rm)
- return 0.0f;
-
- /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */
- const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
- const float f = Rm - r;
- const float num = f * f * f;
-
- return (10.0f * num) / (Rm5 * M_PI_F);
+ /* Little Newton solver. */
+ if (rd < 1e-4f) {
+ return 0.0f;
+ }
+ if (rd >= 0.995f) {
+ return 0.999999f;
}
- else {
- float Rm = radius * (1.0f + sharpness);
-
- if (r >= Rm)
- return 0.0f;
- /* custom variation with extra sharpness, to match the previous code */
- const float y = 1.0f / (1.0f + sharpness);
- float Rmy, ry, ryinv;
+ float x0 = 0.0f;
+ float x1 = 1.0f;
+ float xmid, fmid;
- if (sharpness == 1.0f) {
- Rmy = sqrtf(Rm);
- ry = sqrtf(r);
- ryinv = (ry > 0.0f) ? 1.0f / ry : 0.0f;
+ constexpr const int max_num_iterations = 12;
+ for (int i = 0; i < max_num_iterations; ++i) {
+ xmid = 0.5f * (x0 + x1);
+ fmid = bssrdf_dipole_compute_Rd(xmid, fourthirdA);
+ if (fmid < rd) {
+ x0 = xmid;
}
else {
- Rmy = powf(Rm, y);
- ry = powf(r, y);
- ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f;
+ x1 = xmid;
}
-
- const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy;
- const float f = Rmy - ry;
- const float num = f * (f * f) * (y * ryinv);
-
- return (10.0f * num) / (Rmy5 * M_PI_F);
- }
-}
-
-ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r)
-{
- return bssrdf_cubic_eval(radius, sharpness, r);
-}
-
-/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
-{
- /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
- * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
- * should not be too bad */
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- float x = 0.25f;
- int i;
-
- for (i = 0; i < max_iteration_count; i++) {
- float x2 = x * x;
- float x3 = x2 * x;
- float nx = (1.0f - x);
-
- float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
- float f_ = 20.0f * (x * nx) * (nx * nx);
-
- if (fabsf(f) < tolerance || f_ == 0.0f)
- break;
-
- x = saturate(x - f / f_);
}
- return x;
+ return xmid;
}
-ccl_device void bssrdf_cubic_sample(
- const float radius, const float sharpness, float xi, float *r, float *h)
+ccl_device void bssrdf_setup_radius(Bssrdf *bssrdf, const ClosureType type, const float eta)
{
- float Rm = radius;
- float r_ = bssrdf_cubic_quintic_root_find(xi);
-
- if (sharpness != 0.0f) {
- r_ = powf(r_, 1.0f + sharpness);
- Rm *= (1.0f + sharpness);
- }
-
- r_ *= Rm;
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Approximate Reflectance Profiles
- * http://graphics.pixar.com/library/ApproxBSSRDF/paper.pdf
- */
-
-/* This is a bit arbitrary, just need big enough radius so it matches
- * the mean free length, but still not too big so sampling is still
- * effective. Might need some further tweaks.
- */
-#define BURLEY_TRUNCATE 16.0f
-#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE)
-
-ccl_device_inline float bssrdf_burley_fitting(float A)
-{
- /* Diffuse surface transmission, equation (6). */
- return 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
-}
-
-/* Scale mean free path length so it gives similar looking result
- * to Cubic and Gaussian models.
- */
-ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r)
-{
- return 0.25f * M_1_PI_F * r;
-}
-
-ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf)
-{
- /* Mean free path length. */
- const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius);
- /* Surface albedo. */
- const float3 A = bssrdf->albedo;
- const float3 s = make_float3(
- bssrdf_burley_fitting(A.x), bssrdf_burley_fitting(A.y), bssrdf_burley_fitting(A.z));
-
- bssrdf->radius = l / s;
-}
-
-ccl_device float bssrdf_burley_eval(const float d, float r)
-{
- const float Rm = BURLEY_TRUNCATE * d;
-
- if (r >= Rm)
- return 0.0f;
-
- /* Burley reflectance profile, equation (3).
- *
- * NOTES:
- * - Surface albedo is already included into sc->weight, no need to
- * multiply by this term here.
- * - This is normalized diffuse model, so the equation is multiplied
- * by 2*pi, which also matches cdf().
- */
- float exp_r_3_d = expf(-r / (3.0f * d));
- float exp_r_d = exp_r_3_d * exp_r_3_d * exp_r_3_d;
- return (exp_r_d + exp_r_3_d) / (4.0f * d);
-}
-
-ccl_device float bssrdf_burley_pdf(const float d, float r)
-{
- return bssrdf_burley_eval(d, r) * (1.0f / BURLEY_TRUNCATE_CDF);
-}
-
-/* Find the radius for desired CDF value.
- * Returns scaled radius, meaning the result is to be scaled up by d.
- * Since there's no closed form solution we do Newton-Raphson method to find it.
- */
-ccl_device_forceinline float bssrdf_burley_root_find(float xi)
-{
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- /* Do initial guess based on manual curve fitting, this allows us to reduce
- * number of iterations to maximum 4 across the [0..1] range. We keep maximum
- * number of iteration higher just to be sure we didn't miss root in some
- * corner case.
- */
- float r;
- if (xi <= 0.9f) {
- r = expf(xi * xi * 2.4f) - 1.0f;
+ if (type == CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) {
+ /* Scale mean free path length so it gives similar looking result to older
+ * Cubic, Gaussian and Burley models. */
+ bssrdf->radius *= 0.25f * M_1_PI_F;
}
else {
- /* TODO(sergey): Some nicer curve fit is possible here. */
- r = 15.0f;
- }
- /* Solve against scaled radius. */
- for (int i = 0; i < max_iteration_count; i++) {
- float exp_r_3 = expf(-r / 3.0f);
- float exp_r = exp_r_3 * exp_r_3 * exp_r_3;
- float f = 1.0f - 0.25f * exp_r - 0.75f * exp_r_3 - xi;
- float f_ = 0.25f * exp_r + 0.25f * exp_r_3;
+ /* Adjust radius based on IOR and albedo. */
+ const float inv_eta = 1.0f / eta;
+ const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta;
+ const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) /
+ (1.0f - F_dr); /* From Jensen's Fdr ratio formula. */
- if (fabsf(f) < tolerance || f_ == 0.0f) {
- break;
- }
+ const float3 alpha_prime = make_float3(
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.x, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.y, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.z, fourthirdA));
- r = r - f / f_;
- if (r < 0.0f) {
- r = 0.0f;
- }
+ bssrdf->radius *= sqrt(3.0f * (one_float3() - alpha_prime));
}
- return r;
}
-ccl_device void bssrdf_burley_sample(const float d, float xi, float *r, float *h)
-{
- const float Rm = BURLEY_TRUNCATE * d;
- const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* None BSSRDF falloff
- *
- * Samples distributed over disk with no falloff, for reference. */
-
-ccl_device float bssrdf_none_eval(const float radius, float r)
-{
- const float Rm = radius;
- return (r < Rm) ? 1.0f : 0.0f;
-}
-
-ccl_device float bssrdf_none_pdf(const float radius, float r)
-{
- /* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */
- const float Rm = radius;
- const float area = (M_PI_F * Rm * Rm);
-
- return bssrdf_none_eval(radius, r) / area;
-}
-
-ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h)
-{
- /* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2
- * r = sqrt(xi)*Rm */
- const float Rm = radius;
- const float r_ = sqrtf(xi) * Rm;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Generic */
+/* Setup */
ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
{
@@ -342,7 +102,7 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL;
}
-ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
+ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type, const float ior)
{
int flag = 0;
int bssrdf_channels = 3;
@@ -371,7 +131,7 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
if (bssrdf_channels < 3) {
/* Add diffuse BSDF if any radius too small. */
#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
+ if (bssrdf->roughness != FLT_MAX) {
float roughness = bssrdf->roughness;
float3 N = bssrdf->N;
@@ -401,16 +161,9 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
/* Setup BSSRDF if radius is large enough. */
if (bssrdf_channels > 0) {
bssrdf->type = type;
- bssrdf->channels = bssrdf_channels;
- bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels;
- bssrdf->texture_blur = saturate(bssrdf->texture_blur);
- bssrdf->sharpness = saturate(bssrdf->sharpness);
+ bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf_channels;
- if (type == CLOSURE_BSSRDF_BURLEY_ID || type == CLOSURE_BSSRDF_PRINCIPLED_ID ||
- type == CLOSURE_BSSRDF_RANDOM_WALK_ID ||
- type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- bssrdf_burley_setup(bssrdf);
- }
+ bssrdf_setup_radius(bssrdf, type, ior);
flag |= SD_BSSRDF;
}
@@ -422,77 +175,4 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
return flag;
}
-ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float radius;
-
- /* Sample color channel and reuse random number. Only a subset of channels
- * may be used if their radius was too small to handle as BSSRDF. */
- xi *= bssrdf->channels;
-
- if (xi < 1.0f) {
- radius = (bssrdf->radius.x > 0.0f) ? bssrdf->radius.x :
- (bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else if (xi < 2.0f) {
- xi -= 1.0f;
- radius = (bssrdf->radius.x > 0.0f && bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else {
- xi -= 2.0f;
- radius = bssrdf->radius.z;
- }
-
- /* Sample BSSRDF. */
- if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- bssrdf_gaussian_sample(radius, xi, r, h);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) */
- bssrdf_burley_sample(radius, xi, r, h);
- }
-}
-
-ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r)
-{
- if (radius == 0.0f) {
- return 0.0f;
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- return bssrdf_gaussian_pdf(radius, r);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
- return bssrdf_burley_pdf(radius, r);
- }
-}
-
-ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
-
- return make_float3(bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r));
-}
-
-ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float3 pdf = bssrdf_eval(sc, r);
-
- return (pdf.x + pdf.y + pdf.z) / bssrdf->channels;
-}
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_BSSRDF_H__ */
diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h
index 911382e6865..a2519d97618 100644
--- a/intern/cycles/kernel/closure/emissive.h
+++ b/intern/cycles/kernel/closure/emissive.h
@@ -30,6 +30,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* BACKGROUND CLOSURE */
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 1430f712701..69959a3f21b 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __VOLUME_H__
-#define __VOLUME_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,21 +61,12 @@ ccl_device int volume_henyey_greenstein_setup(HenyeyGreensteinVolume *volume)
return SD_SCATTER;
}
-ccl_device bool volume_henyey_greenstein_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HenyeyGreensteinVolume *volume_a = (const HenyeyGreensteinVolume *)a;
- const HenyeyGreensteinVolume *volume_b = (const HenyeyGreensteinVolume *)b;
-
- return (volume_a->g == volume_b->g);
-}
-
-ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc,
+ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderVolumeClosure *svc,
const float3 I,
float3 omega_in,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer */
if (fabsf(g) < 1e-3f) {
@@ -122,7 +112,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pd
return dir;
}
-ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
+ccl_device int volume_henyey_greenstein_sample(const ShaderVolumeClosure *svc,
float3 I,
float3 dIdx,
float3 dIdy,
@@ -134,8 +124,7 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
float3 *domega_in_dy,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer and so is used negated */
*omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf);
@@ -153,17 +142,15 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
/* VOLUME CLOSURE */
ccl_device float3 volume_phase_eval(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float3 omega_in,
float *pdf)
{
- kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID);
-
- return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+ return volume_henyey_greenstein_eval_phase(svc, sd->I, omega_in, pdf);
}
ccl_device int volume_phase_sample(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float randu,
float randv,
float3 *eval,
@@ -171,31 +158,65 @@ ccl_device int volume_phase_sample(const ShaderData *sd,
differential3 *domega_in,
float *pdf)
{
- int label;
-
- switch (sc->type) {
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
- default:
- *eval = make_float3(0.0f, 0.0f, 0.0f);
- label = LABEL_NONE;
- break;
+ return volume_henyey_greenstein_sample(svc,
+ sd->I,
+ sd->dI.dx,
+ sd->dI.dy,
+ randu,
+ randv,
+ eval,
+ omega_in,
+ &domega_in->dx,
+ &domega_in->dy,
+ pdf);
+}
+
+/* Volume sampling utilities. */
+
+/* todo: this value could be tweaked or turned into a probability to avoid
+ * unnecessary work in volumes and subsurface scattering. */
+#define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+ccl_device float3 volume_color_transmittance(float3 sigma, float t)
+{
+ return exp3(-sigma * t);
+}
+
+ccl_device float volume_channel_get(float3 value, int channel)
+{
+ return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
+}
+
+ccl_device int volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf)
+{
+ /* Sample color channel proportional to throughput and single scattering
+ * albedo, to significantly reduce noise with many bounce, following:
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+ float3 weights = fabs(throughput * albedo);
+ float sum_weights = weights.x + weights.y + weights.z;
+ float3 weights_pdf;
+
+ if (sum_weights > 0.0f) {
+ weights_pdf = weights / sum_weights;
}
+ else {
+ weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
+ }
+
+ *pdf = weights_pdf;
- return label;
+ /* OpenCL does not support -> on float3, so don't use pdf->x. */
+ if (rand < weights_pdf.x) {
+ return 0;
+ }
+ else if (rand < weights_pdf.x + weights_pdf.y) {
+ return 1;
+ }
+ else {
+ return 2;
+ }
}
CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/device/cpu/compat.h
index 88f6a264a5a..bfd936c7bbd 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CPU_H__
-#define __KERNEL_COMPAT_CPU_H__
+#pragma once
#define __KERNEL_CPU__
@@ -27,14 +26,6 @@
# pragma GCC diagnostic ignored "-Wuninitialized"
#endif
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#include "util/util_half.h"
#include "util/util_math.h"
#include "util/util_simd.h"
@@ -43,15 +34,6 @@
#define ccl_addr_space
-#define ccl_local_id(d) 0
-#define ccl_global_id(d) (kg->global_id[d])
-
-#define ccl_local_size(d) 1
-#define ccl_global_size(d) (kg->global_size[d])
-
-#define ccl_group_id(d) ccl_global_id(d)
-#define ccl_num_groups(d) ccl_global_size(d)
-
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
*/
@@ -72,37 +54,11 @@ CCL_NAMESPACE_BEGIN
* simple arrays and after inlining fetch hopefully revert to being a simple
* pointer lookup. */
template<typename T> struct texture {
- ccl_always_inline const T &fetch(int index)
+ ccl_always_inline const T &fetch(int index) const
{
kernel_assert(index >= 0 && index < width);
return data[index];
}
-#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
- /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
- * compatibility with existing indices and data structures.
- */
- ccl_always_inline avxf fetch_avxf(const int index)
- {
- kernel_assert(index >= 0 && (index + 1) < width);
- ssef *ssef_data = (ssef *)data;
- ssef *ssef_node_data = &ssef_data[index];
- return _mm256_loadu_ps((float *)ssef_node_data);
- }
-#endif
-
-#ifdef __KERNEL_SSE2__
- ccl_always_inline ssef fetch_ssef(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssef *)data)[index];
- }
-
- ccl_always_inline ssei fetch_ssei(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssei *)data)[index];
- }
-#endif
T *data;
int width;
@@ -110,15 +66,6 @@ template<typename T> struct texture {
/* Macros to handle different memory storage on different devices */
-#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
-#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
-#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
-#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
-#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
-#define kernel_tex_array(tex) (kg->tex.data)
-
-#define kernel_data (kg->__data)
-
#ifdef __KERNEL_SSE2__
typedef vector3<sseb> sse3b;
typedef vector3<ssef> sse3f;
@@ -152,5 +99,3 @@ typedef vector3<avxf> avx3f;
#endif
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..98b036e269d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
+ * the kernel, to access constant data. These are all stored as "textures", but
+ * these are really just standard arrays. We can't use actually globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobals {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+ /* On the CPU, we also have the OSL globals here. Most data structures are shared
+ * with SVM, the difference is in the shaders and object/mesh attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/device/cpu/image.h
index 59b96c86c50..57e81ab186d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_CPU_IMAGE_H__
-#define __KERNEL_CPU_IMAGE_H__
+#pragma once
#ifdef WITH_NANOVDB
# define NANOVDB_USE_INTRINSICS
@@ -584,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator {
#undef SET_CUBIC_SPLINE_WEIGHTS
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -612,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -656,5 +655,3 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
} /* Namespace. */
CCL_NAMESPACE_END
-
-#endif // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index 8040bfb7b33..ac1cdf5fffe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -56,9 +56,9 @@
/* do nothing */
#endif
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index b907c6a2bac..ae2a841835a 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -14,50 +14,49 @@
* limitations under the License.
*/
-#ifndef __KERNEL_H__
-#define __KERNEL_H__
+#pragma once
/* CPU Kernel Interface */
-#include "kernel/kernel_types.h"
#include "util/util_types.h"
+#include "kernel/kernel_types.h"
+
CCL_NAMESPACE_BEGIN
#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+struct IntegratorStateCPU;
struct KernelGlobals;
struct KernelData;
KernelGlobals *kernel_globals_create();
void kernel_globals_free(KernelGlobals *kg);
-void *kernel_osl_memory(KernelGlobals *kg);
-bool kernel_osl_use(KernelGlobals *kg);
+void *kernel_osl_memory(const KernelGlobals *kg);
+bool kernel_osl_use(const KernelGlobals *kg);
void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..81f328c710b
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..1432abfd330
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that particular .cpp files sets needed optimization flags and
+ * simply includes this file without worry of copying actual implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
+ * that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#if 0
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+# ifdef __BAKING__
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif
+# endif /* KERNEL_STUB */
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
index 5f6b6800363..220768036ab 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -34,6 +34,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
index 97e8fc25140..90c05113cbe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -35,6 +35,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
index 26d7fd4de48..fb85ef5b0d0 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -29,6 +29,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
index 3f259aa4480..87baf04258a 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -31,6 +31,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
index 68bae8c07c6..bb421d58815 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -32,6 +32,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/device/cuda/compat.h
index ea3b78b7cef..665da43e1a1 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -14,20 +14,15 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CUDA_H__
-#define __KERNEL_COMPAT_CUDA_H__
+#pragma once
#define __KERNEL_GPU__
#define __KERNEL_CUDA__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
#endif
/* Manual definitions so we can compile without CUDA toolkit. */
@@ -38,8 +33,6 @@ typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -47,14 +40,7 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Qualifier wrappers for different names on different devices */
+/* Qualifiers */
#define ccl_device __device__ __inline__
#if __CUDA_ARCH__ < 500
@@ -68,104 +54,61 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local __shared__
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
-/* TODO(sergey): In theory we might use references with CUDA, however
- * performance impact yet to be investigated.
- */
-#define ccl_ref
#define ccl_align(n) __align__(n)
#define ccl_optional_struct_init
-#define ATTR_FALLTHROUGH
-
-#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH)
-
/* No assert supported for CUDA */
#define kernel_assert(cond)
-/* Types */
+/* GPU thread, block, grid size and index */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
-/* Work item functions */
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-ccl_device_inline uint ccl_local_id(uint d)
-{
- switch (d) {
- case 0:
- return threadIdx.x;
- case 1:
- return threadIdx.y;
- case 2:
- return threadIdx.z;
- default:
- return 0;
- }
-}
+/* GPU warp synchronizaton */
-#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
+#define ccl_gpu_popc(x) __popc(x)
-ccl_device_inline uint ccl_local_size(uint d)
-{
- switch (d) {
- case 0:
- return blockDim.x;
- case 1:
- return blockDim.y;
- case 2:
- return blockDim.z;
- default:
- return 0;
- }
-}
+/* GPU texture objects */
-#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
-ccl_device_inline uint ccl_group_id(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
{
- switch (d) {
- case 0:
- return blockIdx.x;
- case 1:
- return blockIdx.y;
- case 2:
- return blockIdx.z;
- default:
- return 0;
- }
+ return tex2D<T>(texobj, x, y);
}
-ccl_device_inline uint ccl_num_groups(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
{
- switch (d) {
- case 0:
- return gridDim.x;
- case 1:
- return gridDim.y;
- case 2:
- return gridDim.z;
- default:
- return 0;
- }
+ return tex3D<T>(texobj, x, y, z);
}
-/* Textures */
-
-/* Use arrays for regular data. */
-#define kernel_tex_fetch(t, index) t[(index)]
-#define kernel_tex_array(t) (t)
-
-#define kernel_data __data
-
/* Use fast math functions */
#define cosf(x) __cosf(((float)(x)))
@@ -175,4 +118,18 @@ ccl_device_inline uint ccl_num_groups(uint d)
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))
-#endif /* __KERNEL_COMPAT_CUDA_H__ */
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
new file mode 100644
index 00000000000..46196dcdb51
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from CUDA occupancy calculator.
+ *
+ * Terminology
+ * - CUDA GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the amount of registers
+ * used by each threads limits the number of threads per block.
+ */
+
+/* 3.0 and 3.5 */
+#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 5.x, 6.x */
+#elif __CUDA_ARCH__ <= 699
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
+ * registers */
+# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
+# define GPU_KERNEL_MAX_REGISTERS 64
+# else
+# define GPU_KERNEL_MAX_REGISTERS 48
+# endif
+
+/* 7.x, 8.x */
+#elif __CUDA_ARCH__ <= 899
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 512
+# define GPU_KERNEL_MAX_REGISTERS 96
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads, \
+ GPU_MULTIPRESSOR_MAX_REGISTERS / \
+ (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+ GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
new file mode 100644
index 00000000000..169047175f5
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/device/cuda/kernel.cu
index 8afaa686e28..e26fe243642 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,15 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_do_volume.h"
+/* CUDA kernel entry points */
-#define KERNEL_NAME do_volume
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#ifdef __CUDA_ARCH__
+# include "kernel/device/cuda/compat.h"
+# include "kernel/device/cuda/config.h"
+# include "kernel/device/cuda/globals.h"
+
+# include "kernel/device/gpu/image.h"
+# include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/device/gpu/image.h
index 132653fa7ca..b015c78a8f5 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
@@ -61,9 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -81,15 +85,18 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f
float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
- return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) +
- cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1));
+ return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) +
+ cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1));
}
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
-ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+ccl_device_noinline T
+kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -117,10 +124,14 @@ ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x,
float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
- return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) +
- g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1)));
+ return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) +
+ g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1)));
}
#ifdef WITH_NANOVDB
@@ -157,7 +168,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
}
template<typename T>
-ccl_device_inline T kernel_tex_image_interp_nanovdb(
+ccl_device_noinline T kernel_tex_image_interp_nanovdb(
const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -178,7 +189,7 @@ ccl_device_inline T kernel_tex_image_interp_nanovdb(
}
#endif
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -190,8 +201,8 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
return kernel_tex_image_interp_bicubic<float4>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex2D<float4>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_2D<float4>(tex, x, y);
}
}
/* float, byte and half */
@@ -202,15 +213,15 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
f = kernel_tex_image_interp_bicubic<float>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex2D<float>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_2D<float>(tex, x, y);
}
return make_float4(f, f, f, 1.0f);
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -245,8 +256,8 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
return kernel_tex_image_interp_tricubic<float4>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex3D<float4>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z);
}
}
else {
@@ -256,10 +267,12 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
f = kernel_tex_image_interp_tricubic<float>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex3D<float>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z);
}
return make_float4(f, f, f, 1.0f);
}
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
new file mode 100644
index 00000000000..7b79c0aedfa
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -0,0 +1,843 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Common GPU kernels. */
+
+#include "kernel/device/gpu/parallel_active_index.h"
+#include "kernel/device/gpu/parallel_prefix_sum.h"
+#include "kernel/device/gpu/parallel_sorted_index.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_init_from_bake.h"
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_work_stealing.h"
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_reset(int num_states)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ if (state < num_states) {
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_closest(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_shadow(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_subsurface(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_volume_stack(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_background(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_background(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_light(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_light(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_shadow(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_shadow(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_volume(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_volume(NULL, state, render_buffer);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == 0) &&
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_sorted_paths_array(
+ int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+{
+ gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ?
+ INTEGRATOR_STATE(path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) &&
+ ((INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0));
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_state_move(to_state, from_state);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_prefix_sum(int *values, int num_values)
+{
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ uint *num_active_pixels)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / sw;
+ const int x = work_index - y * sw;
+
+ bool converged = true;
+
+ if (x < sw && y < sh) {
+ converged = kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_x(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int y = ccl_gpu_global_id_x();
+
+ if (y < sh) {
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_y(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int x = ccl_gpu_global_id_x();
+
+ if (x < sw) {
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+{
+ const int pixel_index = ccl_gpu_global_id_x();
+
+ if (pixel_index < num_pixels) {
+ kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Film.
+ */
+
+/* Common implementation for float destination. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
+ float *pixels,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int dst_offset,
+ int dst_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+ ccl_global float *pixel = pixels +
+ (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+}
+
+/* Common implementation for half4 destination and 4-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ const int x = render_pixel_index % width;
+ const int y = render_pixel_index / width;
+
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
+ float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+}
+
+/* Common implementation for half4 destination and 3-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+/* Common implementation for half4 destination and single channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float value;
+ processor(kfilm_convert, buffer, &value);
+
+ pixel_rgba[0] = value;
+ pixel_rgba[1] = value;
+ pixel_rgba[2] = value;
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+#define KERNEL_FILM_CONVERT_PROC(name) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
+
+#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
+ (const KernelFilmConvert kfilm_convert, \
+ float *pixels, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_common(&kfilm_convert, \
+ pixels, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ } \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
+ (const KernelFilmConvert kfilm_convert, \
+ uchar4 *rgba, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
+ rgba, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ }
+
+KERNEL_FILM_CONVERT_DEFINE(depth, value)
+KERNEL_FILM_CONVERT_DEFINE(mist, value)
+KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
+KERNEL_FILM_CONVERT_DEFINE(float, value)
+
+KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
+KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
+
+KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
+KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
+KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
+KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
+
+#undef KERNEL_FILM_CONVERT_DEFINE
+#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
+#undef KERNEL_FILM_CONVERT_PROC
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+/* Displacement */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_displace_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* Background Shader Evaluation */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_background_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_preprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float *color_out = buffer + pass_denoised;
+ color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
+ color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
+ color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
+ const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+
+ float pixel_scale;
+ if (render_pass_sample_count == PASS_UNUSED) {
+ pixel_scale = 1.0f / num_samples;
+ }
+ else {
+ pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
+ }
+
+ /* Albedo pass. */
+ if (guiding_pass_albedo != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
+
+ const float *aledo_in = buffer + render_pass_denoising_albedo;
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = aledo_in[0] * pixel_scale;
+ albedo_out[1] = aledo_in[1] * pixel_scale;
+ albedo_out[2] = aledo_in[2] * pixel_scale;
+ }
+
+ /* Normal pass. */
+ if (render_pass_denoising_normal != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
+
+ const float *normal_in = buffer + render_pass_denoising_normal;
+ float *normal_out = guiding_pixel + guiding_pass_normal;
+
+ normal_out[0] = normal_in[0] * pixel_scale;
+ normal_out[1] = normal_in[1] * pixel_scale;
+ normal_out[2] = normal_in[2] * pixel_scale;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
+{
+ kernel_assert(guiding_pass_albedo != PASS_UNUSED);
+
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = 0.5f;
+ albedo_out[1] = 0.5f;
+ albedo_out[2] = 0.5f;
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_postprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float pixel_scale;
+ if (pass_sample_count == PASS_UNUSED) {
+ pixel_scale = num_samples;
+ }
+ else {
+ pixel_scale = __float_as_uint(buffer[pass_sample_count]);
+ }
+
+ float *denoised_pixel = buffer + pass_denoised;
+
+ denoised_pixel[0] *= pixel_scale;
+ denoised_pixel[1] *= pixel_scale;
+ denoised_pixel[2] *= pixel_scale;
+
+ if (num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes)
+ * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
+ * simplifies logic and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer + pass_noisy;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assigning to zero since this is a default alpha value for 3-component passes, and it
+ * is an opaque pixel for 4 component passes. */
+
+ denoised_pixel[3] = 0;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
+ uint *num_possible_splits)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ bool can_split = false;
+
+ if (state < num_states) {
+ can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ }
+}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
new file mode 100644
index 00000000000..85500bf4d07
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active.
+ *
+ * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename IsActiveOp>
+__device__ void gpu_parallel_active_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ IsActiveOp is_active_op)
+{
+ extern ccl_gpu_shared int warp_offset[];
+
+ const uint thread_index = ccl_gpu_thread_idx_x;
+ const uint thread_warp = thread_index % ccl_gpu_warp_size;
+
+ const uint warp_index = thread_index / ccl_gpu_warp_size;
+ const uint num_warps = blocksize / ccl_gpu_warp_size;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
new file mode 100644
index 00000000000..f609520b8b4
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel prefix sum.
+ *
+ * TODO: actually make this work in parallel.
+ *
+ * This is used for an array the size of the number of shaders in the scene
+ * which is not usually huge, so might not be a significant bottleneck. */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+{
+ if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ return;
+ }
+
+ int offset = 0;
+ for (int i = 0; i < num_values; i++) {
+ const int new_offset = offset + values[i];
+ values[i] = offset;
+ offset = new_offset;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
new file mode 100644
index 00000000000..65b1990dbb8
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel sum of array input_data with size n into output_sum.
+ *
+ * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris.
+ *
+ * This version adds multiple elements per thread sequentially. This reduces
+ * the overall cost of the algorithm while keeping the work complexity O(n) and
+ * the step complexity O(log n). (Brent's Theorem optimization) */
+
+#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
+__device__ void gpu_parallel_sum(
+ const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert)
+{
+ extern ccl_gpu_shared OutputT shared_data[];
+
+ const uint tid = ccl_gpu_thread_idx_x;
+  const uint gridsize = blocksize * ccl_gpu_grid_dim_x;
+
+ OutputT sum = zero;
+ for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) {
+ sum += convert(input_data[i]);
+ }
+ shared_data[tid] = sum;
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 512 && tid < 256) {
+ shared_data[tid] = sum = sum + shared_data[tid + 256];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 256 && tid < 128) {
+ shared_data[tid] = sum = sum + shared_data[tid + 128];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 128 && tid < 64) {
+ shared_data[tid] = sum = sum + shared_data[tid + 64];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 64 && tid < 32) {
+ shared_data[tid] = sum = sum + shared_data[tid + 32];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (tid < 32) {
+ for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
+      sum += ccl_gpu_shfl_down_sync(0xFFFFFFFF, sum, offset);
+ }
+ }
+
+ if (tid == 0) {
+ output_sum[ccl_gpu_block_idx_x] = sum;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
new file mode 100644
index 00000000000..99b35468517
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active and sorted by a given key. The prefix sum of the number of active
+ * states per key must have already been computed.
+ *
+ * TODO: there may be ways to optimize this to avoid this many atomic ops? */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+
+template<uint blocksize, typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ int *key_prefix_sum,
+ GetKeyOp get_key_op)
+{
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
+ const int key = (state_index < num_states) ? get_key_op(state_index) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+
+ if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
+ const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
+ indices[index] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/device/optix/compat.h
index 064c99ca100..4e255a135c6 100644
--- a/intern/cycles/kernel/kernel_compat_optix.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -15,14 +15,13 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_OPTIX_H__
-#define __KERNEL_COMPAT_OPTIX_H__
+#pragma once
#define OPTIX_DONT_INCLUDE_CUDA
#include <optix.h>
#define __KERNEL_GPU__
-#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too
+#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
#define __KERNEL_OPTIX__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
@@ -31,14 +30,14 @@
# define ATTR_FALLTHROUGH
#endif
+/* Manual definitions so we can compile without CUDA toolkit. */
+
#ifdef __CUDACC_RTC__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -46,21 +45,6 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#define ccl_device \
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
@@ -69,29 +53,75 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
-#define ccl_loop_no_unroll
#define ccl_restrict __restrict__
-#define ccl_ref
+#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
-// Zero initialize structs to help the compiler figure out scoping
+/* Zero initialize structs to help the compiler figure out scoping */
#define ccl_optional_struct_init = {}
-#define kernel_data __params.data // See kernel_globals.h
-#define kernel_tex_array(t) __params.t
-#define kernel_tex_fetch(t, index) __params.t[(index)]
+/* No assert supported for CUDA */
#define kernel_assert(cond)
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
/* Types */
#include "util/util_half.h"
#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPTIX_H__ */
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
new file mode 100644
index 00000000000..7d898ed5d91
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Launch parameters */
+struct KernelParamsOptiX {
+ /* Kernel arguments */
+ const int *path_index_array;
+ float *render_buffer;
+
+ /* Global scene data and textures */
+ KernelData data;
+#define KERNEL_TEX(type, name) const type *name;
+#include "kernel/kernel_textures.h"
+
+ /* Integrator state */
+ IntegratorStateGPU __integrator_state;
+};
+
+#ifdef __NVCC__
+extern "C" static __constant__ KernelParamsOptiX __params;
+#endif
+
+/* Abstraction macros */
+#define kernel_data __params.data
+#define kernel_tex_array(t) __params.t
+#define kernel_tex_fetch(t, index) __params.t[(index)]
+#define kernel_integrator_state __params.__integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 7f609eab474..c1e36febfc0 100644
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -16,14 +16,20 @@
*/
// clang-format off
-#include "kernel/kernel_compat_optix.h"
-#include "util/util_atomic.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_bake.h"
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
// clang-format on
template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
@@ -53,52 +59,36 @@ template<bool always = false> ccl_device_forceinline uint get_object_id()
return OBJECT_NONE;
}
-extern "C" __global__ void __raygen__kernel_optix_path_trace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
{
- KernelGlobals kg; // Allocate stack storage for common data
-
- const uint3 launch_index = optixGetLaunchIndex();
- // Keep threads for same pixel together to improve occupancy of warps
- uint pixel_offset = launch_index.x / __params.tile.num_samples;
- uint sample_offset = launch_index.x % __params.tile.num_samples;
-
- kernel_path_trace(&kg,
- __params.tile.buffer,
- __params.tile.start_sample + sample_offset,
- __params.tile.x + pixel_offset,
- __params.tile.y + launch_index.y,
- __params.tile.offset,
- __params.tile.stride);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_closest(nullptr, path_index);
}
-#ifdef __BAKING__
-extern "C" __global__ void __raygen__kernel_optix_bake()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_bake_evaluate(&kg,
- p.input,
- p.output,
- (ShaderEvalType)p.type,
- p.filter,
- p.sx + optixGetLaunchIndex().x,
- p.offset,
- p.sample);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_shadow(nullptr, path_index);
}
-#endif
-extern "C" __global__ void __raygen__kernel_optix_displace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_subsurface(nullptr, path_index);
}
-extern "C" __global__ void __raygen__kernel_optix_background()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_volume_stack(nullptr, path_index);
}
extern "C" __global__ void __miss__kernel_optix_miss()
@@ -179,54 +169,91 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
{
#ifdef __SHADOW_RECORD_ALL__
+ bool ignore_intersection = false;
+
const uint prim = optixGetPrimitiveIndex();
# ifdef __VISIBILITY_FLAG__
const uint visibility = optixGetPayload_4();
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
# endif
- // Offset into array with num_hits
- Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2();
- isect->t = optixGetRayTmax();
- isect->prim = prim;
- isect->object = get_object_id();
- isect->type = kernel_tex_fetch(__prim_type, prim);
-
+ float u = 0.0f, v = 0.0f;
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
- isect->u = 1.0f - barycentrics.y - barycentrics.x;
- isect->v = barycentrics.x;
+ u = 1.0f - barycentrics.y - barycentrics.x;
+ v = barycentrics.x;
}
# ifdef __HAIR__
else {
- const float u = __uint_as_float(optixGetAttribute_0());
- isect->u = u;
- isect->v = __uint_as_float(optixGetAttribute_1());
+ u = __uint_as_float(optixGetAttribute_0());
+ v = __uint_as_float(optixGetAttribute_1());
// Filter out curve endcaps
if (u == 0.0f || u == 1.0f) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
}
# endif
+ int num_hits = optixGetPayload_2();
+ int record_index = num_hits;
+ const int max_hits = optixGetPayload_3();
+
+ if (!ignore_intersection) {
+ optixSetPayload_2(num_hits + 1);
+ }
+
+ Intersection *const isect_array = get_payload_ptr_0<Intersection>();
+
# ifdef __TRANSPARENT_SHADOWS__
- // Detect if this surface has a shader with transparent shadows
- if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= optixGetPayload_3()) {
+ if (num_hits >= max_hits) {
+ /* If maximum number of hits reached, find a hit to replace. */
+ const int num_recorded_hits = min(max_hits, num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ if (optixGetRayTmax() >= max_recorded_t) {
+ /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current
+ * hit anymore. */
+ return;
+ }
+
+ record_index = max_recorded_hit;
+ }
# endif
- // This is an opaque hit or the hit limit has been reached, abort traversal
- optixSetPayload_5(true);
- return optixTerminateRay();
+
+ if (!ignore_intersection) {
+ Intersection *const isect = isect_array + record_index;
+ isect->u = u;
+ isect->v = v;
+ isect->t = optixGetRayTmax();
+ isect->prim = prim;
+ isect->object = get_object_id();
+ isect->type = kernel_tex_fetch(__prim_type, prim);
+
+# ifdef __TRANSPARENT_SHADOWS__
+ // Detect if this surface has a shader with transparent shadows
+ if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
+# endif
+ // If no transparent shadows, all light is blocked and we can stop immediately
+ optixSetPayload_5(true);
+ return optixTerminateRay();
# ifdef __TRANSPARENT_SHADOWS__
+ }
+# endif
}
- optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++
-
// Continue tracing
optixIgnoreIntersection();
-# endif
#endif
}
@@ -300,7 +327,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
if (isect.t != FLT_MAX)
isect.t *= len;
- if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
+ if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
optixReportIntersection(isect.t / len,
type & PRIMITIVE_ALL,
__float_as_int(isect.u), // Attribute_0
@@ -317,11 +344,4 @@ extern "C" __global__ void __intersection__curve_ribbon()
optix_intersection_curve(prim, type);
}
}
-
-extern "C" __global__ void __intersection__curve_all()
-{
- const uint prim = optixGetPrimitiveIndex();
- const uint type = kernel_tex_fetch(__prim_type, prim);
- optix_intersection_curve(prim, type);
-}
#endif
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
new file mode 100644
index 00000000000..bf787e29eaa
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2021, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copy of the regular kernels with additional shader ray-tracing kernel that takes
+ * much longer to compile. This is only loaded when needed by the scene. */
+
+#include "kernel/device/optix/kernel.cu"
+#include "kernel/integrator/integrator_shade_surface.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
+}
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
deleted file mode 100644
index b067e53a8bf..00000000000
--- a/intern/cycles/kernel/filter/filter.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_H__
-#define __FILTER_H__
-
-/* CPU Filter Kernel Interface */
-
-#include "util/util_types.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
-#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
-#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
-
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-CCL_NAMESPACE_END
-
-#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
deleted file mode 100644
index 1c0ac5e2cb7..00000000000
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_DEFINES_H__
-#define __FILTER_DEFINES_H__
-
-#define DENOISE_FEATURES 11
-#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES)
-#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2)
-#define XTWY_SIZE (DENOISE_FEATURES + 1)
-
-#define DENOISE_MAX_FRAMES 16
-
-typedef struct TileInfo {
- int offsets[9];
- int strides[9];
- int x[4];
- int y[4];
- int from_render;
- int frames[DENOISE_MAX_FRAMES];
- int num_frames;
- /* TODO(lukas): CUDA doesn't have uint64_t... */
-#ifdef __KERNEL_OPENCL__
- ccl_global float *buffers[9];
-#else
- long long int buffers[9];
-#endif
-} TileInfo;
-
-#ifdef __KERNEL_OPENCL__
-# define CCL_FILTER_TILE_INFO \
- ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \
- ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \
- ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \
- ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \
- ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9
-# define CCL_FILTER_TILE_INFO_ARG \
- tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \
- tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9
-# define ccl_get_tile_buffer(id) \
- (id == 0 ? tile_buffer_1 : \
- id == 1 ? tile_buffer_2 : \
- id == 2 ? tile_buffer_3 : \
- id == 3 ? tile_buffer_4 : \
- id == 4 ? tile_buffer_5 : \
- id == 5 ? tile_buffer_6 : \
- id == 6 ? tile_buffer_7 : \
- id == 7 ? tile_buffer_8 : \
- tile_buffer_9)
-#else
-# ifdef __KERNEL_CUDA__
-# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info
-# else
-# define CCL_FILTER_TILE_INFO TileInfo *tile_info
-# endif
-# define ccl_get_tile_buffer(id) (tile_info->buffers[id])
-#endif
-
-#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
deleted file mode 100644
index 8a2af957146..00000000000
--- a/intern/cycles/kernel/filter/filter_features.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always
- * points to the current pixel in the first pass. Repeat the loop for every secondary frame if
- * there are any. */
-#define FOR_PIXEL_WINDOW \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
-
-#define END_FOR_PIXEL_WINDOW \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *features,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- features[0] = pixel.x;
- features[1] = pixel.y;
- features[2] = fabsf(ccl_get_feature(buffer, 0));
- features[3] = ccl_get_feature(buffer, 1);
- features[4] = ccl_get_feature(buffer, 2);
- features[5] = ccl_get_feature(buffer, 3);
- features[6] = ccl_get_feature(buffer, 4);
- features[7] = ccl_get_feature(buffer, 5);
- features[8] = ccl_get_feature(buffer, 6);
- features[9] = ccl_get_feature(buffer, 7);
- if (use_time) {
- features[10] = pixel.z;
- }
- if (mean) {
- for (int i = 0; i < (use_time ? 11 : 10); i++) {
- features[i] -= mean[i];
- }
- }
-}
-
-ccl_device_inline void filter_get_feature_scales(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *scales,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabsf(pixel.x - mean[0]);
- scales[1] = fabsf(pixel.y - mean[1]);
- scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
- scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
- ccl_get_feature(buffer, 2) - mean[4],
- ccl_get_feature(buffer, 3) - mean[5]));
- scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
- scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
- ccl_get_feature(buffer, 6) - mean[8],
- ccl_get_feature(buffer, 7) - mean[9]));
- if (use_time) {
- scales[6] = fabsf(pixel.z - mean[10]);
- }
-}
-
-ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
-{
- scale[0] = 1.0f / max(scale[0], 0.01f);
- scale[1] = 1.0f / max(scale[1], 0.01f);
- scale[2] = 1.0f / max(scale[2], 0.01f);
- if (use_time) {
- scale[10] = 1.0f / max(scale[6], 0.01f);
- }
- scale[6] = 1.0f / max(scale[4], 0.01f);
- scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f);
- scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f);
-}
-
-ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
- int pass_stride)
-{
- return make_float3(
- ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
-}
-
-ccl_device_inline void design_row_add(float *design_row,
- int rank,
- const ccl_global float *ccl_restrict transform,
- int stride,
- int row,
- float feature,
- int transform_row_stride)
-{
- for (int i = 0; i < rank; i++) {
- design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature;
- }
-}
-
-/* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(
- int3 p_pixel,
- const ccl_global float *ccl_restrict p_buffer,
- int3 q_pixel,
- const ccl_global float *ccl_restrict q_buffer,
- int pass_stride,
- int rank,
- float *design_row,
- const ccl_global float *ccl_restrict transform,
- int stride,
- bool use_time)
-{
- int num_features = use_time ? 11 : 10;
-
- design_row[0] = 1.0f;
- math_vector_zero(design_row + 1, rank);
-
-#define DESIGN_ROW_ADD(I, F) \
- design_row_add(design_row, rank, transform, stride, I, F, num_features);
- DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
- DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
- DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
- DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
- DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
- DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
- DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
- DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
- DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
- DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
- if (use_time) {
- DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
- }
-#undef DESIGN_ROW_ADD
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
deleted file mode 100644
index 59d4ace2bef..00000000000
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
- * pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set
- * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW_SSE \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- float4 t4 = make_float4(pixel.z); \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- float4 y4 = make_float4(pixel.y); \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
- int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *features,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- int num_features = use_time ? 11 : 10;
-
- features[0] = x;
- features[1] = y;
- features[2] = fabs(ccl_get_feature_sse(0));
- features[3] = ccl_get_feature_sse(1);
- features[4] = ccl_get_feature_sse(2);
- features[5] = ccl_get_feature_sse(3);
- features[6] = ccl_get_feature_sse(4);
- features[7] = ccl_get_feature_sse(5);
- features[8] = ccl_get_feature_sse(6);
- features[9] = ccl_get_feature_sse(7);
- if (use_time) {
- features[10] = t;
- }
-
- if (mean) {
- for (int i = 0; i < num_features; i++) {
- features[i] = features[i] - mean[i];
- }
- }
- for (int i = 0; i < num_features; i++) {
- features[i] = mask(active_pixels, features[i]);
- }
-}
-
-ccl_device_inline void filter_get_feature_scales_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *scales,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabs(x - mean[0]);
- scales[1] = fabs(y - mean[1]);
- scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
- scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) +
- sqr(ccl_get_feature_sse(3) - mean[5]);
- scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
- scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) +
- sqr(ccl_get_feature_sse(7) - mean[9]);
- if (use_time) {
- scales[6] = fabs(t - mean[10]);
- }
-
- for (int i = 0; i < (use_time ? 7 : 6); i++)
- scales[i] = mask(active_pixels, scales[i]);
-}
-
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
-{
- scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
- scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
- scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
- if (use_time) {
- scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
- }
- scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
- scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
- scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
deleted file mode 100644
index 2ef03dc0a02..00000000000
--- a/intern/cycles/kernel/filter/filter_kernel.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_color.h"
-#include "util/util_math.h"
-#include "util/util_math_fast.h"
-#include "util/util_texture.h"
-
-#include "util/util_atomic.h"
-#include "util/util_math_matrix.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "kernel/filter/filter_features.h"
-#ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_features_sse.h"
-#endif
-
-#include "kernel/filter/filter_prefilter.h"
-
-#ifdef __KERNEL_GPU__
-# include "kernel/filter/filter_transform_gpu.h"
-#else
-# ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_transform_sse.h"
-# else
-# include "kernel/filter/filter_transform.h"
-# endif
-#endif
-
-#include "kernel/filter/filter_reconstruction.h"
-
-#ifdef __KERNEL_CPU__
-# include "kernel/filter/filter_nlm_cpu.h"
-#else
-# include "kernel/filter/filter_nlm_gpu.h"
-#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
deleted file mode 100644
index 24200c29203..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs))))
-#define load4_u(buf, ofs) load_float4((buf) + (ofs))
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx,
- int dy,
- const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- /* Strides need to be aligned to 16 bytes. */
- kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
-
- int aligned_lowx = rect.x & (~3);
- const int numChannels = (channel_offset > 0) ? 3 : 1;
- const float4 channel_fac = make_float4(1.0f / numChannels);
-
- for (int y = rect.y; y < rect.w; y++) {
- int idx_p = y * stride + aligned_lowx;
- int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset;
- for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
- float4 diff = make_float4(0.0f);
- float4 scale_fac;
- if (scale_image) {
- scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
- make_float4(0.25f),
- make_float4(4.0f));
- }
- else {
- scale_fac = make_float4(1.0f);
- }
- for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
- /* idx_p is guaranteed to be aligned, but idx_q isn't. */
- float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
- float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs);
- float4 cdiff = color_p - color_q;
- float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
- float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs);
- diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) /
- (make_float4(1e-8f) + k_2 * (var_p + var_q));
- }
- load4_a(difference_image, idx_p) = diff * channel_fac;
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- for (int y1 = low; y1 < high; y1++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x);
- }
- }
- float fac = 1.0f / (high - low);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) *= fac;
- }
- }
-}
-
-ccl_device_inline void nlm_blur_horizontal(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- }
-
- for (int dx = -f; dx <= f; dx++) {
- aligned_lowx = round_down(rect.x - min(0, dx), 4);
- int highx = rect.z - max(0, dx);
- int4 lowx4 = make_int4(rect.x - min(0, dx));
- int4 highx4 = make_int4(rect.z - max(0, dx));
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < highx; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= lowx4) & (x4 < highx4);
-
- float4 diff = load4_u(difference_image, y * stride + x + dx);
- load4_a(out_image, y * stride + x) += mask(active, diff);
- }
- }
- }
-
- aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
- float4 low = max(make_float4(rect.x), x4 - make_float4(f));
- float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1));
- load4_a(out_image, y * stride + x) *= rcp(high - low);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = fast_expf4(
- -max(load4_a(out_image, y * stride + x), make_float4(0.0f)));
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int dx,
- int dy,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
-
- float4 weight = load4_a(temp_image, idx_p);
- load4_a(accum_image, idx_p) += mask(active, weight);
-
- float4 val = load4_u(image, idx_q);
- if (channel_offset) {
- val += load4_u(image, idx_q + channel_offset);
- val += load4_u(image, idx_q + 2 * channel_offset);
- val *= 1.0f / 3.0f;
- }
-
- load4_a(out_image, idx_p) += mask(active, weight * val);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
- int dy,
- int t,
- const float *ccl_restrict
- difference_image,
- const float *ccl_restrict buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
- int4 clip_area = rect_clip(rect, filter_window);
- /* fy and fy are in filter-window-relative coordinates,
- * while x and y are in feature-window-relative coordinates. */
- for (int y = clip_area.y; y < clip_area.w; y++) {
- for (int x = clip_area.x; x < clip_area.z; x++) {
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- float *l_transform = transform + storage_ofs * TRANSFORM_SIZE;
- float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE;
- float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE;
- int *l_rank = rank + storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- 1,
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- l_transform,
- l_rank,
- weight,
- l_XtWX,
- l_XtWY,
- 0);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int4 rect,
- int w)
-{
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- out_image[y * w + x] /= accum_image[y * w + x];
- }
- }
-}
-
-#undef load4_a
-#undef load4_u
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
deleted file mode 100644
index 650c743f34f..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Determines pixel coordinates and offset for the current thread.
- * Returns whether the thread should do any work.
- *
- * All coordinates are relative to the denoising buffer!
- *
- * Window is the rect that should be processed.
- * co is filled with (x, y, dx, dy).
- */
-ccl_device_inline bool get_nlm_coords_window(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
-{
- /* Determine the pixel offset that this thread should apply. */
- int s = 2 * r + 1;
- int si = ccl_global_id(1);
- int sx = si % s;
- int sy = si / s;
- if (sy >= s) {
- return false;
- }
-
- /* Pixels still need to lie inside the denoising buffer after applying the offset,
- * so determine the area for which this is the case. */
- int dx = sx - r;
- int dy = sy - r;
-
- *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
-
- /* Find the intersection of the area that we want to process (window) and the area
- * that can be processed (rect) to get the final area for this offset. */
- int4 clip_area = rect_clip(window, *rect);
-
- /* If the radius is larger than one of the sides of the window,
- * there will be shifts for which there is no usable pixel at all. */
- if (!rect_is_valid(clip_area)) {
- return false;
- }
-
- /* Map the linear thread index to pixels inside the clip area. */
- int x, y;
- if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
- return false;
- }
-
- *co = make_int4(x, y, dx, dy);
-
- *ofs = (sy * s + sx) * stride;
-
- return true;
-}
-
-ccl_device_inline bool get_nlm_coords(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
-{
- return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(
- int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
- int numChannels = channel_offset ? 3 : 1;
-
- float diff = 0.0f;
- float scale_fac = 1.0f;
- if (scale_image) {
- scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
- }
-
- for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
- float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
- float pvar = variance_image[idx_p];
- float qvar = sqr(scale_fac) * variance_image[idx_q];
- diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
- }
- if (numChannels > 1) {
- diff *= 1.0f / numChannels;
- }
- difference_image[y * stride + x] = diff;
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int y1 = low; y1 < high; y1++) {
- sum += difference_image[y1 * stride + x];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = sum;
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict
- difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
- if (out_image) {
- atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
- float val = image[idx_q];
- if (channel_offset) {
- val += image[idx_q + channel_offset];
- val += image[idx_q + 2 * channel_offset];
- val *= 1.0f / 3.0f;
- }
- atomic_add_and_fetch_float(out_image + idx_p, sum * val);
- }
- else {
- accum_image[idx_p] = sum;
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(
- int x,
- int y,
- int dx,
- int dy,
- int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time,
- int localIdx)
-{
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- /* Reconstruction data is only stored for pixels inside the filter window,
- * so compute the pixels's index in there. */
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- transform += storage_ofs;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- rect_size(filter_window),
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- transform,
- rank,
- weight,
- XtWX,
- XtWY,
- localIdx);
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(int x,
- int y,
- ccl_global float *out_image,
- const ccl_global float *ccl_restrict
- accum_image,
- int stride)
-{
- out_image[y * stride + x] /= accum_image[y * stride + x];
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
deleted file mode 100644
index 97cecba190e..00000000000
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/**
- * First step of the shadow prefiltering, performs the shadow division and stores all data
- * in a nice and easy rectangular array that can be passed to the NLM filter.
- *
- * Calculates:
- * \param unfiltered: Contains the two half images of the shadow feature pass
- * \param sampleVariance: The sample-based variance calculated in the kernel.
- * Note: This calculation is biased in general,
- * and especially here since the variance of the ratio can only be approximated.
- * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy
- * (since it's essentially the buffer variance of the two variance halves)
- * \param bufferVariance: The buffer-based variance of the shadow feature.
- * Unbiased, but quite noisy.
- */
-ccl_device void kernel_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
-
- int offset = tile_info->offsets[tile];
- int stride = tile_info->strides[tile];
- const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer(
- tile);
- center_buffer += (y * stride + x + offset) * buffer_pass_stride;
- center_buffer += buffer_denoising_offset + 14;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
- unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
- unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
-
- float varA = center_buffer[2];
- float varB = center_buffer[5];
- int odd_sample = (sample + 1) / 2;
- int even_sample = sample / 2;
-
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample);
- varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample);
-
- varA /= max(odd_sample - 1, 1);
- varB /= max(even_sample - 1, 1);
-
- sampleVariance[idx] = 0.5f * (varA + varB) / sample;
- sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample);
- bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) *
- (unfilteredA[idx] - unfilteredB[idx]);
-}
-
-/* Load a regular feature from the render buffers into the denoise buffer.
- * Parameters:
- * - sample: The sample amount in the buffer, used to normalize the buffer.
- * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
- * - x, y: Current pixel
- * - mean, variance: Target denoise buffers.
- * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
- */
-ccl_device void kernel_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
- ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) +
- (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) *
- buffer_pass_stride +
- buffer_denoising_offset;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float val = scale * center_buffer[m_offset];
- mean[idx] = val;
-
- if (v_offset >= 0) {
- if (sample > 1) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- variance[idx] = max(
- 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1)));
- }
- else {
- /* Can't compute variance with single sample, just set it very high. */
- variance[idx] = 1e10f;
- }
- }
-}
-
-ccl_device void kernel_filter_write_feature(int sample,
- int x,
- int y,
- int4 buffer_params,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 rect)
-{
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- combined_buffer[out_offset] = from[idx];
-}
-
-#define GET_COLOR(image) \
- make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride])
-#define SET_COLOR(image, color) \
- image[idx] = color.x; \
- image[idx + pass_stride] = color.y; \
- image[idx + 2 * pass_stride] = color.z
-
-ccl_device void kernel_filter_detect_outliers(int x,
- int y,
- ccl_global float *in,
- ccl_global float *variance_out,
- ccl_global float *depth,
- ccl_global float *image_out,
- int4 rect,
- int pass_stride)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- ccl_global float *image_in = in;
- ccl_global float *variance_in = in + 3 * pass_stride;
-
- int n = 0;
- float values[25];
- float pixel_variance, max_variance = 0.0f;
- for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) {
- for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) {
- int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x);
- float3 color = GET_COLOR(image_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- float L = average(color);
-
- /* Find the position of L. */
- int i;
- for (i = 0; i < n; i++) {
- if (values[i] > L)
- break;
- }
- /* Make space for L by shifting all following values to the right. */
- for (int j = n; j > i; j--) {
- values[j] = values[j - 1];
- }
- /* Insert L. */
- values[i] = L;
- n++;
-
- float3 pixel_var = GET_COLOR(variance_in);
- float var = average(pixel_var);
- if ((x1 == x) && (y1 == y)) {
- pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f :
- var;
- }
- else {
- max_variance = max(max_variance, var);
- }
- }
- }
-
- max_variance += 1e-4f;
-
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float3 color = GET_COLOR(image_in);
- float3 variance = GET_COLOR(variance_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- variance = max(variance, make_float3(0.0f, 0.0f, 0.0f));
-
- float L = average(color);
-
- float ref = 2.0f * values[(int)(n * 0.75f)];
-
- /* Slightly offset values to avoid false positives in (almost) black areas. */
- max_variance += 1e-5f;
- ref -= 1e-5f;
-
- if (L > ref) {
- /* The pixel appears to be an outlier.
- * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is
- * that the pixel should actually be at the reference value: If the reference is within the
- * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very
- * unlikely that the pixel should be darker, which indicates a legitimate highlight.
- */
-
- if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
- depth[idx] = -depth[idx];
- color *= ref / L;
- variance = make_float3(max_variance, max_variance, max_variance);
- }
- else {
- float stddev = sqrtf(pixel_variance);
- if (L - 3 * stddev < ref) {
- /* The pixel is an outlier, so negate the depth value to mark it as one.
- * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM
- * weights. */
- depth[idx] = -depth[idx];
- float fac = ref / L;
- color *= fac;
- variance *= sqr(fac);
- }
- }
- }
-
- /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results.
- * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we
- * scale by the square of that (since we have variance instead of standard deviation). */
- color = color_highlight_compress(color, &variance);
-
- SET_COLOR(image_out, color);
- SET_COLOR(variance_out, variance);
-}
-
-#undef GET_COLOR
-#undef SET_COLOR
-
-/* Combine A/B buffers.
- * Calculates the combined mean and the buffer variance. */
-ccl_device void kernel_filter_combine_halves(int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 rect,
- int r)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- if (mean)
- mean[idx] = 0.5f * (a[idx] + b[idx]);
- if (variance) {
- if (r == 0)
- variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]);
- else {
- variance[idx] = 0.0f;
- float values[25];
- int numValues = 0;
- for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) {
- for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) {
- int pidx = (py - rect.y) * buffer_w + (px - rect.x);
- values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]);
- }
- }
- /* Insertion-sort the variances (fast enough for 25 elements). */
- for (int i = 1; i < numValues; i++) {
- float v = values[i];
- int j;
- for (j = i - 1; j >= 0 && values[j] > v; j--)
- values[j + 1] = values[j];
- values[j + 1] = v;
- }
- variance[idx] = values[(7 * numValues) / 8];
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
deleted file mode 100644
index 17941689ad5..00000000000
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_filter_construct_gramian(int x,
- int y,
- int storage_stride,
- int dx,
- int dy,
- int t,
- int buffer_stride,
- int pass_stride,
- int frame_offset,
- bool use_time,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict
- transform,
- ccl_global int *rank,
- float weight,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int localIdx)
-{
- if (weight < 1e-3f) {
- return;
- }
-
- int p_offset = y * buffer_stride + x;
- int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset;
-
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1);
-#else
- float design_row[DENOISE_FEATURES + 1];
-#endif
-
- float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
-
- /* If the pixel was flagged as an outlier during prefiltering, skip it. */
- if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
- return;
- }
-
- filter_get_design_row_transform(make_int3(x, y, t),
- buffer + p_offset,
- make_int3(x + dx, y + dy, t),
- buffer + q_offset,
- pass_stride,
- *rank,
- design_row,
- transform,
- stride,
- use_time);
-
-#ifdef __KERNEL_GPU__
- math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride);
- math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride);
-#else
- math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight);
- math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color);
-#endif
-}
-
-ccl_device_inline void kernel_filter_finalize(int x,
- int y,
- ccl_global float *buffer,
- ccl_global int *rank,
- int storage_stride,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 buffer_params,
- int sample)
-{
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
- if (XtWX[0] < 1e-3f) {
- /* There is not enough information to determine a denoised result.
- * As a fallback, keep the original value of the pixel. */
- return;
- }
-
- /* The weighted average of pixel colors (essentially, the NLM-filtered image).
- * In case the solution of the linear model fails due to numerical issues or
- * returns nonsensical negative values, fall back to this value. */
- float3 mean_color = XtWY[0] / XtWX[0];
-
- math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);
-
- float3 final_color = XtWY[0];
- if (!isfinite3_safe(final_color) ||
- (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) {
- final_color = mean_color;
- }
-
- /* Clamp pixel value to positive values and reverse the highlight compression transform. */
- final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f)));
-
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
- if (buffer_params.w >= 0) {
- final_color *= sample;
- if (buffer_params.w > 0) {
- final_color.x += combined_buffer[buffer_params.w + 0];
- final_color.y += combined_buffer[buffer_params.w + 1];
- final_color.z += combined_buffer[buffer_params.w + 2];
- }
- }
- combined_buffer[0] = final_color.x;
- combined_buffer[1] = final_color.y;
- combined_buffer[2] = final_color.z;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
deleted file mode 100644
index 880a661214e..00000000000
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float features[DENOISE_FEATURES];
-
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < (*rank); i++) {
- math_vector_mul(transform + i * num_features, feature_scale, num_features);
- }
- math_matrix_transpose(transform, num_features, 1);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
deleted file mode 100644
index ec258a5212a..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- ccl_global float *transform,
- ccl_global int *rank,
- int radius,
- float pca_threshold,
- int transform_stride,
- int localIdx)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES;
-#else
- float features[DENOISE_FEATURES];
-#endif
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
- const ccl_global float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, transform_stride);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- for (int j = 0; j < (*rank); j++) {
- transform[(i * num_features + j) * transform_stride] *= feature_scale[i];
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
deleted file mode 100644
index 0304d990f9f..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float4 features[DENOISE_FEATURES];
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float4 feature_means[DENOISE_FEATURES];
- math_vector_zero_sse(feature_means, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add_sse(feature_means, num_features, features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float4 pixel_scale = make_float4(1.0f / num_pixels);
- for (int i = 0; i < num_features; i++) {
- feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
- }
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float4 feature_scale[DENOISE_FEATURES];
- math_vector_zero_sse(feature_scale, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_feature_scales_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max_sse(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- filter_calculate_scale_sse(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero_sse(feature_matrix_sse, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul_sse(features, num_features, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
-
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, 1);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 5ff4d5f7053..4de824cc277 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/geom/geom_attribute.h"
#include "kernel/geom/geom_object.h"
@@ -31,4 +33,5 @@
#include "kernel/geom/geom_curve_intersect.h"
#include "kernel/geom/geom_volume.h"
#include "kernel/geom/geom_primitive.h"
+#include "kernel/geom/geom_shader_data.h"
// clang-format on
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index b37797ac21b..9532a21fec7 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Attributes
@@ -25,9 +27,9 @@ CCL_NAMESPACE_BEGIN
* Lookup of attributes is different between OSL and SVM, as OSL is ustring
* based while for SVM we use integer ids. */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd);
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd);
-ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint attribute_primitive_type(const KernelGlobals *kg, const ShaderData *sd)
{
if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
return ATTR_PRIM_SUBD;
@@ -46,12 +48,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
/* Find attribute based on ID */
-ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_attribute_map_offset(const KernelGlobals *kg, int object)
{
return kernel_tex_fetch(__objects, object).attribute_map_offset;
}
-ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
+ccl_device_inline AttributeDescriptor find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id)
{
@@ -98,7 +100,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
/* Transform matrix attribute on meshes */
-ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg,
+ccl_device Transform primitive_attribute_matrix(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index b5a62a31ca9..a827a67ce7a 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve Primitive
@@ -25,8 +27,11 @@ CCL_NAMESPACE_BEGIN
/* Reading attributes on various curve elements */
-ccl_device float curve_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float curve_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
@@ -64,7 +69,7 @@ ccl_device float curve_attribute_float(
}
}
-ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
+ccl_device float2 curve_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -110,7 +115,7 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
+ccl_device float3 curve_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -152,7 +157,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
+ccl_device float4 curve_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
@@ -196,7 +201,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
/* Curve thickness */
-ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+ccl_device float curve_thickness(const KernelGlobals *kg, const ShaderData *sd)
{
float r = 0.0f;
@@ -224,7 +229,7 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
/* Curve location for motion pass, linear interpolation between keys and
* ignoring radius because we do the same for the motion keys */
-ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_motion_center_location(const KernelGlobals *kg, const ShaderData *sd)
{
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -240,7 +245,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
/* Curve tangent normal */
-ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_tangent_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 tgN = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index e25bf5b4660..213f3e62ee0 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve primitive intersection functions.
@@ -167,6 +169,7 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co
}
ccl_device bool curve_intersect_iterative(const float3 ray_dir,
+ float *ray_tfar,
const float dt,
const float4 curve[4],
float u,
@@ -230,7 +233,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
if (fabsf(f) < f_err && fabsf(g) < g_err) {
t += dt;
- if (!(0.0f <= t && t <= isect->t)) {
+ if (!(0.0f <= t && t <= *ray_tfar)) {
return false; /* Rejects NaNs */
}
if (!(u >= 0.0f && u <= 1.0f)) {
@@ -247,6 +250,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
}
/* Record intersection. */
+ *ray_tfar = t;
isect->t = t;
isect->u = u;
isect->v = 0.0f;
@@ -259,6 +263,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
ccl_device bool curve_intersect_recursive(const float3 ray_orig,
const float3 ray_dir,
+ float ray_tfar,
float4 curve[4],
Intersection *isect)
{
@@ -339,7 +344,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
}
/* Intersect with cap-planes. */
- float2 tp = make_float2(-dt, isect->t - dt);
+ float2 tp = make_float2(-dt, ray_tfar - dt);
tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
const float2 h0 = half_plane_intersect(
float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
@@ -402,19 +407,19 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
}
else {
recurse = true;
}
}
- if (valid1 && (tp1.x + dt <= isect->t)) {
+ if (valid1 && (tp1.x + dt <= ray_tfar)) {
const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
}
else {
recurse = true;
@@ -542,7 +547,7 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
ccl_device_inline bool ribbon_intersect(const float3 ray_org,
const float3 ray_dir,
- const float ray_tfar,
+ float ray_tfar,
const int N,
float4 curve[4],
Intersection *isect)
@@ -590,7 +595,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
/* Intersect quad. */
float vu, vv, vt;
- bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt);
+ bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt);
if (valid0) {
/* ignore self intersections */
@@ -604,6 +609,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
vv = 2.0f * vv - 1.0f;
/* Record intersection. */
+ ray_tfar = vt;
isect->t = vt;
isect->u = u + vu * step_size;
isect->v = vv;
@@ -619,10 +625,11 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
return false;
}
-ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
+ccl_device_forceinline bool curve_intersect(const KernelGlobals *kg,
Intersection *isect,
const float3 P,
const float3 dir,
+ const float tmax,
uint visibility,
int object,
int curveAddr,
@@ -672,7 +679,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* todo: adaptive number of subdivisions could help performance here. */
const int subdivisions = kernel_data.bvh.curve_subdivisions;
- if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) {
+ if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -682,7 +689,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
return false;
}
else {
- if (curve_intersect_recursive(P, dir, curve, isect)) {
+ if (curve_intersect_recursive(P, dir, tmax, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -693,28 +700,23 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
+ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float t = isect->t;
- float3 P = ray->P;
- float3 D = ray->D;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
D = normalize_len(D, &t);
}
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ int prim = kernel_tex_fetch(__prim_index, isect_prim);
float4 v00 = kernel_tex_fetch(__curves, prim);
int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -735,23 +737,20 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
}
- sd->u = isect->u;
-
P = P + D * t;
- const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u);
+ const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u);
const float3 dPdu = float4_to_float3(dPdu4);
if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* Rounded smooth normals for ribbons, to approximate thick curve shape. */
const float3 tangent = normalize(dPdu);
const float3 bitangent = normalize(cross(tangent, -D));
- const float sine = isect->v;
+ const float sine = sd->v;
const float cosine = safe_sqrtf(1.0f - sine * sine);
sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
sd->Ng = -D;
- sd->v = isect->v;
# if 0
/* This approximates the position and geometric normal of a thick curve too,
@@ -765,7 +764,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
/* Thick curves, compute normal using direction from inside the curve.
* This could be optimized by recording the normal in the intersection,
* however for Optix this would go beyond the size of the payload. */
- const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
+ const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
const float3 Ng = normalize(P - P_inside);
sd->N = Ng;
@@ -779,13 +778,8 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
sd->dPdv = cross(dPdu, sd->Ng);
# endif
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 0f66f4af755..5294da03145 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Motion Curve Primitive
@@ -25,7 +27,7 @@ CCL_NAMESPACE_BEGIN
#ifdef __HAIR__
-ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_curve_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -50,7 +52,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step_linear(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -78,7 +80,7 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
/* return 2 curve key locations */
ccl_device_inline void motion_curve_keys_linear(
- KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
+ const KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
{
/* get motion info */
int numsteps, numkeys;
@@ -105,7 +107,7 @@ ccl_device_inline void motion_curve_keys_linear(
keys[1] = (1.0f - t) * keys[1] + t * next_keys[1];
}
-ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -138,7 +140,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
}
/* return 2 curve key locations */
-ccl_device_inline void motion_curve_keys(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys(const KernelGlobals *kg,
int object,
int prim,
float time,
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 53d6b92dd7e..eb4a39e062b 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -25,11 +25,13 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Time interpolation of vertex positions and normals */
-ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -49,7 +51,7 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_verts_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -76,7 +78,7 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
}
}
-ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_normals_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -104,7 +106,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
}
ccl_device_inline void motion_triangle_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
{
/* get motion info */
int numsteps, numverts;
@@ -134,7 +136,7 @@ ccl_device_inline void motion_triangle_vertices(
}
ccl_device_inline float3 motion_triangle_smooth_normal(
- KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
+ const KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
{
/* get motion info */
int numsteps, numverts;
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index 859d919f0bb..ec7e4b07d76 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Refine triangle intersection to more precise hit point. For rays that travel
@@ -32,23 +34,21 @@ CCL_NAMESPACE_BEGIN
* a closer distance.
*/
-ccl_device_inline float3 motion_triangle_refine(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+ccl_device_inline float3 motion_triangle_refine(const KernelGlobals *kg,
+ ShaderData *sd,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
+ float3 verts[3])
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -70,13 +70,8 @@ ccl_device_inline float3 motion_triangle_refine(
/* Compute refined position. */
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -86,7 +81,7 @@ ccl_device_inline float3 motion_triangle_refine(
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space
+/* Same as above, except that t is assumed to be in object space
* for instancing.
*/
@@ -97,27 +92,22 @@ ccl_device_noinline
ccl_device_inline
# endif
float3
- motion_triangle_refine_local(KernelGlobals *kg,
+ motion_triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
float3 verts[3])
{
# ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return motion_triangle_refine(kg, sd, isect, ray, verts);
+ /* t is always in world space with OptiX. */
+ return motion_triangle_refine(kg, sd, P, D, t, isect_object, isect_prim, verts);
# else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
# ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -138,13 +128,8 @@ ccl_device_inline
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -160,10 +145,11 @@ ccl_device_inline
* time and do a ray intersection with the resulting triangle.
*/
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
float time,
uint visibility,
int object,
@@ -179,7 +165,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
(ssef *)verts,
#else
@@ -215,7 +201,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
* Returns whether traversal should be stopped.
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 7a91f8041f7..85c4f0ca522 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Setup of motion triangle specific parts of ShaderData, moved into this one
@@ -32,8 +34,14 @@ CCL_NAMESPACE_BEGIN
* normals */
/* return 3 triangle vertex normals */
-ccl_device_noinline void motion_triangle_shader_setup(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool is_local)
+ccl_device_noinline void motion_triangle_shader_setup(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 P,
+ const float3 D,
+ const float ray_t,
+ const int isect_object,
+ const int isect_prim,
+ bool is_local)
{
/* Get shader. */
sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
@@ -63,12 +71,12 @@ ccl_device_noinline void motion_triangle_shader_setup(
/* Compute refined position. */
#ifdef __BVH_LOCAL__
if (is_local) {
- sd->P = motion_triangle_refine_local(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine_local(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
else
#endif /* __BVH_LOCAL__*/
{
- sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
/* Compute face normal. */
float3 Ng;
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index fe73335a335..7d6ad7b4fe3 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -22,6 +22,8 @@
* directly primitives in the BVH with world space locations applied, and the object
* ID is looked up afterwards. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Object attributes, for now a fixed size and contents */
@@ -35,7 +37,7 @@ enum ObjectVectorTransform { OBJECT_PASS_MOTION_PRE = 0, OBJECT_PASS_MOTION_POST
/* Object to world space transformation */
-ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform(const KernelGlobals *kg,
int object,
enum ObjectTransform type)
{
@@ -49,7 +51,7 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
/* Lamp to world space transformation */
-ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse)
+ccl_device_inline Transform lamp_fetch_transform(const KernelGlobals *kg, int lamp, bool inverse)
{
if (inverse) {
return kernel_tex_fetch(__lights, lamp).itfm;
@@ -61,7 +63,7 @@ ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bo
/* Object to world space transformation for motion vectors */
-ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_motion_pass_transform(const KernelGlobals *kg,
int object,
enum ObjectVectorTransform type)
{
@@ -72,7 +74,7 @@ ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg
/* Motion blurred object transformations */
#ifdef __OBJECT_MOTION__
-ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion(const KernelGlobals *kg,
int object,
float time)
{
@@ -86,7 +88,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
return tfm;
}
-ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion_test(const KernelGlobals *kg,
int object,
float time,
Transform *itfm)
@@ -111,45 +113,79 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
}
#endif
+/* Get transform matrix for shading point. */
+
+ccl_device_inline Transform object_get_transform(const KernelGlobals *kg, const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_tfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#endif
+}
+
+ccl_device_inline Transform object_get_inverse_transform(const KernelGlobals *kg,
+ const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_itfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+}
/* Transform position from object to world space */
-ccl_device_inline void object_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_tfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_tfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform position from world to object space */
-ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_itfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_itfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform normal from world to object space */
-ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_normal_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *N)
{
#ifdef __OBJECT_MOTION__
- if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
- *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm_motion, *N));
+ }
+ return;
}
-#else
+#endif
+
if (sd->object != OBJECT_NONE) {
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
@@ -158,65 +194,79 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
*N = normalize(transform_direction_transposed(&tfm, *N));
}
-#endif
}
/* Transform normal from object to world space */
-ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
+ccl_device_inline void object_normal_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *N)
{
#ifdef __OBJECT_MOTION__
- *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm_motion, *N));
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
-#endif
}
/* Transform direction vector from object to world space */
-ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
+ccl_device_inline void object_dir_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_tfm, *D);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_tfm_motion, *D);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*D = transform_direction(&tfm, *D);
-#endif
}
/* Transform direction vector from world to object space */
-ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_dir_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_itfm, *D);
-#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- *D = transform_direction(&tfm, *D);
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_itfm_motion, *D);
+ return;
+ }
#endif
+
+ const Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+ *D = transform_direction(&tfm, *D);
}
/* Object center position */
-ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline float3 object_location(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
#ifdef __OBJECT_MOTION__
- return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ return make_float3(sd->ob_tfm_motion.x.w, sd->ob_tfm_motion.y.w, sd->ob_tfm_motion.z.w);
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-#endif
}
/* Color of the object */
-ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_color(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -227,7 +277,7 @@ ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
/* Pass ID number of object */
-ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_pass_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -237,7 +287,7 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
/* Per lamp random number for shader variation */
-ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
+ccl_device_inline float lamp_random_number(const KernelGlobals *kg, int lamp)
{
if (lamp == LAMP_NONE)
return 0.0f;
@@ -247,7 +297,7 @@ ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
/* Per object random number for shader variation */
-ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
+ccl_device_inline float object_random_number(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -257,7 +307,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
/* Particle ID from which this object was generated */
-ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
+ccl_device_inline int object_particle_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -267,7 +317,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
/* Generated texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_generated(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -279,7 +329,7 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
/* UV texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_uv(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -291,7 +341,7 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
/* Information about mesh for motion blurred triangles and curves */
ccl_device_inline void object_motion_info(
- KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
+ const KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
{
if (numkeys) {
*numkeys = kernel_tex_fetch(__objects, object).numkeys;
@@ -305,7 +355,7 @@ ccl_device_inline void object_motion_info(
/* Offset to an objects patch map */
-ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_patch_map_offset(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -315,7 +365,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
/* Volume step size */
-ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_density(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return 1.0f;
@@ -324,7 +374,7 @@ ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).volume_density;
}
-ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_step_size(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return kernel_data.background.volume_step_size;
@@ -335,14 +385,14 @@ ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
/* Pass ID for shader */
-ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
+ccl_device int shader_pass_id(const KernelGlobals *kg, const ShaderData *sd)
{
return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
}
/* Cryptomatte ID */
-ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -350,7 +400,7 @@ ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).cryptomatte_object;
}
-ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_asset_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -360,42 +410,42 @@ ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int objec
/* Particle data from which object was instanced */
-ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
+ccl_device_inline uint particle_index(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).index;
}
-ccl_device float particle_age(KernelGlobals *kg, int particle)
+ccl_device float particle_age(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).age;
}
-ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
+ccl_device float particle_lifetime(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).lifetime;
}
-ccl_device float particle_size(KernelGlobals *kg, int particle)
+ccl_device float particle_size(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).size;
}
-ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
+ccl_device float4 particle_rotation(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).rotation;
}
-ccl_device float3 particle_location(KernelGlobals *kg, int particle)
+ccl_device float3 particle_location(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).location);
}
-ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity);
}
-ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_angular_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity);
}
@@ -418,7 +468,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
/* Transform ray into object space to enter static object in BVH */
ccl_device_inline float bvh_instance_push(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir)
{
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -428,17 +478,18 @@ ccl_device_inline float bvh_instance_push(
*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit static object in BVH. */
-ccl_device_inline float bvh_instance_pop(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ccl_device_inline float bvh_instance_pop(const KernelGlobals *kg,
+ int object,
+ const Ray *ray,
+ float3 *P,
+ float3 *dir,
+ float3 *idir,
+ float t)
{
if (t != FLT_MAX) {
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -454,7 +505,7 @@ ccl_device_inline float bvh_instance_pop(
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -473,13 +524,12 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
#ifdef __OBJECT_MOTION__
/* Transform ray into object space to enter motion blurred object in BVH */
-ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
float3 *dir,
float3 *idir,
- float t,
Transform *itfm)
{
object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -490,16 +540,12 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit motion blurred object in BVH. */
-ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_pop(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -521,7 +567,7 @@ ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_motion_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -538,48 +584,11 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
#endif
-/* TODO(sergey): This is only for until we've got OpenCL 2.0
- * on all devices we consider supported. It'll be replaced with
- * generic address space.
- */
+/* TODO: This can be removed when we know if no devices will require explicit
+ * address space qualifiers for this case. */
-#ifdef __KERNEL_OPENCL__
-ccl_device_inline void object_position_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *P)
-{
- float3 private_P = *P;
- object_position_transform(kg, sd, &private_P);
- *P = private_P;
-}
-
-ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *D)
-{
- float3 private_D = *D;
- object_dir_transform(kg, sd, &private_D);
- *D = private_D;
-}
-
-ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *N)
-{
- float3 private_N = *N;
- object_normal_transform(kg, sd, &private_N);
- *N = private_N;
-}
-#endif
-
-#ifndef __KERNEL_OPENCL__
-# define object_position_transform_auto object_position_transform
-# define object_dir_transform_auto object_dir_transform
-# define object_normal_transform_auto object_normal_transform
-#else
-# define object_position_transform_auto object_position_transform_addrspace
-# define object_dir_transform_auto object_dir_transform_addrspace
-# define object_normal_transform_auto object_normal_transform_addrspace
-#endif
+#define object_position_transform_auto object_position_transform
+#define object_dir_transform_auto object_dir_transform
+#define object_normal_transform_auto object_normal_transform
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 9c1768f05db..ce0fc15f196 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -24,6 +24,8 @@
* language governing permissions and limitations under the Apache License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
typedef struct PatchHandle {
@@ -60,7 +62,7 @@ ccl_device_inline int patch_map_resolve_quadrant(float median, float *u, float *
/* retrieve PatchHandle from patch coords */
ccl_device_inline PatchHandle
-patch_map_find_patch(KernelGlobals *kg, int object, int patch, float u, float v)
+patch_map_find_patch(const KernelGlobals *kg, int object, int patch, float u, float v)
{
PatchHandle handle;
@@ -191,7 +193,7 @@ ccl_device_inline void patch_eval_normalize_coords(uint patch_bits, float *u, fl
/* retrieve patch control indices */
-ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
+ccl_device_inline int patch_eval_indices(const KernelGlobals *kg,
const PatchHandle *handle,
int channel,
int indices[PATCH_MAX_CONTROL_VERTS])
@@ -208,7 +210,7 @@ ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
/* evaluate patch basis functions */
-ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
+ccl_device_inline void patch_eval_basis(const KernelGlobals *kg,
const PatchHandle *handle,
float u,
float v,
@@ -247,7 +249,7 @@ ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
/* generic function for evaluating indices and weights from patch coords */
-ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
+ccl_device_inline int patch_eval_control_verts(const KernelGlobals *kg,
int object,
int patch,
float u,
@@ -269,7 +271,7 @@ ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
/* functions for evaluating attributes on patches */
-ccl_device float patch_eval_float(KernelGlobals *kg,
+ccl_device float patch_eval_float(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -306,7 +308,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg,
return val;
}
-ccl_device float2 patch_eval_float2(KernelGlobals *kg,
+ccl_device float2 patch_eval_float2(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -343,7 +345,7 @@ ccl_device float2 patch_eval_float2(KernelGlobals *kg,
return val;
}
-ccl_device float3 patch_eval_float3(KernelGlobals *kg,
+ccl_device float3 patch_eval_float3(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -380,7 +382,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_float4(KernelGlobals *kg,
+ccl_device float4 patch_eval_float4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -417,7 +419,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_uchar4(KernelGlobals *kg,
+ccl_device float4 patch_eval_uchar4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index aeb044c9ad3..ba31b12e817 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -19,6 +19,10 @@
* Generic functions to look up mesh, curve and volume primitive attributes for
* shading and render passes. */
+#pragma once
+
+#include "kernel/kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Surface Attributes
@@ -27,8 +31,11 @@ CCL_NAMESPACE_BEGIN
* attributes for performance, mainly for GPU performance to avoid bringing in
* heavy volume interpolation code. */
-ccl_device_inline float primitive_surface_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_inline float primitive_surface_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -50,7 +57,7 @@ ccl_device_inline float primitive_surface_attribute_float(
}
}
-ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
+ccl_device_inline float2 primitive_surface_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -76,7 +83,7 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_surface_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -102,11 +109,11 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_surface_attribute_float4(KernelGlobals *kg,
- const ShaderData *sd,
- const AttributeDescriptor desc,
- float4 *dx,
- float4 *dy)
+ccl_device_forceinline float4 primitive_surface_attribute_float4(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -141,7 +148,7 @@ ccl_device_inline bool primitive_is_volume_attribute(const ShaderData *sd,
return sd->type == PRIMITIVE_VOLUME;
}
-ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
+ccl_device_inline float primitive_volume_attribute_float(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -153,7 +160,7 @@ ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_volume_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -165,7 +172,7 @@ ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
+ccl_device_inline float4 primitive_volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -180,7 +187,7 @@ ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
/* Default UV coordinate */
-ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 primitive_uv(const KernelGlobals *kg, const ShaderData *sd)
{
const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV);
@@ -193,7 +200,7 @@ ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
/* Ptex coordinates */
-ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
+ccl_device bool primitive_ptex(const KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
{
/* storing ptex data as attributes is not memory efficient but simple for tests */
const AttributeDescriptor desc_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID);
@@ -213,7 +220,7 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
/* Surface tangent */
-ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 primitive_tangent(const KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
if (sd->type & PRIMITIVE_ALL_CURVE)
@@ -245,7 +252,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
/* Motion vector for motion pass */
-ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float4 primitive_motion_vector(const KernelGlobals *kg, const ShaderData *sd)
{
/* center position */
float3 center;
diff --git a/intern/cycles/kernel/geom/geom_shader_data.h b/intern/cycles/kernel/geom/geom_shader_data.h
new file mode 100644
index 00000000000..fb2cb5cb1ea
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_shader_data.h
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Functions to initialize ShaderData given.
+ *
+ * Could be from an incoming ray, intersection or sampled position. */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* ShaderData setup from incoming ray */
+
+#ifdef __OBJECT_MOTION__
+ccl_device void shader_setup_object_transforms(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ float time)
+{
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time);
+ sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion);
+ }
+}
+#endif
+
+/* TODO: break this up if it helps reduce register pressure to load data from
+ * global memory as we write it to shaderdata. */
+ccl_device_inline void shader_setup_from_ray(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray,
+ const Intersection *ccl_restrict isect)
+{
+ /* Read intersection data into shader globals.
+ *
+ * TODO: this is redundant, could potentially remove some of this from
+ * ShaderData but would need to ensure that it also works for shadow
+ * shader evaluation. */
+ sd->u = isect->u;
+ sd->v = isect->v;
+ sd->ray_length = isect->t;
+ sd->type = isect->type;
+ sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
+ isect->object;
+ sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
+ sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+ sd->lamp = LAMP_NONE;
+ sd->flag = 0;
+
+ /* Read matrices and time. */
+ sd->time = ray->time;
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, ray->time);
+#endif
+
+ /* Read ray data into shader globals. */
+ sd->I = -ray->D;
+
+#ifdef __HAIR__
+ if (sd->type & PRIMITIVE_ALL_CURVE) {
+ /* curve */
+ curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ }
+ else
+#endif
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* static triangle */
+ float3 Ng = triangle_normal(kg, sd);
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+
+ /* vectors */
+ sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ sd->Ng = Ng;
+ sd->N = Ng;
+
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL)
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+#endif
+ }
+ else {
+ /* motion triangle */
+ motion_triangle_shader_setup(
+ kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
+ }
+
+ sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+
+ if (isect->object != OBJECT_NONE) {
+ /* instance transform */
+ object_normal_transform_auto(kg, sd, &sd->N);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+#ifdef __DPDU__
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+#endif
+ }
+
+ /* backfacing test */
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ differential_transfer_compact(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, sd->ray_length);
+ differential_incoming_compact(&sd->dI, ray->D, ray->dD);
+ differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+#endif
+}
+
+/* ShaderData setup from position sampled on mesh */
+
+ccl_device_inline void shader_setup_from_sample(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 P,
+ const float3 Ng,
+ const float3 I,
+ int shader,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float t,
+ float time,
+ bool object_space,
+ int lamp)
+{
+ /* vectors */
+ sd->P = P;
+ sd->N = Ng;
+ sd->Ng = Ng;
+ sd->I = I;
+ sd->shader = shader;
+ if (prim != PRIM_NONE)
+ sd->type = PRIMITIVE_TRIANGLE;
+ else if (lamp != LAMP_NONE)
+ sd->type = PRIMITIVE_LAMP;
+ else
+ sd->type = PRIMITIVE_NONE;
+
+ /* primitive */
+ sd->object = object;
+ sd->lamp = LAMP_NONE;
+ /* Currently no access to bvh prim index for strand sd->prim. */
+ sd->prim = prim;
+ sd->u = u;
+ sd->v = v;
+ sd->time = time;
+ sd->ray_length = t;
+
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ if (sd->object != OBJECT_NONE) {
+ sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, time);
+#endif
+ }
+ else if (lamp != LAMP_NONE) {
+ sd->lamp = lamp;
+ }
+
+ /* transform into world space */
+ if (object_space) {
+ object_position_transform_auto(kg, sd, &sd->P);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+ sd->N = sd->Ng;
+ object_dir_transform_auto(kg, sd, &sd->I);
+ }
+
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL) {
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_normal_transform_auto(kg, sd, &sd->N);
+ }
+ }
+
+ /* dPdu/dPdv */
+#ifdef __DPDU__
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+ }
+#endif
+ }
+ else {
+#ifdef __DPDU__
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+ }
+
+ /* backfacing test */
+ if (sd->prim != PRIM_NONE) {
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* no ray differentials here yet */
+ sd->dP = differential3_zero();
+ sd->dI = differential3_zero();
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup for displacement */
+
+ccl_device void shader_setup_from_displace(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ int object,
+ int prim,
+ float u,
+ float v)
+{
+ float3 P, Ng, I = zero_float3();
+ int shader;
+
+ triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
+
+ /* force smooth shading for displacement */
+ shader |= SHADER_SMOOTH_NORMAL;
+
+ shader_setup_from_sample(
+ kg,
+ sd,
+ P,
+ Ng,
+ I,
+ shader,
+ object,
+ prim,
+ u,
+ v,
+ 0.0f,
+ 0.5f,
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
+ LAMP_NONE);
+}
+
+/* ShaderData setup from ray into background */
+
+ccl_device_inline void shader_setup_from_background(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 ray_P,
+ const float3 ray_D,
+ const float ray_time)
+{
+ /* for NDC coordinates */
+ sd->ray_P = ray_P;
+
+ /* vectors */
+ sd->P = ray_D;
+ sd->N = -ray_D;
+ sd->Ng = -ray_D;
+ sd->I = -ray_D;
+ sd->shader = kernel_data.background.surface_shader;
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ sd->time = ray_time;
+ sd->ray_length = 0.0f;
+
+ sd->object = OBJECT_NONE;
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO: ray->dP */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup from point inside volume */
+
+#ifdef __VOLUME__
+ccl_device_inline void shader_setup_from_volume(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray)
+{
+
+ /* vectors */
+ sd->P = ray->P;
+ sd->N = -ray->D;
+ sd->Ng = -ray->D;
+ sd->I = -ray->D;
+ sd->shader = SHADER_NONE;
+ sd->flag = 0;
+ sd->object_flag = 0;
+ sd->time = ray->time;
+ sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
+
+ sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->type = PRIMITIVE_VOLUME;
+
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+# ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+# endif
+
+# ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO ray->dD */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+# endif
+
+ /* for NDC coordinates */
+ sd->ray_P = ray->P;
+ sd->ray_dP = ray->dP;
+}
+#endif /* __VOLUME__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 9eceb996926..877b2ece15b 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -16,18 +16,20 @@
/* Functions for retrieving attributes on triangles produced from subdivision meshes */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Patch index for triangle, -1 if not subdivision triangle */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd)
{
return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
}
/* UV coords of triangle within patch */
-ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
+ccl_device_inline void subd_triangle_patch_uv(const KernelGlobals *kg,
const ShaderData *sd,
float2 uv[3])
{
@@ -40,7 +42,7 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
/* Vertex indices of patch */
-ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch)
+ccl_device_inline uint4 subd_triangle_patch_indices(const KernelGlobals *kg, int patch)
{
uint4 indices;
@@ -54,21 +56,23 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch
/* Originating face for patch */
-ccl_device_inline uint subd_triangle_patch_face(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_face(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 4);
}
/* Number of corners on originating face */
-ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_num_corners(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 5) & 0xffff;
}
/* Indices of the four corners that are used by the patch */
-ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, int corners[4])
+ccl_device_inline void subd_triangle_patch_corners(const KernelGlobals *kg,
+ int patch,
+ int corners[4])
{
uint4 data;
@@ -99,8 +103,11 @@ ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch,
/* Reading attributes on various subdivision triangle elements */
-ccl_device_noinline float subd_triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_noinline float subd_triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
int patch = subd_triangle_patch(kg, sd);
@@ -235,7 +242,7 @@ ccl_device_noinline float subd_triangle_attribute_float(
}
}
-ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
+ccl_device_noinline float2 subd_triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -378,7 +385,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
+ccl_device_noinline float3 subd_triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -520,7 +527,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg,
+ccl_device_noinline float4 subd_triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index ff7909ca425..910fb122c6d 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -20,10 +20,12 @@
* ray intersection we use a precomputed triangle storage to accelerate
* intersection at the cost of more memory usage */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Normal on triangle. */
-ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 triangle_normal(const KernelGlobals *kg, ShaderData *sd)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
@@ -41,8 +43,14 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
}
/* Point and normal on triangle. */
-ccl_device_inline void triangle_point_normal(
- KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+ccl_device_inline void triangle_point_normal(const KernelGlobals *kg,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float3 *P,
+ float3 *Ng,
+ int *shader)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -67,7 +75,7 @@ ccl_device_inline void triangle_point_normal(
/* Triangle vertex locations */
-ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
+ccl_device_inline void triangle_vertices(const KernelGlobals *kg, int prim, float3 P[3])
{
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w + 0));
@@ -77,7 +85,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Triangle vertex locations and vertex normals */
-ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
+ccl_device_inline void triangle_vertices_and_normals(const KernelGlobals *kg,
int prim,
float3 P[3],
float3 N[3])
@@ -94,7 +102,7 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
/* Interpolate smooth vertex normal from vertices */
ccl_device_inline float3
-triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
+triangle_smooth_normal(const KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -108,7 +116,7 @@ triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
}
ccl_device_inline float3 triangle_smooth_normal_unnormalized(
- KernelGlobals *kg, ShaderData *sd, float3 Ng, int prim, float u, float v)
+ const KernelGlobals *kg, const ShaderData *sd, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -130,7 +138,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
/* Ray differentials on triangle */
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
+ccl_device_inline void triangle_dPdudv(const KernelGlobals *kg,
int prim,
ccl_addr_space float3 *dPdu,
ccl_addr_space float3 *dPdv)
@@ -148,8 +156,11 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
/* Reading attributes on various triangle elements */
-ccl_device float triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION | ATTR_ELEMENT_CORNER)) {
float f0, f1, f2;
@@ -195,7 +206,7 @@ ccl_device float triangle_attribute_float(
}
}
-ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
+ccl_device float2 triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -245,7 +256,7 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
+ccl_device float3 triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -295,7 +306,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 triangle_attribute_float4(KernelGlobals *kg,
+ccl_device float4 triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index b0cce274b94..30b77ebd2eb 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -20,12 +20,17 @@
* intersection at the cost of more memory usage.
*/
+#pragma once
+
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
uint visibility,
int object,
int prim_addr)
@@ -41,7 +46,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
ssef_verts,
#else
@@ -78,7 +83,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
@@ -192,25 +197,20 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
* http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf
*/
-ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -219,7 +219,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * t;
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -239,13 +239,8 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * rt;
}
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -255,28 +250,23 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space for
+/* Same as above, except that t is assumed to be in object space for
* instancing.
*/
-ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
#ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return triangle_refine(kg, sd, isect, ray);
+ /* t is always in world space with OptiX. */
+ return triangle_refine(kg, sd, P, D, t, isect_object, isect_prim);
#else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -286,7 +276,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
P = P + D * t;
# ifdef __INTERSECTION_REFINE__
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -307,13 +297,8 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
}
# endif /* __INTERSECTION_REFINE__ */
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 809b76245ba..2bcd7e56b5f 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -23,13 +23,15 @@
* 3D voxel textures can be assigned as attributes per mesh, which means the
* same shader can be used for volume objects with different densities, etc. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__
/* Return position normalized to 0..1 in mesh bounds */
-ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
+ccl_device_inline float3 volume_normalized_position(const KernelGlobals *kg,
const ShaderData *sd,
float3 P)
{
@@ -68,7 +70,7 @@ ccl_device float3 volume_attribute_value_to_float3(const float4 value)
}
}
-ccl_device float4 volume_attribute_float4(KernelGlobals *kg,
+ccl_device float4 volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
new file mode 100644
index 00000000000..4898ff936c6
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/geom/geom.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* This helps with AA but it's not the real solution as it does not AA the geometry
+ * but it's better than nothing, thus committed. */
+ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
+{
+ /* use mirror repeat (like opengl texture) so that if the barycentric
+ * coordinate goes past the end of the triangle it is not always clamped
+ * to the same value, gives ugly patterns */
+ u /= max;
+ float fu = floorf(u);
+ u = u - fu;
+
+ return ((((int)fu) & 1) ? 1.0f - u : u) * max;
+}
+
+/* Return false to indicate that this pixel is finished.
+ * Used by CPU implementation to not attempt to sample pixel for multiple samples once its known
+ * that the pixel did converge. */
+ccl_device bool integrator_init_from_bake(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Always count the sample, even if the camera sample will reject the ray. */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Setup render buffers. */
+ const int index = INTEGRATOR_STATE(path, render_pixel_index);
+ const int pass_stride = kernel_data.film.pass_stride;
+ render_buffer += index * pass_stride;
+
+ ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive;
+ ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential;
+
+ const int seed = __float_as_uint(primitive[0]);
+ int prim = __float_as_uint(primitive[1]);
+ if (prim == -1) {
+ return false;
+ }
+
+ prim += kernel_data.bake.tri_offset;
+
+ /* Random number generator. */
+ const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
+
+ float filter_x, filter_y;
+ if (sample == 0) {
+ filter_x = filter_y = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Barycentric UV with sub-pixel offset. */
+ float u = primitive[2];
+ float v = primitive[3];
+
+ float dudx = differential[0];
+ float dudy = differential[1];
+ float dvdx = differential[2];
+ float dvdy = differential[3];
+
+ if (sample > 0) {
+ u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
+ v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
+ 1.0f - u);
+ }
+
+ /* Position and normal on triangle. */
+ float3 P, Ng;
+ int shader;
+ triangle_point_normal(kg, kernel_data.bake.object_index, prim, u, v, &P, &Ng, &shader);
+ shader &= SHADER_MASK;
+
+ if (kernel_data.film.pass_background != PASS_UNUSED) {
+ /* Environment baking. */
+
+ /* Setup and write ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = zero_float3();
+ ray.D = normalize(P);
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup next kernel to execute. */
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else {
+ /* Surface baking. */
+
+ /* Setup ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = P + Ng;
+ ray.D = -Ng;
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+
+ /* Setup differentials. */
+ float3 dPdu, dPdv;
+ triangle_dPdudv(kg, prim, &dPdu, &dPdv);
+ differential3 dP;
+ dP.dx = dPdu * dudx + dPdv * dvdx;
+ dP.dy = dPdu * dudy + dPdv * dvdy;
+ ray.dP = differential_make_compact(dP);
+ ray.dD = differential_zero_compact();
+
+ /* Write ray. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup and write intersection. */
+ Intersection isect ccl_optional_struct_init;
+ isect.object = kernel_data.bake.object_index;
+ isect.prim = prim;
+ isect.u = u;
+ isect.v = v;
+ isect.t = 1.0f;
+ isect.type = PRIMITIVE_TRIANGLE;
+#ifdef __EMBREE__
+ isect.Ng = Ng;
+#endif
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Setup next kernel to execute. */
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_camera.h b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
new file mode 100644
index 00000000000..58e7bde4c94
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_camera_sample(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y,
+ const uint rng_hash,
+ Ray *ray)
+{
+ /* Filter sampling. */
+ float filter_u, filter_v;
+
+ if (sample == 0) {
+ filter_u = 0.5f;
+ filter_v = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v);
+ }
+
+ /* Depth of field sampling. */
+ float lens_u = 0.0f, lens_v = 0.0f;
+ if (kernel_data.cam.aperturesize > 0.0f) {
+ path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v);
+ }
+
+ /* Motion blur time sampling. */
+ float time = 0.0f;
+#ifdef __CAMERA_MOTION__
+ if (kernel_data.cam.shuttertime != -1.0f)
+ time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME);
+#endif
+
+ /* Generate camera ray. */
+ camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+/* Return false to indicate that this pixel is finished.
+ * Used by the CPU implementation to avoid sampling a pixel for further samples once it is known
+ * that the pixel has converged. */
+ccl_device bool integrator_init_from_camera(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Count the sample and get an effective sample for this pixel.
+ *
+   * This logic allows us to both count the actual number of samples per pixel, and to add samples to this
+ * pixel after it was converged and samples were added somewhere else (in which case the
+ * `scheduled_sample` will be different from actual number of samples in this pixel). */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Initialize random number seed for path. */
+ const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
+
+ {
+ /* Generate camera ray. */
+ Ray ray;
+ integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
+ if (ray.t == 0.0f) {
+ return true;
+ }
+
+ /* Write camera ray to state. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Continue with intersect_closest kernel, optionally initializing volume
+ * stack before that if the camera may be inside a volume. */
+ if (kernel_data.cam.is_inside_volume) {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+ }
+ else {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
new file mode 100644
index 00000000000..34ca6814534
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/bvh/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+template<uint32_t current_kernel>
+ccl_device_forceinline bool integrator_intersect_terminate(INTEGRATOR_STATE_ARGS,
+ const int shader_flags)
+{
+
+ /* Optional AO bounce termination.
+ * We continue evaluating emissive/transparent surfaces and volumes, similar
+ * to direct lighting. Only if we know there are none can we terminate the
+ * path immediately. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ if (shader_flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* We perform path termination in this kernel to avoid launching shade_surface
+ * and evaluating the shader when not needed. Only for emission and transparent
+ * surfaces in front of emission do we need to evaluate the shader, since we
+ * perform MIS as part of indirect rays. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = path_state_continuation_probability(INTEGRATOR_STATE_PASS, path_flag);
+
+ if (probability != 1.0f) {
+ const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
+
+ if (probability == 0.0f || terminate >= probability) {
+ if (shader_flags & SD_HAS_EMISSION) {
+ /* Mark path to be terminated right after shader evaluation on the surface. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ /* TODO: only do this for emissive volumes. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IN_NEXT_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/* Note that current_kernel is a template value since making this a variable
+ * leads to poor performance with CUDA atomics. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_shader_next_kernel(
+ INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect,
+ const int shader,
+ const int shader_flags)
+{
+ /* Note on scheduling.
+ *
+ * When there is no shadow catcher split the scheduling is simple: schedule surface shading with
+ * or without raytrace support, depending on the shader used.
+ *
+ * When there is a shadow catcher split the general idea is to have the following configuration:
+ *
+ * - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
+ * will trace shadow catcher object.
+ *
+ * - When no alpha-over of approximate shadow catcher is needed, schedule surface shading for
+ * the matte ray.
+ *
+ * - Otherwise schedule background shading kernel, so that we have a background to alpha-over
+ * on. The background kernel will then schedule surface shading for the matte ray.
+ *
+ * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
+ * the matte path. */
+
+ const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
+ (kernel_data.film.pass_ao != PASS_UNUSED));
+
+ if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_NEXT_SORTED(
+ current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+
+#ifdef __SHADOW_CATCHER__
+ const int object_flags = intersection_get_object_flags(kg, isect);
+ if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags)) {
+ if (kernel_data.film.use_approximate_shadow_catcher && !kernel_data.background.transparent) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+
+ if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ }
+ else if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+ }
+#endif
+}
+
+ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+ kernel_assert(ray.t != 0.0f);
+
+ const uint visibility = path_state_ray_visibility(INTEGRATOR_STATE_PASS);
+ const int last_isect_prim = INTEGRATOR_STATE(isect, prim);
+ const int last_isect_object = INTEGRATOR_STATE(isect, object);
+
+ /* Trick to use short AO rays to approximate indirect light at the end of the path. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+
+ const int last_object = last_isect_object != OBJECT_NONE ?
+ last_isect_object :
+ kernel_tex_fetch(__prim_object, last_isect_prim);
+ const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
+ if (object_ao_distance != 0.0f) {
+ ray.t = object_ao_distance;
+ }
+ }
+
+ /* Scene Intersection. */
+ Intersection isect ccl_optional_struct_init;
+ bool hit = scene_intersect(kg, &ray, visibility, &isect);
+
+ /* TODO: remove this and do it in the various intersection functions instead. */
+ if (!hit) {
+ isect.prim = PRIM_NONE;
+ }
+
+ /* Light intersection for MIS. */
+ if (kernel_data.integrator.use_lamp_mis) {
+ /* NOTE: if we make lights visible to camera rays, we'll need to initialize
+ * these in the path_state_init. */
+ const int last_type = INTEGRATOR_STATE(isect, type);
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ hit = lights_intersect(
+ kg, &ray, &isect, last_isect_prim, last_isect_object, last_type, path_flag) ||
+ hit;
+ }
+
+ /* Write intersection result into global integrator state memory. */
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+#ifdef __VOLUME__
+ if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
+ const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
+ const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ /* Continue with volume kernel if we are inside a volume, regardless
+ * if we hit anything. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+ return;
+ }
+#endif
+
+ if (hit) {
+ /* Hit a surface, continue with light or surface kernel. */
+ if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ }
+ }
+ else {
+ /* Nothing hit, continue with background kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
new file mode 100644
index 00000000000..5bd9cfda4a4
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Visibility for the shadow ray. */
+ccl_device_forceinline uint integrate_intersect_shadow_visibility(INTEGRATOR_STATE_CONST_ARGS)
+{
+ uint visibility = PATH_RAY_SHADOW;
+
+#ifdef __SHADOW_CATCHER__
+ const uint32_t path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+#endif
+
+ return visibility;
+}
+
+ccl_device bool integrate_intersect_shadow_opaque(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ /* Mask which will pick only opaque visibility bits from the `visibility`.
+ * Calculate the mask at compile time: the visibility will either be a high bits for the shadow
+ * catcher objects, or lower bits for the regular objects (there is no need to check the path
+ * state here again). */
+ constexpr const uint opaque_mask = SHADOW_CATCHER_VISIBILITY_SHIFT(PATH_RAY_SHADOW_OPAQUE) |
+ PATH_RAY_SHADOW_OPAQUE;
+
+ Intersection isect;
+ const bool opaque_hit = scene_intersect(kg, ray, visibility & opaque_mask, &isect);
+
+ if (!opaque_hit) {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+
+ccl_device_forceinline int integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_CONST_ARGS)
+{
+ const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+ const int transparent_bounce = INTEGRATOR_STATE(shadow_path, transparent_bounce);
+
+ return max(transparent_max_bounce - transparent_bounce - 1, 0);
+}
+
+#ifdef __TRANSPARENT_SHADOWS__
+ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE];
+
+  /* Limit the number of hits to the max transparent bounces allowed and the size that we
+ * have available in the integrator state. */
+ const uint max_transparent_hits = integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_PASS);
+ const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE);
+ uint num_hits = 0;
+ bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits);
+
+ /* If number of hits exceed the transparent bounces limit, make opaque. */
+ if (num_hits > max_transparent_hits) {
+ opaque_hit = true;
+ }
+
+ if (!opaque_hit) {
+ uint num_recorded_hits = min(num_hits, max_hits);
+
+ if (num_recorded_hits > 0) {
+ sort_intersections(isect, num_recorded_hits);
+
+ /* Write intersection result into global integrator state memory. */
+ for (int hit = 0; hit < num_recorded_hits; hit++) {
+ integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit);
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = num_hits;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+#endif
+
+ccl_device void integrator_intersect_shadow(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Compute visibility. */
+ const uint visibility = integrate_intersect_shadow_visibility(INTEGRATOR_STATE_PASS);
+
+#ifdef __TRANSPARENT_SHADOWS__
+ /* TODO: compile different kernels depending on this? Especially for OptiX
+ * conditional trace calls are bad. */
+ const bool opaque_hit =
+ (kernel_data.integrator.transparent_shadows) ?
+ integrate_intersect_shadow_transparent(INTEGRATOR_STATE_PASS, &ray, visibility) :
+ integrate_intersect_shadow_opaque(INTEGRATOR_STATE_PASS, &ray, visibility);
+#else
+ const bool opaque_hit = integrate_intersect_shadow_opaque(
+ INTEGRATOR_STATE_PASS, &ray, visibility);
+#endif
+
+ if (opaque_hit) {
+ /* Hit an opaque surface, shadow path ends here. */
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return;
+ }
+ else {
+ /* Hit nothing or transparent surfaces, continue to shadow kernel
+ * for shading and render buffer output.
+ *
+ * TODO: could also write to render buffer directly if no transparent shadows?
+ * Could save a kernel execution for the common case. */
+ INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
index c10ecc426c6..7c090952dc7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
+++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,16 +14,23 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
+#pragma once
-__kernel void kernel_ocl_path_trace_state_buffer_size(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- uint num_threads,
- ccl_global uint64_t *size)
+#include "kernel/integrator/integrator_subsurface.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS)
{
- ((KernelGlobals*)kg)->data = data;
- *size = split_data_buffer_size((KernelGlobals*)kg, num_threads);
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SUBSURFACE);
+
+#ifdef __SUBSURFACE__
+ if (subsurface_scatter(INTEGRATOR_STATE_PASS)) {
+ return;
+ }
+#endif
+
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
new file mode 100644
index 00000000000..60d8a8e3e54
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_ARGS,
+ const float3 from_P,
+ const float3 to_P)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ kernel_assert(kernel_data.integrator.use_volumes);
+
+ Ray volume_ray ccl_optional_struct_init;
+ volume_ray.P = from_P;
+ volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
+ if (num_hits > 0) {
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+ }
+ }
+#else
+ Intersection isect;
+ int step = 0;
+ while (step < 2 * VOLUME_STACK_SIZE &&
+ scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ if (volume_ray.t != FLT_MAX) {
+ volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t);
+ }
+ ++step;
+ }
+#endif
+}
+
+ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ Ray volume_ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &volume_ray);
+ volume_ray.t = FLT_MAX;
+
+ const uint visibility = (INTEGRATOR_STATE(path, flag) & PATH_RAY_ALL_VISIBILITY);
+ int stack_index = 0, enclosed_index = 0;
+
+ /* Write background shader. */
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ stack_index++;
+ }
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
+ if (num_hits > 0) {
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+          /* If the ray exited a volume that it never entered, the
+           * camera must be inside that volume.
+           */
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+        /* If the camera ray enters a volume, that volume shouldn't
+         * be added to the stack on exit.
+         */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+ }
+ }
+#else
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ int step = 0;
+
+ while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
+ step < 2 * VOLUME_STACK_SIZE) {
+ Intersection isect;
+ if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
+ break;
+ }
+
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+      /* The ray exited a volume; check whether it ever entered it.
+       * If not, the camera must be inside that volume.
+       */
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+        /* If the ray exited a volume that it never entered, the
+         * camera must be inside that volume.
+         */
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+      /* If the camera ray enters a volume, that volume shouldn't
+       * be added to the stack on exit.
+       */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ ++step;
+ }
+#endif
+
+ /* Write terminator. */
+ const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
new file mode 100644
index 00000000000..91363ea1c7f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_megakernel(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Each kernel indicates the next kernel to execute, so here we simply
+ * have to check what that kernel is and execute it.
+ *
+ * TODO: investigate if we can use device side enqueue for GPUs to avoid
+ * having to compile this big kernel. */
+ while (true) {
+ if (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ /* First handle any shadow paths before we potentially create more shadow paths. */
+ switch (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ integrator_intersect_shadow(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ integrator_shade_shadow(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else if (INTEGRATOR_STATE(path, queued_kernel)) {
+ /* Then handle regular path kernels. */
+ switch (INTEGRATOR_STATE(path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ integrator_intersect_closest(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ integrator_shade_background(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ integrator_shade_surface(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ integrator_shade_volume(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ integrator_shade_surface_raytrace(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ integrator_shade_light(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ integrator_intersect_subsurface(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ integrator_intersect_volume_stack(INTEGRATOR_STATE_PASS);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else {
+ break;
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
new file mode 100644
index 00000000000..3e4cc837e9b
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
/* Evaluate the background shader for a ray that escaped the scene and return
 * its radiance, with the background MIS weight applied where enabled.
 *
 * Returns zero when the background is excluded by visibility flags, and a
 * constant grey when compiled without __BACKGROUND__ support. */
ccl_device float3 integrator_eval_background_shader(INTEGRATOR_STATE_ARGS,
                                                    ccl_global float *ccl_restrict render_buffer)
{
#ifdef __BACKGROUND__
  const int shader = kernel_data.background.surface_shader;
  const uint32_t path_flag = INTEGRATOR_STATE(path, flag);

  /* Use visibility flag to skip lights. */
  if (shader & SHADER_EXCLUDE_ANY) {
    if (((shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
        ((shader & SHADER_EXCLUDE_GLOSSY) && ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
                                              (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
        ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
        ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
        ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
      return zero_float3();
  }

  /* Use fast constant background color if available. */
  float3 L = zero_float3();
  if (!shader_constant_emission_eval(kg, shader, &L)) {
    /* Evaluate background shader. */

    /* TODO: does aliasing like this break automatic SoA in CUDA?
     * Should we instead store closures separate from ShaderData? */
    ShaderDataTinyStorage emission_sd_storage;
    ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);

    PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
    shader_setup_from_background(kg,
                                 emission_sd,
                                 INTEGRATOR_STATE(ray, P),
                                 INTEGRATOR_STATE(ray, D),
                                 INTEGRATOR_STATE(ray, time));

    PROFILING_SHADER(emission_sd->object, emission_sd->shader);
    PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
        INTEGRATOR_STATE_PASS, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);

    L = shader_background_eval(emission_sd);
  }

  /* Background MIS weights. */
# ifdef __BACKGROUND_MIS__
  /* Check if background light exists or if we should skip pdf. */
  if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
    const float3 ray_P = INTEGRATOR_STATE(ray, P);
    const float3 ray_D = INTEGRATOR_STATE(ray, D);
    const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
    const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);

    /* Multiple importance sampling: get background light pdf for the ray
     * direction, and compute weight with respect to the BSDF pdf. The ray
     * origin is shifted back by mis_ray_t to where the BSDF was sampled, so
     * the pdf matches the recorded BSDF pdf. */
    const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
    const float mis_weight = power_heuristic(mis_ray_pdf, pdf);

    L *= mis_weight;
  }
# endif

  return L;
#else
  /* No background support compiled in: constant light grey. */
  return make_float3(0.8f, 0.8f, 0.8f);
#endif
}
+
/* Shade the background for a ray that left the scene and accumulate it into
 * the render buffer, handling transparent film. */
ccl_device_inline void integrate_background(INTEGRATOR_STATE_ARGS,
                                            ccl_global float *ccl_restrict render_buffer)
{
  /* Accumulate transparency for transparent background. We can skip background
   * shader evaluation unless a background pass is used. */
  bool eval_background = true;
  float transparent = 0.0f;

  const bool is_transparent_background_ray = kernel_data.background.transparent &&
                                             (INTEGRATOR_STATE(path, flag) &
                                              PATH_RAY_TRANSPARENT_BACKGROUND);

  if (is_transparent_background_ray) {
    transparent = average(INTEGRATOR_STATE(path, throughput));

#ifdef __PASSES__
    /* Still evaluate the shader when a background light pass wants the color. */
    eval_background = (kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND));
#else
    eval_background = false;
#endif
  }

  /* Evaluate background shader. */
  float3 L = (eval_background) ?
                 integrator_eval_background_shader(INTEGRATOR_STATE_PASS, render_buffer) :
                 zero_float3();

  /* When using the ao bounces approximation, adjust background
   * shader intensity with ao factor. */
  if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
    L *= kernel_data.integrator.ao_bounces_factor;
  }

  /* Write to render buffer. */
  kernel_accum_background(
      INTEGRATOR_STATE_PASS, L, transparent, is_transparent_background_ray, render_buffer);
}
+
+ccl_device_inline void integrate_distant_lights(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+ LightSample ls ccl_optional_struct_init;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ if (light_sample_from_distant_ray(kg, ray_D, lamp, &ls)) {
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* multiple importance sampling, get regular light pdf,
+ * and compute weight with respect to BSDF pdf */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+ }
+ }
+}
+
/* Kernel entry point: shade a ray that escaped the scene — accumulate distant
 * lights and background, then terminate or requeue the path. */
ccl_device void integrator_shade_background(INTEGRATOR_STATE_ARGS,
                                            ccl_global float *ccl_restrict render_buffer)
{
  PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);

  /* TODO: unify these in a single loop to only have a single shader evaluation call. */
  integrate_distant_lights(INTEGRATOR_STATE_PASS, render_buffer);
  integrate_background(INTEGRATOR_STATE_PASS, render_buffer);

#ifdef __SHADOW_CATCHER__
  /* Shadow catcher: this path still has surface shading to do; clear the flag
   * and requeue it for the appropriate surface kernel instead of terminating. */
  if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
    INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;

    const int isect_prim = INTEGRATOR_STATE(isect, prim);
    const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim);
    const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;

    /* The raytrace variant is needed when the shader raytraces or an AO pass
     * is active. */
    if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                  shader);
    }
    else {
      INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
                                  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
                                  shader);
    }
    return;
  }
#endif

  INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_light.h b/intern/cycles/kernel/integrator/integrator_shade_light.h
new file mode 100644
index 00000000000..05b530f9665
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_light.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_light(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Setup light sample. */
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ float3 ray_P = INTEGRATOR_STATE(ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+
+ /* Advance ray beyond light. */
+ /* TODO: can we make this more numerically robust to avoid reintersecting the
+ * same light in some cases? */
+ const float3 new_ray_P = ray_offset(ray_P + ray_D * isect.t, ray_D);
+ INTEGRATOR_STATE_WRITE(ray, P) = new_ray_P;
+ INTEGRATOR_STATE_WRITE(ray, t) -= isect.t;
+
+ /* Set position to where the BSDF was sampled, for correct MIS PDF. */
+ const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);
+ ray_P -= ray_D * mis_ray_t;
+ isect.t += mis_ray_t;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = mis_ray_t + isect.t;
+
+ LightSample ls ccl_optional_struct_init;
+ const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls);
+
+ if (!use_light_sample) {
+ return;
+ }
+
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* multiple importance sampling, get regular light pdf,
+ * and compute weight with respect to BSDF pdf */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+}
+
/* Kernel entry point: shade a light hit, then continue the ray past the light
 * (counted as a transparent bounce) or terminate at the bounce limit. */
ccl_device void integrator_shade_light(INTEGRATOR_STATE_ARGS,
                                       ccl_global float *ccl_restrict render_buffer)
{
  PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);

  integrate_light(INTEGRATOR_STATE_PASS, render_buffer);

  /* TODO: we could get stuck in an infinite loop if there are precision issues
   * and the same light is hit again.
   *
   * As a workaround count this as a transparent bounce. It makes some sense
   * to interpret lights as transparent surfaces (and support making them opaque),
   * but this needs to be revisited. */
  uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
  INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;

  if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
    INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
    return;
  }
  else {
    /* Continue the ray to find hits beyond the light. */
    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
    return;
  }

  /* TODO: in some cases we could continue directly to SHADE_BACKGROUND, but
   * that optimization is probably not practical if we add lights to
   * scene geometry. */
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
new file mode 100644
index 00000000000..fd3c3ae1653
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_shade_volume.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
/* True when the shadow ray filled the intersection record to capacity and may
 * still have more transparent hits left to find. */
ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits)
{
  return num_hits >= INTEGRATOR_SHADOW_ISECT_SIZE;
}
+
+#ifdef __TRANSPARENT_SHADOWS__
/* Evaluate the surface shader at recorded shadow intersection `hit` and return
 * its transparency, so light can pass through transparent surfaces. Also
 * updates the shadow volume stack on entry/exit. */
ccl_device_inline float3 integrate_transparent_surface_shadow(INTEGRATOR_STATE_ARGS, const int hit)
{
  PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);

  /* TODO: does aliasing like this break automatic SoA in CUDA?
   * Should we instead store closures separate from ShaderData?
   *
   * TODO: is it better to declare this outside the loop or keep it local
   * so the compiler can see there is no dependency between iterations? */
  ShaderDataTinyStorage shadow_sd_storage;
  ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);

  /* Setup shader data at surface. */
  Intersection isect ccl_optional_struct_init;
  integrator_state_read_shadow_isect(INTEGRATOR_STATE_PASS, &isect, hit);

  Ray ray ccl_optional_struct_init;
  integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);

  shader_setup_from_ray(kg, shadow_sd, &ray, &isect);

  /* Evaluate shader (skipped for volume-only surfaces, which have no surface
   * closures to contribute transparency). */
  if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
    shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
        INTEGRATOR_STATE_PASS, shadow_sd, NULL, PATH_RAY_SHADOW);
  }

# ifdef __VOLUME__
  /* Exit/enter volume. */
  shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, shadow_sd);
# endif

  /* Compute transparency from closures. */
  return shader_bsdf_transparency(kg, shadow_sd);
}
+
+# ifdef __VOLUME__
/* Attenuate `throughput` by heterogeneous volume shadowing along the shadow
 * ray segment between recorded hits `hit - 1` and `hit` (or to the ray end for
 * the final segment). */
ccl_device_inline void integrate_transparent_volume_shadow(INTEGRATOR_STATE_ARGS,
                                                           const int hit,
                                                           const int num_recorded_hits,
                                                           float3 *ccl_restrict throughput)
{
  PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME);

  /* TODO: deduplicate with surface, or does it not matter for memory usage? */
  ShaderDataTinyStorage shadow_sd_storage;
  ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);

  /* Setup shader data. */
  Ray ray ccl_optional_struct_init;
  integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);

  /* Modify ray position and length to match current segment. */
  const float start_t = (hit == 0) ? 0.0f : INTEGRATOR_STATE_ARRAY(shadow_isect, hit - 1, t);
  const float end_t = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(shadow_isect, hit, t) :
                                                  ray.t;
  ray.P += start_t * ray.D;
  ray.t = end_t - start_t;

  shader_setup_from_volume(kg, shadow_sd, &ray);

  /* Step size for ray marching, derived from the shadow volume stack entries. */
  const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
    return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
  });

  volume_shadow_heterogeneous(INTEGRATOR_STATE_PASS, &ray, shadow_sd, throughput, step_size);
}
+# endif
+
/* Walk the recorded shadow intersections, multiplying surface and volume
 * transparency into the shadow path throughput. Returns true when the shadow
 * ray is fully blocked (throughput reached zero). */
ccl_device_inline bool integrate_transparent_shadow(INTEGRATOR_STATE_ARGS, const int num_hits)
{
  /* Accumulate shadow for transparent surfaces. */
  const int num_recorded_hits = min(num_hits, INTEGRATOR_SHADOW_ISECT_SIZE);

  /* One extra iteration handles the volume segment beyond the last hit. */
  for (int hit = 0; hit < num_recorded_hits + 1; hit++) {
    /* Volume shaders. */
    if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) {
# ifdef __VOLUME__
      if (!integrator_state_shadow_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
        float3 throughput = INTEGRATOR_STATE(shadow_path, throughput);
        integrate_transparent_volume_shadow(
            INTEGRATOR_STATE_PASS, hit, num_recorded_hits, &throughput);
        if (is_zero(throughput)) {
          return true;
        }

        INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
      }
# endif
    }

    /* Surface shaders. */
    if (hit < num_recorded_hits) {
      const float3 shadow = integrate_transparent_surface_shadow(INTEGRATOR_STATE_PASS, hit);
      const float3 throughput = INTEGRATOR_STATE(shadow_path, throughput) * shadow;
      if (is_zero(throughput)) {
        return true;
      }

      INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
      INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) += 1;
    }

    /* Note we do not need to check max_transparent_bounce here, the number
     * of intersections is already limited and made opaque in the
     * INTERSECT_SHADOW kernel. */
  }

  if (shadow_intersections_has_remaining(num_hits)) {
    /* There are more hits that we could not record due to memory usage,
     * adjust ray to intersect again from the last hit. */
    const float last_hit_t = INTEGRATOR_STATE_ARRAY(shadow_isect, num_recorded_hits - 1, t);
    const float3 ray_P = INTEGRATOR_STATE(shadow_ray, P);
    const float3 ray_D = INTEGRATOR_STATE(shadow_ray, D);
    INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray_offset(ray_P + last_hit_t * ray_D, ray_D);
    INTEGRATOR_STATE_WRITE(shadow_ray, t) -= last_hit_t;
  }

  return false;
}
+#endif /* __TRANSPARENT_SHADOWS__ */
+
/* Kernel entry point: resolve a traced shadow ray — apply transparent
 * shadowing, then either continue intersecting, accumulate the light, or
 * terminate the shadow path. */
ccl_device void integrator_shade_shadow(INTEGRATOR_STATE_ARGS,
                                        ccl_global float *ccl_restrict render_buffer)
{
  PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP);
  const int num_hits = INTEGRATOR_STATE(shadow_path, num_hits);

#ifdef __TRANSPARENT_SHADOWS__
  /* Evaluate transparent shadows. */
  const bool opaque = integrate_transparent_shadow(INTEGRATOR_STATE_PASS, num_hits);
  if (opaque) {
    /* Fully blocked: no light reaches the shading point. */
    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
    return;
  }
#endif

  if (shadow_intersections_has_remaining(num_hits)) {
    /* More intersections to find, continue shadow ray. */
    INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
    return;
  }
  else {
    /* Unoccluded (possibly attenuated): accumulate the light contribution. */
    kernel_accum_light(INTEGRATOR_STATE_PASS, render_buffer);
    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
    return;
  }
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
new file mode 100644
index 00000000000..73b7cad32be
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_subsurface.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
/* Read the ray and its intersection from path state and initialize ShaderData
 * for surface shading. */
ccl_device_forceinline void integrate_surface_shader_setup(INTEGRATOR_STATE_CONST_ARGS,
                                                           ShaderData *sd)
{
  Intersection isect ccl_optional_struct_init;
  integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);

  Ray ray ccl_optional_struct_init;
  integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);

  shader_setup_from_ray(kg, sd, &ray, &isect);
}
+
+#ifdef __HOLDOUT__
/* Write holdout transparency to the render buffer. Returns false when the
 * surface is a full holdout and the path should terminate. */
ccl_device_forceinline bool integrate_surface_holdout(INTEGRATOR_STATE_CONST_ARGS,
                                                      ShaderData *sd,
                                                      ccl_global float *ccl_restrict render_buffer)
{
  const uint32_t path_flag = INTEGRATOR_STATE(path, flag);

  if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
      (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
    const float3 holdout_weight = shader_holdout_apply(kg, sd);
    if (kernel_data.background.transparent) {
      const float3 throughput = INTEGRATOR_STATE(path, throughput);
      const float transparent = average(holdout_weight * throughput);
      kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
    }
    /* Fully holdout: nothing more for this path to contribute. */
    if (isequal_float3(holdout_weight, one_float3())) {
      return false;
    }
  }

  return true;
}
+#endif /* __HOLDOUT__ */
+
+#ifdef __EMISSION__
/* Accumulate emission from an emissive surface closure, MIS-weighted against
 * the BSDF pdf of the bounce that reached it. */
ccl_device_forceinline void integrate_surface_emission(INTEGRATOR_STATE_CONST_ARGS,
                                                       const ShaderData *sd,
                                                       ccl_global float *ccl_restrict
                                                           render_buffer)
{
  const uint32_t path_flag = INTEGRATOR_STATE(path, flag);

  /* Evaluate emissive closure. */
  float3 L = shader_emissive_eval(sd);

  /* With hair support, MIS is restricted to triangle primitives since
   * triangle_light_pdf below only handles those. */
# ifdef __HAIR__
  if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
      (sd->type & PRIMITIVE_ALL_TRIANGLE))
# else
  if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
# endif
  {
    const float bsdf_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
    const float t = sd->ray_length + INTEGRATOR_STATE(path, mis_ray_t);

    /* Multiple importance sampling, get triangle light pdf,
     * and compute weight with respect to BSDF pdf. */
    float pdf = triangle_light_pdf(kg, sd, t);
    float mis_weight = power_heuristic(bsdf_pdf, pdf);

    L *= mis_weight;
  }

  const float3 throughput = INTEGRATOR_STATE(path, throughput);
  kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, L, render_buffer);
}
+#endif /* __EMISSION__ */
+
+#ifdef __EMISSION__
/* Path tracing: sample point on light and evaluate light shader, then
 * queue shadow ray to be traced.
 *
 * Writes the shadow ray and a copy of the relevant path state into the shadow
 * path, and branches off the INTERSECT_SHADOW kernel; contributes nothing when
 * the sample is rejected. */
ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS,
                                                           ShaderData *sd,
                                                           const RNGState *rng_state)
{
  /* Test if there is a light or BSDF that needs direct light. */
  if (!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) {
    return;
  }

  /* Sample position on a light. */
  LightSample ls ccl_optional_struct_init;
  {
    const int path_flag = INTEGRATOR_STATE(path, flag);
    const uint bounce = INTEGRATOR_STATE(path, bounce);
    float light_u, light_v;
    path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);

    if (!light_distribution_sample_from_position(
            kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) {
      return;
    }
  }

  kernel_assert(ls.pdf != 0.0f);

  /* Evaluate light shader.
   *
   * TODO: can we reuse sd memory? In theory we can move this after
   * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
   * the light shader. This could also move to its own kernel, for
   * non-constant light sources. */
  ShaderDataTinyStorage emission_sd_storage;
  ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
  const float3 light_eval = light_sample_shader_eval(
      INTEGRATOR_STATE_PASS, emission_sd, &ls, sd->time);
  if (is_zero(light_eval)) {
    return;
  }

  /* Evaluate BSDF. */
  const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D);

  BsdfEval bsdf_eval ccl_optional_struct_init;
  const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader);
  bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);

  if (ls.shader & SHADER_USE_MIS) {
    const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
    bsdf_eval_mul(&bsdf_eval, mis_weight);
  }

  /* Path termination (Russian-roulette style rejection of weak samples). */
  const float terminate = path_state_rng_light_termination(kg, rng_state);
  if (light_sample_terminate(kg, &ls, &bsdf_eval, terminate)) {
    return;
  }

  /* Create shadow ray. */
  Ray ray ccl_optional_struct_init;
  light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray);
  const bool is_light = light_sample_is_light(&ls);

  /* Copy volume stack and enter/exit volume. */
  integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);

  if (is_transmission) {
# ifdef __VOLUME__
    shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, sd);
# endif
  }

  /* Write shadow ray and associated state to global memory. */
  integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);

  /* Copy state from main path to shadow path. */
  const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
  const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
  uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
  shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
  shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
  const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval);

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
    /* The diffuse/glossy ratio is fixed at the first bounce and reused after. */
    const float3 diffuse_glossy_ratio = (bounce == 0) ?
                                            bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
                                            INTEGRATOR_STATE(path, diffuse_glossy_ratio);
    INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
  }

  INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
  INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
  INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
  INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;

  if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
    INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
  }

  /* Branch off shadow kernel. */
  INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
}
+#endif
+
/* Path tracing: bounce off or through surface with new direction.
 *
 * Returns the sampled closure label (LABEL_* bit flags), or LABEL_NONE when no
 * continuation was sampled and the path ends. */
ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE_ARGS,
                                                                ShaderData *sd,
                                                                const RNGState *rng_state)
{
  /* Sample BSDF or BSSRDF. */
  if (!(sd->flag & (SD_BSDF | SD_BSSRDF))) {
    return LABEL_NONE;
  }

  float bsdf_u, bsdf_v;
  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
  const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u);

#ifdef __SUBSURFACE__
  /* BSSRDF closure, we schedule subsurface intersection kernel. */
  if (CLOSURE_IS_BSSRDF(sc->type)) {
    return subsurface_bounce(INTEGRATOR_STATE_PASS, sd, sc);
  }
#endif

  /* BSDF closure, sample direction. */
  float bsdf_pdf;
  BsdfEval bsdf_eval ccl_optional_struct_init;
  float3 bsdf_omega_in ccl_optional_struct_init;
  differential3 bsdf_domega_in ccl_optional_struct_init;
  int label;

  label = shader_bsdf_sample_closure(
      kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);

  if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
    return LABEL_NONE;
  }

  /* Setup ray. Note that clipping works through transparent bounces. */
  INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
  INTEGRATOR_STATE_WRITE(ray, D) = normalize(bsdf_omega_in);
  INTEGRATOR_STATE_WRITE(ray, t) = (label & LABEL_TRANSPARENT) ?
                                       INTEGRATOR_STATE(ray, t) - sd->ray_length :
                                       FLT_MAX;

#ifdef __RAY_DIFFERENTIALS__
  INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
  INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(bsdf_domega_in);
#endif

  /* Update throughput. */
  float3 throughput = INTEGRATOR_STATE(path, throughput);
  throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
  INTEGRATOR_STATE_WRITE(path, throughput) = throughput;

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
    /* The diffuse/glossy pass ratio is fixed at the first bounce. */
    if (INTEGRATOR_STATE(path, bounce) == 0) {
      INTEGRATOR_STATE_WRITE(path,
                             diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
    }
  }

  /* Update path state */
  if (label & LABEL_TRANSPARENT) {
    /* Transparent: keep the MIS origin and accumulate traveled distance. */
    INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
  }
  else {
    INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = bsdf_pdf;
    INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
    INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(bsdf_pdf,
                                                      INTEGRATOR_STATE(path, min_ray_pdf));
  }

  path_state_next(INTEGRATOR_STATE_PASS, label);
  return label;
}
+
+#ifdef __VOLUME__
+ccl_device_forceinline bool integrate_surface_volume_only_bounce(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd)
+{
+ if (!path_state_volume_next(INTEGRATOR_STATE_PASS)) {
+ return LABEL_NONE;
+ }
+
+ /* Setup ray position, direction stays unchanged. */
+ INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, -sd->Ng);
+
+ /* Clipping works through transparent. */
+ INTEGRATOR_STATE_WRITE(ray, t) -= sd->ray_length;
+
+# ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+# endif
+
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
+
+ return LABEL_TRANSMIT | LABEL_TRANSPARENT;
+}
+#endif
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
/* Sample an ambient occlusion ray from the surface and write the AO pass when
 * it escapes unoccluded.
 *
 * On OptiX the body below is compiled into a direct callable instead: the
 * preprocessor closes integrate_surface_ao_pass after the optixDirectCall and
 * attaches the shared body to __direct_callable__ao_pass. */
ccl_device_forceinline void integrate_surface_ao_pass(INTEGRATOR_STATE_CONST_ARGS,
                                                      const ShaderData *ccl_restrict sd,
                                                      const RNGState *ccl_restrict rng_state,
                                                      ccl_global float *ccl_restrict render_buffer)
{
# ifdef __KERNEL_OPTIX__
  /* Forward to the direct callable; the closing brace below ends this
   * function, and the body that follows belongs to the callable. */
  optixDirectCall<void>(2, INTEGRATOR_STATE_PASS, sd, rng_state, render_buffer);
}

extern "C" __device__ void __direct_callable__ao_pass(INTEGRATOR_STATE_CONST_ARGS,
                                                      const ShaderData *ccl_restrict sd,
                                                      const RNGState *ccl_restrict rng_state,
                                                      ccl_global float *ccl_restrict render_buffer)
{
# endif /* __KERNEL_OPTIX__ */
  float bsdf_u, bsdf_v;
  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);

  const float3 ao_N = shader_bsdf_ao_normal(kg, sd);
  float3 ao_D;
  float ao_pdf;
  sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);

  /* Only trace directions leaving on the geometric normal side. */
  if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
    Ray ray ccl_optional_struct_init;
    ray.P = ray_offset(sd->P, sd->Ng);
    ray.D = ao_D;
    ray.t = kernel_data.integrator.ao_bounces_distance;
    ray.time = sd->time;
    ray.dP = differential_zero_compact();
    ray.dD = differential_zero_compact();

    Intersection isect ccl_optional_struct_init;
    if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
      /* Unoccluded: add the path throughput to the AO pass. */
      ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS,
                                                                 render_buffer);
      const float3 throughput = INTEGRATOR_STATE(path, throughput);
      kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
    }
  }
}
+#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
+
+/* Shade a surface hit: evaluate the surface shader, write render passes and
+ * emission, handle holdout and probabilistic path termination, sample direct
+ * lighting, and finally sample a BSDF/BSSRDF bounce direction.
+ * Returns false when the path terminates at this surface. */
+template<uint node_feature_mask>
+ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
+                                  ccl_global float *ccl_restrict render_buffer)
+
+{
+  PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_SURFACE_SETUP);
+
+  /* Setup shader data. */
+  ShaderData sd;
+  integrate_surface_shader_setup(INTEGRATOR_STATE_PASS, &sd);
+  PROFILING_SHADER(sd.object, sd.shader);
+
+  int continue_path_label = 0;
+
+  /* Skip most work for volume bounding surface. */
+#ifdef __VOLUME__
+  if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+#endif
+
+    {
+      const int path_flag = INTEGRATOR_STATE(path, flag);
+#ifdef __SUBSURFACE__
+      /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
+      if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
+#endif
+      {
+        /* Evaluate shader. */
+        PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
+        shader_eval_surface<node_feature_mask>(
+            INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
+      }
+    }
+
+#ifdef __SUBSURFACE__
+    if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+      /* When coming from inside subsurface scattering, setup a diffuse
+       * closure to perform lighting at the exit point. */
+      INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SUBSURFACE;
+      subsurface_shader_data_setup(INTEGRATOR_STATE_PASS, &sd);
+    }
+#endif
+
+    shader_prepare_surface_closures(INTEGRATOR_STATE_PASS, &sd);
+
+#ifdef __HOLDOUT__
+    /* Evaluate holdout. */
+    if (!integrate_surface_holdout(INTEGRATOR_STATE_PASS, &sd, render_buffer)) {
+      return false;
+    }
+#endif
+
+#ifdef __EMISSION__
+    /* Write emission. */
+    if (sd.flag & SD_EMISSION) {
+      integrate_surface_emission(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+    }
+#endif
+
+#ifdef __PASSES__
+    /* Write render passes. */
+    PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES);
+    kernel_write_data_passes(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+    /* Load random number state. */
+    RNGState rng_state;
+    path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+    /* Perform path termination. Most paths have already been terminated in
+     * the intersect_closest kernel, this is just for emission and for dividing
+     * throughput by the probability at the right moment. */
+    const int path_flag = INTEGRATOR_STATE(path, flag);
+    const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
+                                  0.0f :
+                                  path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+                                                                      path_flag);
+    if (probability == 0.0f) {
+      return false;
+    }
+    else if (probability != 1.0f) {
+      /* Russian roulette: compensate surviving paths by the probability. */
+      INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+    }
+
+#ifdef __DENOISING_FEATURES__
+    kernel_write_denoising_features_surface(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+#ifdef __SHADOW_CATCHER__
+    kernel_write_shadow_catcher_bounce_data(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+    /* Direct light. */
+    PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
+    integrate_surface_direct_light(INTEGRATOR_STATE_PASS, &sd, &rng_state);
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+    /* Ambient occlusion pass. */
+    if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
+      if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+          (INTEGRATOR_STATE(path, flag) & PATH_RAY_CAMERA)) {
+        PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+        integrate_surface_ao_pass(INTEGRATOR_STATE_PASS, &sd, &rng_state, render_buffer);
+      }
+    }
+#endif
+
+    PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+    continue_path_label = integrate_surface_bsdf_bssrdf_bounce(
+        INTEGRATOR_STATE_PASS, &sd, &rng_state);
+#ifdef __VOLUME__
+  }
+  else {
+    /* Volume bounding surface: pass straight through, only tracking the
+     * volume stack below. */
+    PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+    continue_path_label = integrate_surface_volume_only_bounce(INTEGRATOR_STATE_PASS, &sd);
+  }
+
+  if (continue_path_label & LABEL_TRANSMIT) {
+    /* Enter/Exit volume. */
+    volume_stack_enter_exit(INTEGRATOR_STATE_PASS, &sd);
+  }
+#endif
+
+  return continue_path_label != 0;
+}
+
+/* Shade-surface kernel entry point: runs integrate_surface and schedules the
+ * next kernel — subsurface intersection, closest intersection, or path
+ * termination — based on the resulting path state. */
+template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE,
+         int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
+ccl_device_forceinline void integrator_shade_surface(INTEGRATOR_STATE_ARGS,
+                                                     ccl_global float *ccl_restrict render_buffer)
+{
+  if (integrate_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, render_buffer)) {
+    if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+    }
+    else {
+      kernel_assert(INTEGRATOR_STATE(ray, t) != 0.0f);
+      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+    }
+  }
+  else {
+    INTEGRATOR_PATH_TERMINATE(current_kernel);
+  }
+}
+
+/* Variant of the shade-surface kernel with raytrace node features enabled,
+ * used for shaders containing AO / bevel nodes. */
+ccl_device_forceinline void integrator_shade_surface_raytrace(
+    INTEGRATOR_STATE_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+  integrator_shade_surface<KERNEL_FEATURE_NODE_MASK_SURFACE,
+                           DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE>(INTEGRATOR_STATE_PASS,
+                                                                            render_buffer);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
new file mode 100644
index 00000000000..4a864b1e6ce
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -0,0 +1,1015 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME__
+
+/* Events for probabilistic scattering */
+
+/* Outcome of integrating a volume segment. */
+typedef enum VolumeIntegrateEvent {
+  VOLUME_PATH_SCATTERED = 0,  /* Scattered inside the volume; continue with a phase bounce. */
+  VOLUME_PATH_ATTENUATED = 1, /* Passed through; throughput attenuated. */
+  VOLUME_PATH_MISSED = 2      /* No volume interaction on this segment. */
+} VolumeIntegrateEvent;
+
+/* Result of heterogeneous volume integration: separate scatter positions and
+ * throughputs for the direct-lighting and indirect-lighting strategies, since
+ * equiangular and distance sampling may pick different points on the ray. */
+typedef struct VolumeIntegrateResult {
+  /* Throughput and offset for direct light scattering. */
+  bool direct_scatter;
+  float3 direct_throughput;
+  float direct_t; /* Distance along the ray of the direct-light scatter point. */
+  ShaderVolumePhases direct_phases;
+
+  /* Throughput and offset for indirect light scattering. */
+  bool indirect_scatter;
+  float3 indirect_throughput;
+  float indirect_t; /* Distance along the ray of the indirect scatter point. */
+  ShaderVolumePhases indirect_phases;
+} VolumeIntegrateResult;
+
+/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
+ * and precision issues.
+ * todo: this value could be tweaked or turned into a probability to avoid unnecessary
+ * work in volumes and subsurface scattering. */
+# define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+/* Volume shader properties
+ *
+ * extinction coefficient = absorption coefficient + scattering coefficient
+ * sigma_t = sigma_a + sigma_s */
+
+typedef struct VolumeShaderCoefficients {
+  float3 sigma_t; /* Extinction coefficient (absorption + scattering). */
+  float3 sigma_s; /* Scattering coefficient. */
+  float3 emission;
+} VolumeShaderCoefficients;
+
+/* Evaluate shader to get extinction coefficient at P. */
+/* Evaluate the volume shader at sd->P for a shadow ray and return the
+ * extinction coefficient, scaled by the object's volume density.
+ * Returns false when the shader has no extinction at this point. */
+ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS,
+                                                   ShaderData *ccl_restrict sd,
+                                                   float3 *ccl_restrict extinction)
+{
+  /* Read volume closures from the shadow path's own volume stack. */
+  shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
+    return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+  });
+
+  if (!(sd->flag & SD_EXTINCTION)) {
+    return false;
+  }
+
+  const float density = object_volume_density(kg, sd->object);
+  *extinction = sd->closure_transparent_extinction * density;
+  return true;
+}
+
+/* Evaluate shader to get absorption, scattering and emission at P. */
+/* Evaluate the volume shader at sd->P and gather absorption, scattering and
+ * emission coefficients, scaled by the object's volume density.
+ * Returns false when the shader produced none of these at this point. */
+ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS,
+                                            ShaderData *ccl_restrict sd,
+                                            VolumeShaderCoefficients *coeff)
+{
+  const int path_flag = INTEGRATOR_STATE(path, flag);
+  shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
+    return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+  });
+
+  if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
+    return false;
+  }
+
+  coeff->sigma_s = zero_float3();
+  coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
+  coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
+
+  if (sd->flag & SD_SCATTER) {
+    /* Sum scattering coefficients over all volume closures. */
+    for (int i = 0; i < sd->num_closure; i++) {
+      const ShaderClosure *sc = &sd->closure[i];
+
+      if (CLOSURE_IS_VOLUME(sc->type)) {
+        coeff->sigma_s += sc->weight;
+      }
+    }
+  }
+
+  const float density = object_volume_density(kg, sd->object);
+  coeff->sigma_s *= density;
+  coeff->sigma_t *= density;
+  coeff->emission *= density;
+
+  return true;
+}
+
+/* Compute step size, per-step shading offset, starting offset and step count
+ * for ray-marching a volume segment of length t. A homogeneous volume
+ * (object_step_size == FLT_MAX) is integrated in a single step. */
+ccl_device_forceinline void volume_step_init(const KernelGlobals *kg,
+                                             const RNGState *rng_state,
+                                             const float object_step_size,
+                                             float t,
+                                             float *step_size,
+                                             float *step_shade_offset,
+                                             float *steps_offset,
+                                             int *max_steps)
+{
+  if (object_step_size == FLT_MAX) {
+    /* Homogeneous volume. */
+    *step_size = t;
+    *step_shade_offset = 0.0f;
+    *steps_offset = 1.0f;
+    *max_steps = 1;
+  }
+  else {
+    /* Heterogeneous volume. */
+    *max_steps = kernel_data.integrator.volume_max_steps;
+    float step = min(object_step_size, t);
+
+    /* compute exact steps in advance for malloc */
+    if (t > *max_steps * step) {
+      /* Stretch the step so the whole segment fits within max_steps. */
+      step = t / (float)*max_steps;
+    }
+
+    *step_size = step;
+
+    /* Perform shading at this offset within a step, to integrate
+     * over the entire step segment. */
+    *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4);
+
+    /* Shift starting point of all segments by this random amount to avoid
+     * banding artifacts from the volume bounding shape. */
+    *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3);
+  }
+}
+
+/* Volume Shadows
+ *
+ * These functions are used to attenuate shadow rays to lights. Both absorption
+ * and scattering will block light, represented by the extinction coefficient. */
+
+# if 0
+/* homogeneous volume: assume shader evaluation at the start gives
+ * the extinction coefficient for the entire line segment */
+/* NOTE: currently disabled via the surrounding `#if 0`; kept for reference. */
+ccl_device void volume_shadow_homogeneous(INTEGRATOR_STATE_ARGS,
+                                          Ray *ccl_restrict ray,
+                                          ShaderData *ccl_restrict sd,
+                                          float3 *ccl_restrict throughput)
+{
+  float3 sigma_t = zero_float3();
+
+  if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+    /* Closed-form Beer-Lambert attenuation over the full ray length. */
+    *throughput *= volume_color_transmittance(sigma_t, ray->t);
+  }
+}
+# endif
+
+/* heterogeneous volume: integrate stepping through the volume until we
+ * reach the end, get absorbed entirely, or run out of iterations */
+/* Attenuate *throughput by ray-marching the heterogeneous volume along the
+ * shadow ray, accumulating optical depth and stopping early once nearly all
+ * light is blocked or the end of the segment is reached. */
+ccl_device void volume_shadow_heterogeneous(INTEGRATOR_STATE_ARGS,
+                                            Ray *ccl_restrict ray,
+                                            ShaderData *ccl_restrict sd,
+                                            float3 *ccl_restrict throughput,
+                                            const float object_step_size)
+{
+  /* Load random number state. */
+  RNGState rng_state;
+  shadow_path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+  float3 tp = *throughput;
+
+  /* Prepare for stepping.
+   * For shadows we do not offset all segments, since the starting point is
+   * already a random distance inside the volume. It also appears to create
+   * banding artifacts for unknown reasons. */
+  int max_steps;
+  float step_size, step_shade_offset, unused;
+  volume_step_init(kg,
+                   &rng_state,
+                   object_step_size,
+                   ray->t,
+                   &step_size,
+                   &step_shade_offset,
+                   &unused,
+                   &max_steps);
+  const float steps_offset = 1.0f;
+
+  /* compute extinction at the start */
+  float t = 0.0f;
+
+  /* Accumulated optical depth (negative sigma_t * dt terms). */
+  float3 sum = zero_float3();
+
+  for (int i = 0; i < max_steps; i++) {
+    /* advance to new position */
+    float new_t = min(ray->t, (i + steps_offset) * step_size);
+    float dt = new_t - t;
+
+    float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
+    float3 sigma_t = zero_float3();
+
+    /* compute attenuation over segment */
+    sd->P = new_P;
+    if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+      /* Compute expf() only for every Nth step, to save some calculations
+       * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON
+       * check then. */
+      sum += (-sigma_t * dt);
+      if ((i & 0x07) == 0) { /* ToDo: Other interval? */
+        tp = *throughput * exp3(sum);
+
+        /* stop if nearly all light is blocked */
+        if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
+            tp.z < VOLUME_THROUGHPUT_EPSILON)
+          break;
+      }
+    }
+
+    /* stop if at the end of the volume */
+    t = new_t;
+    if (t == ray->t) {
+      /* Update throughput in case we haven't done it above */
+      tp = *throughput * exp3(sum);
+      break;
+    }
+  }
+
+  *throughput = tp;
+}
+
+/* Equi-angular sampling as in:
+ * "Importance Sampling Techniques for Path Tracing in Participating Media" */
+
+/* Sample a distance along the ray proportionally to the inverse-squared
+ * distance to light_P, using random number xi in [0, 1). Writes the sample
+ * PDF to *pdf (0 for degenerate configurations). */
+ccl_device float volume_equiangular_sample(const Ray *ccl_restrict ray,
+                                           const float3 light_P,
+                                           const float xi,
+                                           float *pdf)
+{
+  const float t = ray->t;
+  /* delta: signed distance of the light's closest point along the ray;
+   * D: perpendicular distance from the ray to the light. */
+  const float delta = dot((light_P - ray->P), ray->D);
+  const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+  if (UNLIKELY(D == 0.0f)) {
+    *pdf = 0.0f;
+    return 0.0f;
+  }
+  const float theta_a = -atan2f(delta, D);
+  const float theta_b = atan2f(t - delta, D);
+  const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
+  if (UNLIKELY(theta_b == theta_a)) {
+    *pdf = 0.0f;
+    return 0.0f;
+  }
+  *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+  return min(t, delta + t_); /* min is only for float precision errors */
+}
+
+/* PDF of equiangular sampling producing distance sample_t along the ray,
+ * for MIS against distance sampling. Returns 0 for degenerate setups. */
+ccl_device float volume_equiangular_pdf(const Ray *ccl_restrict ray,
+                                        const float3 light_P,
+                                        const float sample_t)
+{
+  const float delta = dot((light_P - ray->P), ray->D);
+  const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+  if (UNLIKELY(D == 0.0f)) {
+    return 0.0f;
+  }
+
+  const float t = ray->t;
+  const float t_ = sample_t - delta;
+
+  const float theta_a = -atan2f(delta, D);
+  const float theta_b = atan2f(t - delta, D);
+  if (UNLIKELY(theta_b == theta_a)) {
+    return 0.0f;
+  }
+
+  const float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+  return pdf;
+}
+
+/* CDF of the equiangular distribution at distance sample_t along the ray.
+ * Returns 0 for degenerate setups. */
+ccl_device float volume_equiangular_cdf(const Ray *ccl_restrict ray,
+                                        const float3 light_P,
+                                        const float sample_t)
+{
+  float delta = dot((light_P - ray->P), ray->D);
+  float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+  if (UNLIKELY(D == 0.0f)) {
+    return 0.0f;
+  }
+
+  const float t = ray->t;
+  const float t_ = sample_t - delta;
+
+  const float theta_a = -atan2f(delta, D);
+  const float theta_b = atan2f(t - delta, D);
+  if (UNLIKELY(theta_b == theta_a)) {
+    return 0.0f;
+  }
+
+  const float theta_sample = atan2f(t_, D);
+  const float cdf = (theta_sample - theta_a) / (theta_b - theta_a);
+
+  return cdf;
+}
+
+/* Distance sampling */
+
+/* Sample a scatter distance in [0, max_t] proportionally to the transmittance
+ * of the chosen color channel, remapped so a scatter event always occurs.
+ * Writes the transmittance and per-channel PDF at the sampled distance. */
+ccl_device float volume_distance_sample(
+    float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
+{
+  /* xi is [0, 1[ so log(0) should never happen, division by zero is
+   * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
+  float sample_sigma_t = volume_channel_get(sigma_t, channel);
+  float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+  float sample_transmittance = volume_channel_get(full_transmittance, channel);
+
+  float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
+
+  *transmittance = volume_color_transmittance(sigma_t, sample_t);
+  *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
+
+  /* todo: optimization: when taken together with the hit/miss decision,
+   * the full_transmittance term cancels out and xi does not
+   * need to be remapped */
+
+  return sample_t;
+}
+
+/* Per-channel PDF of distance sampling producing sample_t within [0, max_t],
+ * matching the remapped distribution used in volume_distance_sample. */
+ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
+{
+  float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+  float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
+
+  return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
+}
+
+/* Emission */
+
+/* Analytically integrate emission along a segment of length t, attenuated by
+ * the segment's extinction (transmittance passed in precomputed). */
+ccl_device float3 volume_emission_integrate(VolumeShaderCoefficients *coeff,
+                                            int closure_flag,
+                                            float3 transmittance,
+                                            float t)
+{
+  /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
+   * this goes to E * t as sigma_t goes to zero
+   *
+   * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
+  float3 emission = coeff->emission;
+
+  if (closure_flag & SD_EXTINCTION) {
+    float3 sigma_t = coeff->sigma_t;
+
+    /* Per-channel: use the closed form when sigma_t > 0, the t limit otherwise. */
+    emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
+    emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
+    emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
+  }
+  else
+    emission *= t;
+
+  return emission;
+}
+
+/* Volume Integration */
+
+/* Mutable state carried across ray-marching steps in
+ * volume_integrate_heterogeneous / volume_integrate_step_scattering. */
+typedef struct VolumeIntegrateState {
+  /* Volume segment extents. */
+  float start_t;
+  float end_t;
+
+  /* If volume is absorption-only up to this point, and no probabilistic
+   * scattering or termination has been used yet. */
+  bool absorption_only;
+
+  /* Random numbers for scattering. */
+  float rscatter;
+  float rphase;
+
+  /* Multiple importance sampling. */
+  VolumeSampleMethod direct_sample_method; /* Strategy chosen for direct lighting. */
+  bool use_mis;                            /* True when combining both strategies. */
+  float distance_pdf;                      /* Running distance-sampling PDF for MIS. */
+  float equiangular_pdf;                   /* PDF of the pre-sampled equiangular point. */
+} VolumeIntegrateState;
+
+/* Handle scattering within one ray-marching step: record the equiangular
+ * direct-light scatter point if it falls in this segment, and probabilistically
+ * scatter (distance sampling) for indirect — and optionally direct — lighting,
+ * updating throughputs and MIS PDFs in vstate/result. */
+ccl_device_forceinline void volume_integrate_step_scattering(
+    const ShaderData *sd,
+    const Ray *ray,
+    const float3 equiangular_light_P,
+    const VolumeShaderCoefficients &ccl_restrict coeff,
+    const float3 transmittance,
+    VolumeIntegrateState &ccl_restrict vstate,
+    VolumeIntegrateResult &ccl_restrict result)
+{
+  /* Pick random color channel, we use the Veach one-sample
+   * model with balance heuristic for the channels. */
+  const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+  float3 channel_pdf;
+  const int channel = volume_sample_channel(
+      albedo, result.indirect_throughput, vstate.rphase, &channel_pdf);
+
+  /* Equiangular sampling for direct lighting. */
+  if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
+    if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) {
+      /* Pre-sampled scatter point lies in this segment: finalize it. */
+      const float new_dt = result.direct_t - vstate.start_t;
+      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+
+      result.direct_scatter = true;
+      result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf;
+      shader_copy_volume_phases(&result.direct_phases, sd);
+
+      /* Multiple importance sampling. */
+      if (vstate.use_mis) {
+        const float distance_pdf = vstate.distance_pdf *
+                                   dot(channel_pdf, coeff.sigma_t * new_transmittance);
+        const float mis_weight = 2.0f * power_heuristic(vstate.equiangular_pdf, distance_pdf);
+        result.direct_throughput *= mis_weight;
+      }
+    }
+    else {
+      /* Scatter point not reached yet: attenuate and accumulate the PDF. */
+      result.direct_throughput *= transmittance;
+      vstate.distance_pdf *= dot(channel_pdf, transmittance);
+    }
+  }
+
+  /* Distance sampling for indirect and optional direct lighting. */
+  if (!result.indirect_scatter) {
+    /* decide if we will scatter or continue */
+    const float sample_transmittance = volume_channel_get(transmittance, channel);
+
+    if (1.0f - vstate.rscatter >= sample_transmittance) {
+      /* compute sampling distance */
+      const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel);
+      const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t;
+      const float new_t = vstate.start_t + new_dt;
+
+      /* transmittance and pdf */
+      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+      const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);
+
+      /* throughput */
+      result.indirect_scatter = true;
+      result.indirect_t = new_t;
+      result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
+      shader_copy_volume_phases(&result.indirect_phases, sd);
+
+      if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+        /* If using distance sampling for direct light, just copy parameters
+         * of indirect light since we scatter at the same point then. */
+        result.direct_scatter = true;
+        result.direct_t = result.indirect_t;
+        result.direct_throughput = result.indirect_throughput;
+        shader_copy_volume_phases(&result.direct_phases, sd);
+
+        /* Multiple importance sampling. */
+        if (vstate.use_mis) {
+          const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
+          const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
+                                                   equiangular_pdf);
+          result.direct_throughput *= 2.0f * mis_weight;
+        }
+      }
+    }
+    else {
+      /* throughput */
+      const float pdf = dot(channel_pdf, transmittance);
+      result.indirect_throughput *= transmittance / pdf;
+      if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+        vstate.distance_pdf *= pdf;
+      }
+
+      /* remap rscatter so we can reuse it and keep thing stratified */
+      vstate.rscatter = 1.0f - (1.0f - vstate.rscatter) / sample_transmittance;
+    }
+  }
+}
+
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this does probabilistically scatter or get transmitted through
+ * for path tracing where we don't want to branch. */
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this does probabilistically scatter or get transmitted through
+ * for path tracing where we don't want to branch. */
+ccl_device_forceinline void volume_integrate_heterogeneous(
+    INTEGRATOR_STATE_ARGS,
+    Ray *ccl_restrict ray,
+    ShaderData *ccl_restrict sd,
+    const RNGState *rng_state,
+    ccl_global float *ccl_restrict render_buffer,
+    const float object_step_size,
+    const VolumeSampleMethod direct_sample_method,
+    const float3 equiangular_light_P,
+    VolumeIntegrateResult &result)
+{
+  PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INTEGRATE);
+
+  /* Prepare for stepping.
+   * Using a different step offset for the first step avoids banding artifacts. */
+  int max_steps;
+  float step_size, step_shade_offset, steps_offset;
+  volume_step_init(kg,
+                   rng_state,
+                   object_step_size,
+                   ray->t,
+                   &step_size,
+                   &step_shade_offset,
+                   &steps_offset,
+                   &max_steps);
+
+  /* Initialize volume integration state. */
+  VolumeIntegrateState vstate ccl_optional_struct_init;
+  vstate.start_t = 0.0f;
+  vstate.end_t = 0.0f;
+  vstate.absorption_only = true;
+  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE);
+  vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL);
+
+  /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */
+  vstate.direct_sample_method = direct_sample_method;
+  vstate.use_mis = (direct_sample_method == VOLUME_SAMPLE_MIS);
+  if (vstate.use_mis) {
+    /* One-sample MIS: split rscatter to choose the strategy, then remap it
+     * back to [0, 1) for reuse. */
+    if (vstate.rscatter < 0.5f) {
+      vstate.rscatter *= 2.0f;
+      vstate.direct_sample_method = VOLUME_SAMPLE_DISTANCE;
+    }
+    else {
+      vstate.rscatter = (vstate.rscatter - 0.5f) * 2.0f;
+      vstate.direct_sample_method = VOLUME_SAMPLE_EQUIANGULAR;
+    }
+  }
+  vstate.equiangular_pdf = 0.0f;
+  vstate.distance_pdf = 1.0f;
+
+  /* Initialize volume integration result. */
+  const float3 throughput = INTEGRATOR_STATE(path, throughput);
+  result.direct_throughput = throughput;
+  result.indirect_throughput = throughput;
+
+  /* Equiangular sampling: compute distance and PDF in advance. */
+  if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) {
+    result.direct_t = volume_equiangular_sample(
+        ray, equiangular_light_P, vstate.rscatter, &vstate.equiangular_pdf);
+  }
+
+# ifdef __DENOISING_FEATURES__
+  const bool write_denoising_features = (INTEGRATOR_STATE(path, flag) &
+                                         PATH_RAY_DENOISING_FEATURES);
+  float3 accum_albedo = zero_float3();
+# endif
+  float3 accum_emission = zero_float3();
+
+  for (int i = 0; i < max_steps; i++) {
+    /* Advance to new position */
+    vstate.end_t = min(ray->t, (i + steps_offset) * step_size);
+    const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset;
+    sd->P = ray->P + ray->D * shade_t;
+
+    /* compute segment */
+    VolumeShaderCoefficients coeff ccl_optional_struct_init;
+    if (volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &coeff)) {
+      const int closure_flag = sd->flag;
+
+      /* Evaluate transmittance over segment. */
+      const float dt = (vstate.end_t - vstate.start_t);
+      const float3 transmittance = (closure_flag & SD_EXTINCTION) ?
+                                       volume_color_transmittance(coeff.sigma_t, dt) :
+                                       one_float3();
+
+      /* Emission. */
+      if (closure_flag & SD_EMISSION) {
+        /* Only write emission before indirect light scatter position, since we terminate
+         * stepping at that point if we have already found a direct light scatter position. */
+        if (!result.indirect_scatter) {
+          const float3 emission = volume_emission_integrate(
+              &coeff, closure_flag, transmittance, dt);
+          accum_emission += emission;
+        }
+      }
+
+      if (closure_flag & SD_EXTINCTION) {
+        if ((closure_flag & SD_SCATTER) || !vstate.absorption_only) {
+# ifdef __DENOISING_FEATURES__
+          /* Accumulate albedo for denoising features. */
+          if (write_denoising_features && (closure_flag & SD_SCATTER)) {
+            const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+            accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance);
+          }
+# endif
+
+          /* Scattering and absorption. */
+          volume_integrate_step_scattering(
+              sd, ray, equiangular_light_P, coeff, transmittance, vstate, result);
+        }
+        else {
+          /* Absorption only. */
+          result.indirect_throughput *= transmittance;
+          result.direct_throughput *= transmittance;
+        }
+
+        /* Stop if nearly all light blocked. */
+        if (!result.indirect_scatter) {
+          if (max3(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+            result.indirect_throughput = zero_float3();
+            break;
+          }
+        }
+        else if (!result.direct_scatter) {
+          if (max3(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+            break;
+          }
+        }
+      }
+
+      /* If we have scattering data for both direct and indirect, we're done. */
+      if (result.direct_scatter && result.indirect_scatter) {
+        break;
+      }
+    }
+
+    /* Stop if at the end of the volume. */
+    vstate.start_t = vstate.end_t;
+    if (vstate.start_t == ray->t) {
+      break;
+    }
+  }
+
+  /* Write accumulated emission. */
+  if (!is_zero(accum_emission)) {
+    kernel_accum_emission(
+        INTEGRATOR_STATE_PASS, result.indirect_throughput, accum_emission, render_buffer);
+  }
+
+# ifdef __DENOISING_FEATURES__
+  /* Write denoising features. */
+  if (write_denoising_features) {
+    kernel_write_denoising_features_volume(
+        INTEGRATOR_STATE_PASS, accum_albedo, result.indirect_scatter, render_buffer);
+  }
+# endif /* __DENOISING_FEATURES__ */
+}
+
+# ifdef __EMISSION__
+/* Path tracing: sample a point on a light ahead of volume stepping, for
+ * equiangular sampling. */
+/* Sample a light position ahead of volume stepping so equiangular sampling
+ * can target it. Returns false when no usable light sample was produced. */
+ccl_device_forceinline bool integrate_volume_sample_light(INTEGRATOR_STATE_ARGS,
+                                                          const ShaderData *ccl_restrict sd,
+                                                          const RNGState *ccl_restrict rng_state,
+                                                          LightSample *ccl_restrict ls)
+{
+  /* Test if there is a light or BSDF that needs direct light. */
+  if (!kernel_data.integrator.use_direct_light) {
+    return false;
+  }
+
+  /* Sample position on a light. */
+  const int path_flag = INTEGRATOR_STATE(path, flag);
+  const uint bounce = INTEGRATOR_STATE(path, bounce);
+  float light_u, light_v;
+  path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+  light_distribution_sample_from_volume_segment(
+      kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls);
+
+  /* Lights excluded from volume scatter cannot contribute here. */
+  if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+    return false;
+  }
+
+  return true;
+}
+
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+/* Path tracing: resample the light from the volume scatter point P, evaluate
+ * its shader and the phase function, then queue a shadow ray on the shadow
+ * path with MIS-weighted throughput. */
+ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS,
+                                                          const ShaderData *ccl_restrict sd,
+                                                          const RNGState *ccl_restrict rng_state,
+                                                          const float3 P,
+                                                          const ShaderVolumePhases *ccl_restrict
+                                                              phases,
+                                                          const float3 throughput,
+                                                          LightSample *ccl_restrict ls)
+{
+  PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT);
+
+  if (!kernel_data.integrator.use_direct_light) {
+    return;
+  }
+
+  /* Sample position on the same light again, now from the shading
+   * point where we scattered.
+   *
+   * TODO: decorrelate random numbers and use light_sample_new_position to
+   * avoid resampling the CDF. */
+  {
+    const int path_flag = INTEGRATOR_STATE(path, flag);
+    const uint bounce = INTEGRATOR_STATE(path, bounce);
+    float light_u, light_v;
+    path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+    if (!light_distribution_sample_from_position(
+            kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) {
+      return;
+    }
+  }
+
+  /* Evaluate light shader.
+   *
+   * TODO: can we reuse sd memory? In theory we can move this after
+   * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
+   * the light shader. This could also move to its own kernel, for
+   * non-constant light sources. */
+  ShaderDataTinyStorage emission_sd_storage;
+  ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+  const float3 light_eval = light_sample_shader_eval(
+      INTEGRATOR_STATE_PASS, emission_sd, ls, sd->time);
+  if (is_zero(light_eval)) {
+    return;
+  }
+
+  /* Evaluate BSDF. */
+  BsdfEval phase_eval ccl_optional_struct_init;
+  const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
+
+  if (ls->shader & SHADER_USE_MIS) {
+    /* MIS weight against phase-function sampling of the same light. */
+    float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+    bsdf_eval_mul(&phase_eval, mis_weight);
+  }
+
+  bsdf_eval_mul3(&phase_eval, light_eval / ls->pdf);
+
+  /* Path termination. */
+  const float terminate = path_state_rng_light_termination(kg, rng_state);
+  if (light_sample_terminate(kg, ls, &phase_eval, terminate)) {
+    return;
+  }
+
+  /* Create shadow ray. */
+  Ray ray ccl_optional_struct_init;
+  light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray);
+  const bool is_light = light_sample_is_light(ls);
+
+  /* Write shadow ray and associated state to global memory. */
+  integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+  /* Copy state from main path to shadow path. */
+  const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+  const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
+  uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
+  shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
+  shadow_flag |= PATH_RAY_VOLUME_PASS;
+  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
+
+  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+    const float3 diffuse_glossy_ratio = (bounce == 0) ?
+                                            one_float3() :
+                                            INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+    INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+  }
+
+  INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
+  INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
+  INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
+  INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput_phase;
+
+  if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
+    INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
+  }
+
+  integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
+
+  /* Branch off shadow kernel. */
+  INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+}
+# endif
+
+/* Path tracing: scatter in new direction using phase function */
+/* Path tracing: scatter in new direction using phase function.
+ * Sets up the continuation ray and updates throughput and MIS state;
+ * returns false when the sampled phase direction carries no energy. */
+ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS,
+                                                           ShaderData *sd,
+                                                           const RNGState *rng_state,
+                                                           const ShaderVolumePhases *phases)
+{
+  PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT);
+
+  float phase_u, phase_v;
+  path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v);
+
+  /* Phase closure, sample direction. */
+  float phase_pdf;
+  BsdfEval phase_eval ccl_optional_struct_init;
+  float3 phase_omega_in ccl_optional_struct_init;
+  differential3 phase_domega_in ccl_optional_struct_init;
+
+  const int label = shader_volume_phase_sample(kg,
+                                               sd,
+                                               phases,
+                                               phase_u,
+                                               phase_v,
+                                               &phase_eval,
+                                               &phase_omega_in,
+                                               &phase_domega_in,
+                                               &phase_pdf);
+
+  if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
+    return false;
+  }
+
+  /* Setup ray. */
+  INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+  INTEGRATOR_STATE_WRITE(ray, D) = normalize(phase_omega_in);
+  INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+
+# ifdef __RAY_DIFFERENTIALS__
+  INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+  INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(phase_domega_in);
+# endif
+
+  /* Update throughput. */
+  const float3 throughput = INTEGRATOR_STATE(path, throughput);
+  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf;
+  INTEGRATOR_STATE_WRITE(path, throughput) = throughput_phase;
+
+  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+    INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+  }
+
+  /* Update path state */
+  INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf;
+  INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+  INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(phase_pdf,
+                                                    INTEGRATOR_STATE(path, min_ray_pdf));
+
+  path_state_next(INTEGRATOR_STATE_PASS, label);
+  return true;
+}
+
+/* get the volume attenuation and emission over line segment defined by
+ * ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device VolumeIntegrateEvent volume_integrate(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ShaderData sd;
+ shader_setup_from_volume(kg, &sd, ray);
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* Sample light ahead of volume stepping, for equiangular sampling. */
+ /* TODO: distant lights are ignored now, but could instead use even distribution. */
+ LightSample ls ccl_optional_struct_init;
+ const bool need_light_sample = !(INTEGRATOR_STATE(path, flag) & PATH_RAY_TERMINATE);
+ const bool have_equiangular_sample = need_light_sample &&
+ integrate_volume_sample_light(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &ls) &&
+ (ls.t != FLT_MAX);
+
+ VolumeSampleMethod direct_sample_method = (have_equiangular_sample) ?
+ volume_stack_sample_method(INTEGRATOR_STATE_PASS) :
+ VOLUME_SAMPLE_DISTANCE;
+
+ /* Step through volume. */
+ const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
+ return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ /* TODO: expensive to zero closures? */
+ VolumeIntegrateResult result = {};
+ volume_integrate_heterogeneous(INTEGRATOR_STATE_PASS,
+ ray,
+ &sd,
+ &rng_state,
+ render_buffer,
+ step_size,
+ direct_sample_method,
+ ls.P,
+ result);
+
+ /* Perform path termination. The intersect_closest will have already marked this path
+ * to be terminated. That will make shading evaluation leave out any scattering closures,
+ * but emission and absorption are still handled for multiple importance sampling. */
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ?
+ 0.0f :
+ path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+ path_flag);
+ if (probability == 0.0f) {
+ return VOLUME_PATH_MISSED;
+ }
+
+ /* Direct light. */
+ if (result.direct_scatter) {
+ const float3 direct_P = ray->P + result.direct_t * ray->D;
+ result.direct_throughput /= probability;
+ integrate_volume_direct_light(INTEGRATOR_STATE_PASS,
+ &sd,
+ &rng_state,
+ direct_P,
+ &result.direct_phases,
+ result.direct_throughput,
+ &ls);
+ }
+
+ /* Indirect light.
+ *
+ * Only divide throughput by probability if we scatter. For the attenuation
+ * case the next surface will already do this division. */
+ if (result.indirect_scatter) {
+ result.indirect_throughput /= probability;
+ }
+ INTEGRATOR_STATE_WRITE(path, throughput) = result.indirect_throughput;
+
+ if (result.indirect_scatter) {
+ sd.P = ray->P + result.indirect_t * ray->D;
+
+ if (integrate_volume_phase_scatter(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &result.indirect_phases)) {
+ return VOLUME_PATH_SCATTERED;
+ }
+ else {
+ return VOLUME_PATH_MISSED;
+ }
+ }
+ else {
+ return VOLUME_PATH_ATTENUATED;
+ }
+}
+
+#endif
+
+ccl_device void integrator_shade_volume(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_SETUP);
+
+#ifdef __VOLUME__
+ /* Setup shader data. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Set ray length to current segment. */
+ ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;
+
+ /* Clean volume stack for background rays. */
+ if (isect.prim == PRIM_NONE) {
+ volume_stack_clean(INTEGRATOR_STATE_PASS);
+ }
+
+ VolumeIntegrateEvent event = volume_integrate(INTEGRATOR_STATE_PASS, &ray, render_buffer);
+
+ if (event == VOLUME_PATH_SCATTERED) {
+ /* Queue intersect_closest kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ else if (event == VOLUME_PATH_MISSED) {
+ /* End path. */
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ return;
+ }
+ else {
+ /* Continue to background, light or surface. */
+ if (isect.prim == PRIM_NONE) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+ else if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ }
+#endif /* __VOLUME__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
new file mode 100644
index 00000000000..8cef9cf31e2
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Integrator State
+ *
+ * This file defines the data structures that define the state of a path. Any state that is
+ * preserved and passed between kernel executions is part of this.
+ *
+ * The size of this state must be kept as small as possible, to reduce cache misses and keep memory
+ * usage under control on GPUs that may execute millions of kernels.
+ *
+ * Memory may be allocated and passed along in different ways depending on the device. There may
+ * be a scalar layout, or AoS or SoA layout for batches. The state may be passed along as a pointer
+ * to every kernel, or the pointer may exist at program scope or in constant memory. To abstract
+ * these differences between devices and experiment with different layouts, macros are used.
+ *
+ * INTEGRATOR_STATE_ARGS: prepend to argument definitions for every function that accesses
+ * path state.
+ * INTEGRATOR_STATE_CONST_ARGS: same as INTEGRATOR_STATE_ARGS, when state is read-only
+ * INTEGRATOR_STATE_PASS: use to pass along state to other functions that access it.
+ *
+ * INTEGRATOR_STATE(x, y): read nested struct member x.y of IntegratorState
+ * INTEGRATOR_STATE_WRITE(x, y): write to nested struct member x.y of IntegratorState
+ *
+ * INTEGRATOR_STATE_ARRAY(x, index, y): read x[index].y
+ * INTEGRATOR_STATE_ARRAY_WRITE(x, index, y): write x[index].y
+ *
+ * INTEGRATOR_STATE_COPY(to_x, from_x): copy contents of one nested struct to another
+ *
+ * INTEGRATOR_STATE_IS_NULL: test if any integrator state is available, for shader evaluation
+ * INTEGRATOR_STATE_PASS_NULL: use to pass empty state to other functions.
+ *
+ * NOTE: if we end up with a device that passes no arguments, the leading comma will be a problem.
+ * Can solve it with more macros if we encounter it, but rather ugly so postpone for now.
+ */
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_types.h"
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Constants
+ *
+ * TODO: these could be made dynamic depending on the features used in the scene. */
+
+#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
+#define INTEGRATOR_SHADOW_ISECT_SIZE 4
+
+/* Data structures */
+
+/* Integrator State
+ *
+ * CPU rendering path state with AoS layout. */
+typedef struct IntegratorStateCPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+} IntegratorStateCPU;
+
+/* Path Queue
+ *
+ * Keep track of which kernels are queued to be executed next in the path
+ * for GPU rendering. */
+typedef struct IntegratorQueueCounter {
+ int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM];
+} IntegratorQueueCounter;
+
+/* Integrator State GPU
+ *
+ * GPU rendering path state with SoA layout. */
+typedef struct IntegratorStateGPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+
+ /* Count number of queued kernels. */
+ IntegratorQueueCounter *queue_counter;
+
+ /* Count number of kernels queued for specific shaders. */
+ int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
+
+ /* Index of path which will be used by a next shadow catcher split. */
+ int *next_shadow_catcher_path_index;
+} IntegratorStateGPU;
+
+/* Abstraction
+ *
+ * Macros to access data structures on different devices.
+ *
+ * Note that there is a special access function for the shadow catcher state. This access is to
+ * happen from a kernel which operates on a "main" path. Attempt to use shadow catcher accessors
+ * from a kernel which operates on a shadow catcher state will cause bad memory access. */
+
+#ifdef __KERNEL_CPU__
+
+/* Scalar access on CPU. */
+
+typedef IntegratorStateCPU *ccl_restrict IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ const IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, NULL
+# define INTEGRATOR_STATE_IS_NULL (state == NULL)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct.member)
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) (state->nested_struct.member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct[array_index].member)
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ ((state)->nested_struct[array_index].member)
+
+#else /* __KERNEL_CPU__ */
+
+/* Array access on GPU with Structure-of-Arrays. */
+
+typedef int IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, -1
+# define INTEGRATOR_STATE_IS_NULL (state == -1)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ kernel_integrator_state.nested_struct.member[state]
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) INTEGRATOR_STATE(nested_struct, member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ kernel_integrator_state.nested_struct[array_index].member[state]
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member)
+
+#endif /* __KERNEL_CPU__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
new file mode 100644
index 00000000000..8477efd7b66
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_types.h"
+#include "util/util_atomic.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Control Flow
+ *
+ * Utilities for control flow between kernels. The implementation may differ per device
+ * or even be handled on the host side. To abstract such differences, experiment with
+ * different implementations and for debugging, this is abstracted using macros.
+ *
+ * There is a main path for regular path tracing of camera rays. Shadows for next
+ * event estimation branch off from this into their own path, that may be computed in
+ * parallel while the main path continues.
+ *
+ * Each kernel on the main path must call one of these functions. These may not be called
+ * multiple times from the same kernel.
+ *
+ * INTEGRATOR_PATH_INIT(next_kernel)
+ * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
+ * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ *
+ * For the shadow path similar functions are used, and again each shadow kernel must call
+ * one of them, and only once.
+ */
+
+#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(path, queued_kernel) == 0)
+#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0)
+
+#ifdef __KERNEL_GPU__
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+
+#else
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ }
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ (void)current_kernel; \
+ }
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
new file mode 100644
index 00000000000..41dd1bfcdbf
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -0,0 +1,163 @@
+
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/************************************ Path State *****************************/
+
+KERNEL_STRUCT_BEGIN(path)
+/* Index of a pixel within the device render buffer where this path will write its result.
+ * To get an actual offset within the buffer the value needs to be multiplied by the
+ * `kernel_data.film.pass_stride`.
+ *
+ * The multiplication is delayed for later, so that state can use 32bit integer. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
+/* Current sample number. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current diffuse ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current glossy ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transmission ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume bounds ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel bit indicating queued kernels.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* Random number generator seed. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING)
+/* Random number dimension offset. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_offset, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Multiple importance sampling
+ * The PDF of BSDF sampling at the last scatter point, and distance to the
+ * last scatter point minus the last ray segment. This distance lets us
+ * compute the complete distance through transparent surfaces and volumes. */
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING)
+/* Filter glossy. */
+KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Denoising. */
+KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+/* Shader sorting. */
+/* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(path)
+
+/************************************** Ray ***********************************/
+
+KERNEL_STRUCT_BEGIN(ray)
+KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(ray)
+
+/*************************** Intersection result ******************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(isect)
+KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_MEMBER(isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(isect)
+
+/*************** Subsurface closure state for subsurface kernel ***************/
+
+KERNEL_STRUCT_BEGIN(subsurface)
+KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, roughness, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_END(subsurface)
+
+/********************************** Volume Stack ******************************/
+
+KERNEL_STRUCT_BEGIN(volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+
+/********************************* Shadow Path State **************************/
+
+KERNEL_STRUCT_BEGIN(shadow_path)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel bit indicating queued kernels.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput for shadow pass. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Number of intersections found by ray-tracing. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_path)
+
+/********************************** Shadow Ray *******************************/
+
+KERNEL_STRUCT_BEGIN(shadow_ray)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_ray)
+
+/*********************** Shadow Intersection result **************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(shadow_isect)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE)
+
+/**************************** Shadow Volume Stack *****************************/
+
+KERNEL_STRUCT_BEGIN(shadow_volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
new file mode 100644
index 00000000000..cdf412fe22f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/kernel_differential.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray */
+
+ccl_device_forceinline void integrator_state_write_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(ray, dP) = ray->dP;
+ INTEGRATOR_STATE_WRITE(ray, dD) = ray->dD;
+}
+
+ccl_device_forceinline void integrator_state_read_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(ray, P);
+ ray->D = INTEGRATOR_STATE(ray, D);
+ ray->t = INTEGRATOR_STATE(ray, t);
+ ray->time = INTEGRATOR_STATE(ray, time);
+ ray->dP = INTEGRATOR_STATE(ray, dP);
+ ray->dD = INTEGRATOR_STATE(ray, dD);
+}
+
+/* Shadow Ray */
+
+ccl_device_forceinline void integrator_state_write_shadow_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(shadow_ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(shadow_ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(shadow_ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(shadow_ray, dP) = ray->dP;
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(shadow_ray, P);
+ ray->D = INTEGRATOR_STATE(shadow_ray, D);
+ ray->t = INTEGRATOR_STATE(shadow_ray, t);
+ ray->time = INTEGRATOR_STATE(shadow_ray, time);
+ ray->dP = INTEGRATOR_STATE(shadow_ray, dP);
+ ray->dD = differential_zero_compact();
+}
+
+/* Intersection */
+
+ccl_device_forceinline void integrator_state_write_isect(INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect)
+{
+ INTEGRATOR_STATE_WRITE(isect, t) = isect->t;
+ INTEGRATOR_STATE_WRITE(isect, u) = isect->u;
+ INTEGRATOR_STATE_WRITE(isect, v) = isect->v;
+ INTEGRATOR_STATE_WRITE(isect, object) = isect->object;
+ INTEGRATOR_STATE_WRITE(isect, prim) = isect->prim;
+ INTEGRATOR_STATE_WRITE(isect, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_WRITE(isect, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect)
+{
+ isect->prim = INTEGRATOR_STATE(isect, prim);
+ isect->object = INTEGRATOR_STATE(isect, object);
+ isect->type = INTEGRATOR_STATE(isect, type);
+ isect->u = INTEGRATOR_STATE(isect, u);
+ isect->v = INTEGRATOR_STATE(isect, v);
+ isect->t = INTEGRATOR_STATE(isect, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE(isect, Ng);
+#endif
+}
+
+ccl_device_forceinline VolumeStack integrator_state_read_volume_stack(INTEGRATOR_STATE_CONST_ARGS,
+ int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline void integrator_state_write_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, shader) = entry.shader;
+}
+
+ccl_device_forceinline bool integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+/* Shadow Intersection */
+
+ccl_device_forceinline void integrator_state_write_shadow_isect(
+ INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect, const int index)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, t) = isect->t;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, u) = isect->u;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, v) = isect->v;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, object) = isect->object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, prim) = isect->prim;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect,
+ const int index)
+{
+ isect->prim = INTEGRATOR_STATE_ARRAY(shadow_isect, index, prim);
+ isect->object = INTEGRATOR_STATE_ARRAY(shadow_isect, index, object);
+ isect->type = INTEGRATOR_STATE_ARRAY(shadow_isect, index, type);
+ isect->u = INTEGRATOR_STATE_ARRAY(shadow_isect, index, u);
+ isect->v = INTEGRATOR_STATE_ARRAY(shadow_isect, index, v);
+ isect->t = INTEGRATOR_STATE_ARRAY(shadow_isect, index, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE_ARRAY(shadow_isect, index, Ng);
+#endif
+}
+
+ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) {
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, object);
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, shader);
+ }
+ }
+}
+
+ccl_device_forceinline VolumeStack
+integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_CONST_ARGS, int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty(
+ INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+ccl_device_forceinline void integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = entry.shader;
+}
+
+#if defined(__KERNEL_GPU__)
+ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ int index;
+
+ /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+# define KERNEL_STRUCT_BEGIN(name) \
+ index = 0; \
+ do {
+
+# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct.name != nullptr) { \
+ kernel_integrator_state.parent_struct.name[to_state] = \
+ kernel_integrator_state.parent_struct.name[state]; \
+ }
+
+# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+ kernel_integrator_state.parent_struct[index].name[to_state] = \
+ kernel_integrator_state.parent_struct[index].name[state]; \
+ }
+
+# define KERNEL_STRUCT_END(name) \
+ } \
+ while (false) \
+ ;
+
+# define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ ++index; \
+ } \
+ while (index < array_size) \
+ ;
+
+# include "kernel/integrator/integrator_state_template.h"
+
+# undef KERNEL_STRUCT_BEGIN
+# undef KERNEL_STRUCT_MEMBER
+# undef KERNEL_STRUCT_ARRAY_MEMBER
+# undef KERNEL_STRUCT_END
+# undef KERNEL_STRUCT_END_ARRAY
+}
+
+ccl_device_inline void integrator_state_move(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ integrator_state_copy_only(to_state, state);
+
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
+
+#endif
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. */
+ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_ARGS)
+{
+#if defined(__KERNEL_GPU__)
+ const IntegratorState to_state = atomic_fetch_and_add_uint32(
+ &kernel_integrator_state.next_shadow_catcher_path_index[0], 1);
+
+ integrator_state_copy_only(to_state, state);
+
+ kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS;
+
+ /* Sanity check: expect to split in the intersect-closest kernel, where there is no shadow ray
+ * and no sorting yet. */
+ kernel_assert(INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ kernel_assert(kernel_integrator_state.sort_key_counter[INTEGRATOR_STATE(path, queued_kernel)] ==
+ nullptr);
+#else
+
+ IntegratorStateCPU *ccl_restrict split_state = state + 1;
+
+ *split_state = *state;
+
+ split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS;
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
new file mode 100644
index 00000000000..9490738404e
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -0,0 +1,623 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/bvh/bvh.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
+#include "kernel/closure/volume.h"
+
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+
+ccl_device int subsurface_bounce(INTEGRATOR_STATE_ARGS, ShaderData *sd, const ShaderClosure *sc)
+{
+ /* We should never have two consecutive BSSRDF bounces, the second one should
+ * be converted to a diffuse BSDF to avoid this. */
+ kernel_assert(!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DIFFUSE_ANCESTOR));
+
+ /* Setup path state for intersect_subsurface kernel. */
+ const Bssrdf *bssrdf = (const Bssrdf *)sc;
+
+ /* Setup ray into surface. */
+ INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = sd->N;
+ INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_zero_compact();
+
+ /* Pass along object info, reusing isect to save memory. */
+ INTEGRATOR_STATE_WRITE(isect, Ng) = sd->Ng;
+ INTEGRATOR_STATE_WRITE(isect, object) = sd->object;
+
+ /* Pass BSSRDF parameters. */
+ const uint32_t path_flag = INTEGRATOR_STATE_WRITE(path, flag);
+ INTEGRATOR_STATE_WRITE(path, flag) = (path_flag & ~PATH_RAY_CAMERA) | PATH_RAY_SUBSURFACE;
+ INTEGRATOR_STATE_WRITE(path, throughput) *= shader_bssrdf_sample_weight(sd, sc);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(subsurface, albedo) = bssrdf->albedo;
+ INTEGRATOR_STATE_WRITE(subsurface, radius) = bssrdf->radius;
+ INTEGRATOR_STATE_WRITE(subsurface, roughness) = bssrdf->roughness;
+ INTEGRATOR_STATE_WRITE(subsurface, anisotropy) = bssrdf->anisotropy;
+
+ return LABEL_SUBSURFACE_SCATTER;
+}
+
+ccl_device void subsurface_shader_data_setup(INTEGRATOR_STATE_ARGS, ShaderData *sd)
+{
+ /* Get bump mapped normal from shader evaluation at exit point. */
+ float3 N = sd->N;
+ if (sd->flag & SD_HAS_BSSRDF_BUMP) {
+ N = shader_bssrdf_normal(sd);
+ }
+
+ /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. */
+ sd->flag &= ~SD_CLOSURE_FLAGS;
+ sd->num_closure = 0;
+ sd->num_closure_left = kernel_data.max_closures;
+
+ const float3 weight = one_float3();
+ const float roughness = INTEGRATOR_STATE(subsurface, roughness);
+
+# ifdef __PRINCIPLED__
+ if (roughness != FLT_MAX) {
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
+ sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular Disney principled diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ }
+ else
+# endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ sd->flag |= bsdf_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
+ }
+}
+
+/* Random walk subsurface scattering.
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+
+/* Support for anisotropy from:
+ * "Path Traced Subsurface Scattering using Anisotropic Phase Functions
+ * and Non-Exponential Free Flights".
+ * Magnus Wrenninge, Ryusuke Villemin, Christophe Hery.
+ * https://graphics.pixar.com/library/PathTracedSubsurface/ */
+
+ccl_device void subsurface_random_walk_remap(
+ const float albedo, const float d, float g, float *sigma_t, float *alpha)
+{
+ /* Compute attenuation and scattering coefficients from albedo. */
+ const float g2 = g * g;
+ const float g3 = g2 * g;
+ const float g4 = g3 * g;
+ const float g5 = g4 * g;
+ const float g6 = g5 * g;
+ const float g7 = g6 * g;
+
+ const float A = 1.8260523782f + -1.28451056436f * g + -1.79904629312f * g2 +
+ 9.19393289202f * g3 + -22.8215585862f * g4 + 32.0234874259f * g5 +
+ -23.6264803333f * g6 + 7.21067002658f * g7;
+ const float B = 4.98511194385f +
+ 0.127355959438f *
+ expf(31.1491581433f * g + -201.847017512f * g2 + 841.576016723f * g3 +
+ -2018.09288505f * g4 + 2731.71560286f * g5 + -1935.41424244f * g6 +
+ 559.009054474f * g7);
+ const float C = 1.09686102424f + -0.394704063468f * g + 1.05258115941f * g2 +
+ -8.83963712726f * g3 + 28.8643230661f * g4 + -46.8802913581f * g5 +
+ 38.5402837518f * g6 + -12.7181042538f * g7;
+ const float D = 0.496310210422f + 0.360146581622f * g + -2.15139309747f * g2 +
+ 17.8896899217f * g3 + -55.2984010333f * g4 + 82.065982243f * g5 +
+ -58.5106008578f * g6 + 15.8478295021f * g7;
+ const float E = 4.23190299701f +
+ 0.00310603949088f *
+ expf(76.7316253952f * g + -594.356773233f * g2 + 2448.8834203f * g3 +
+ -5576.68528998f * g4 + 7116.60171912f * g5 + -4763.54467887f * g6 +
+ 1303.5318055f * g7);
+ const float F = 2.40602999408f + -2.51814844609f * g + 9.18494908356f * g2 +
+ -79.2191708682f * g3 + 259.082868209f * g4 + -403.613804597f * g5 +
+ 302.85712436f * g6 + -87.4370473567f * g7;
+
+ const float blend = powf(albedo, 0.25f);
+
+ *alpha = (1.0f - blend) * A * powf(atanf(B * albedo), C) +
+ blend * D * powf(atanf(E * albedo), F);
+ *alpha = clamp(*alpha, 0.0f, 0.999999f); // because of numerical precision
+
+ float sigma_t_prime = 1.0f / fmaxf(d, 1e-16f);
+ *sigma_t = sigma_t_prime / (1.0f - g);
+}
+
+ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
+ const float3 radius,
+ const float anisotropy,
+ float3 *sigma_t,
+ float3 *alpha,
+ float3 *throughput)
+{
+ float sigma_t_x, sigma_t_y, sigma_t_z;
+ float alpha_x, alpha_y, alpha_z;
+
+ subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x);
+ subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y);
+ subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z);
+
+ /* Throughput already contains closure weight at this point, which includes the
+ * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo
+ * which will be added through scattering. */
+ *throughput = safe_divide_color(*throughput, albedo);
+
+ /* With low albedo values (like 0.025) we get diffusion_length 1.0 and
+ * infinite phase functions. To avoid a sharp discontinuity as we go from
+ * such values to 0.0, increase alpha and reduce the throughput to compensate. */
+ const float min_alpha = 0.2f;
+ if (alpha_x < min_alpha) {
+ (*throughput).x *= alpha_x / min_alpha;
+ alpha_x = min_alpha;
+ }
+ if (alpha_y < min_alpha) {
+ (*throughput).y *= alpha_y / min_alpha;
+ alpha_y = min_alpha;
+ }
+ if (alpha_z < min_alpha) {
+ (*throughput).z *= alpha_z / min_alpha;
+ alpha_z = min_alpha;
+ }
+
+ *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
+ *alpha = make_float3(alpha_x, alpha_y, alpha_z);
+}
+
+/* References for Dwivedi sampling:
+ *
+ * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
+ * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
+ * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
+ *
+ * [2] "Improving the Dwivedi Sampling Scheme"
+ * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
+ * https://cg.ivd.kit.edu/1951.php
+ *
+ * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
+ * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
+ * https://iliyan.com/publications/RenderingCourse2020
+ */
+
+ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
+{
+ /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
+ return 1.0f / ((v - cos_theta) * phase_log);
+}
+
+ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
+{
+ /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
+ * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
+ * we can implement the power function like this. */
+ return v - (v + 1.0f) * expf(-rand * phase_log);
+}
+
+ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
+{
+ /* Eq. 67 from [3] */
+ return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
+}
+
+ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
+{
+ float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
+ float phi = M_2PI_F * randv;
+ float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
+
+ float3 T, B;
+ make_orthonormals(D, &T, &B);
+ return dir.x * T + dir.y * B + dir.z * D;
+}
+
+ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
+ float t,
+ bool hit,
+ float3 *transmittance)
+{
+ float3 T = volume_color_transmittance(sigma_t, t);
+ if (transmittance) {
+ *transmittance = T;
+ }
+ return hit ? T : sigma_t * T;
+}
+
+/* Define the below variable to get the similarity code active,
+ * and the value represents the cutoff level */
+# define SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL 9
+
+ccl_device_inline bool subsurface_random_walk(INTEGRATOR_STATE_ARGS,
+ RNGState rng_state,
+ Ray &ray,
+ LocalIntersection &ss_isect)
+{
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+
+ const float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 N = INTEGRATOR_STATE(ray, D);
+ const float ray_dP = INTEGRATOR_STATE(ray, dP);
+ const float time = INTEGRATOR_STATE(ray, time);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const int object = INTEGRATOR_STATE(isect, object);
+
+ /* Sample diffuse surface scatter into the object. */
+ float3 D;
+ float pdf;
+ sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf);
+ if (dot(-Ng, D) <= 0.0f) {
+ return false;
+ }
+
+ /* Setup ray. */
+ ray.P = ray_offset(P, -Ng);
+ ray.D = D;
+ ray.t = FLT_MAX;
+ ray.time = time;
+ ray.dP = ray_dP;
+ ray.dD = differential_zero_compact();
+
+# ifndef __KERNEL_OPTIX__
+ /* Compute or fetch object transforms. */
+ Transform ob_itfm ccl_optional_struct_init;
+ Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm);
+# endif
+
+ /* Convert subsurface to volume coefficients.
+ * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
+ const float3 albedo = INTEGRATOR_STATE(subsurface, albedo);
+ const float3 radius = INTEGRATOR_STATE(subsurface, radius);
+ const float anisotropy = INTEGRATOR_STATE(subsurface, anisotropy);
+
+ float3 sigma_t, alpha;
+ float3 throughput = INTEGRATOR_STATE_WRITE(path, throughput);
+ subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput);
+ float3 sigma_s = sigma_t * alpha;
+
+ /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
+ * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
+ * for making the code significantly more complex and slower (if direction sampling depends on
+ * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
+ *
+ * Since the strength of the guided sampling increases as alpha gets lower, using a value that
+ * is too low results in fireflies while one that's too high just gives a bit more noise.
+ * Therefore, the code here uses the highest of the three albedos to be safe. */
+ const float diffusion_length = diffusion_length_dwivedi(max3(alpha));
+
+ if (diffusion_length == 1.0f) {
+ /* With specific values of alpha the length might become 1, which in asymptotic makes phase to
+ * be infinite. After first bounce it will cause throughput to be 0. Do early output, avoiding
+ * numerical issues and extra unneeded work. */
+ return false;
+ }
+
+ /* Precompute term for phase sampling. */
+ const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f));
+
+ /* Modify state for RNGs, decorrelated from other paths. */
+ rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
+
+ /* Random walk until we hit the surface again. */
+ bool hit = false;
+ bool have_opposite_interface = false;
+ float opposite_distance = 0.0f;
+
+ /* Todo: Disable for alpha>0.999 or so? */
+ /* Our heuristic, a compromise between guiding and classic. */
+ const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f));
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ float3 sigma_s_star = sigma_s * (1.0f - anisotropy);
+ float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star;
+ float3 sigma_t_org = sigma_t;
+ float3 sigma_s_org = sigma_s;
+ const float anisotropy_org = anisotropy;
+ const float guided_fraction_org = guided_fraction;
+# endif
+
+ for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
+ /* Advance random number offset. */
+ rng_state.rng_offset += PRNG_BOUNCE_NUM;
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ // shadow with local variables according to depth
+ float anisotropy, guided_fraction;
+ float3 sigma_s, sigma_t;
+ if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) {
+ anisotropy = anisotropy_org;
+ guided_fraction = guided_fraction_org;
+ sigma_t = sigma_t_org;
+ sigma_s = sigma_s_org;
+ }
+ else {
+ anisotropy = 0.0f;
+ guided_fraction = 0.75f; // back to isotropic heuristic from Blender
+ sigma_t = sigma_t_star;
+ sigma_s = sigma_s_star;
+ }
+# endif
+
+ /* Sample color channel, use MIS with balance heuristic. */
+ float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL);
+ float3 channel_pdf;
+ int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
+ float sample_sigma_t = volume_channel_get(sigma_t, channel);
+ float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE);
+
+ /* We need the result of the raycast to compute the full guided PDF, so just remember the
+ * relevant terms to avoid recomputing them later. */
+ float backward_fraction = 0.0f;
+ float forward_pdf_factor = 0.0f;
+ float forward_stretching = 1.0f;
+ float backward_pdf_factor = 0.0f;
+ float backward_stretching = 1.0f;
+
+ /* For the initial ray, we already know the direction, so just do classic distance sampling. */
+ if (bounce > 0) {
+ /* Decide whether we should use guided or classic sampling. */
+ bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction);
+
+ /* Determine if we want to sample away from the incoming interface.
+ * This only happens if we found a nearby opposite interface, and the probability for it
+ * depends on how close we are to it already.
+ * This probability term comes from the recorded presentation of [3]. */
+ bool guide_backward = false;
+ if (have_opposite_interface) {
+ /* Compute distance of the random walk between the tangent plane at the starting point
+ * and the assumed opposite interface (the parallel plane that contains the point we
+ * found in our ray query for the opposite side). */
+ float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance);
+ backward_fraction = 1.0f /
+ (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length));
+ guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction;
+ }
+
+ /* Sample scattering direction. */
+ float scatter_u, scatter_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v);
+ float cos_theta;
+ float hg_pdf;
+ if (guided) {
+ cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
+ /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
+ * sign here is enough to sample from that instead. */
+ if (guide_backward) {
+ cos_theta = -cos_theta;
+ }
+ float3 newD = direction_from_cosine(N, cos_theta, scatter_v);
+ hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy);
+ ray.D = newD;
+ }
+ else {
+ float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf);
+ cos_theta = dot(newD, N);
+ ray.D = newD;
+ }
+
+ /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
+ * Since phase sampling is channel-independent, we can get away with applying a factor
+ * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
+ * it cancel with an equivalent term in the numerator of the full estimator.
+ * For the backward PDF, we again reuse the same probability distribution with a sign swap.
+ */
+ forward_pdf_factor = M_1_2PI_F * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta) /
+ hg_pdf;
+ backward_pdf_factor = M_1_2PI_F *
+ eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta) / hg_pdf;
+
+ /* Prepare distance sampling.
+ * For the backwards case, this also needs the sign swapped since now directions against
+ * sd->N (and therefore with negative cos_theta) are preferred. */
+ forward_stretching = (1.0f - cos_theta / diffusion_length);
+ backward_stretching = (1.0f + cos_theta / diffusion_length);
+ if (guided) {
+ sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
+ }
+ }
+
+ /* Sample direction along ray. */
+ float t = -logf(1.0f - randt) / sample_sigma_t;
+
+ /* On the first bounce, we use the raycast to check if the opposite side is nearby.
+ * If yes, we will later use backwards guided sampling in order to have a decent
+ * chance of connecting to it.
+ * Todo: Maybe use less than 10 times the mean free path? */
+ ray.t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
+ scene_intersect_local(kg, &ray, &ss_isect, object, NULL, 1);
+ hit = (ss_isect.num_hits > 0);
+
+ if (hit) {
+# ifdef __KERNEL_OPTIX__
+ /* t is always in world space with OptiX. */
+ ray.t = ss_isect.hits[0].t;
+# else
+ /* Compute world space distance to surface hit. */
+ float3 D = transform_direction(&ob_itfm, ray.D);
+ D = normalize(D) * ss_isect.hits[0].t;
+ ray.t = len(transform_direction(&ob_tfm, D));
+# endif
+ }
+
+ if (bounce == 0) {
+ /* Check if we hit the opposite side. */
+ if (hit) {
+ have_opposite_interface = true;
+ opposite_distance = dot(ray.P + ray.t * ray.D - P, -N);
+ }
+ /* Apart from the opposite side check, we were supposed to only trace up to distance t,
+ * so check if there would have been a hit in that case. */
+ hit = ray.t < t;
+ }
+
+ /* Use the distance to the exit point for the throughput update if we found one. */
+ if (hit) {
+ t = ray.t;
+ }
+ else if (bounce == 0) {
+ /* Restore original position if nothing was hit after the first bounce,
+ * without the ray_offset() that was added to avoid self-intersection.
+ * Otherwise if that offset is relatively large compared to the scattering
+ * radius, we never go back up high enough to exit the surface. */
+ ray.P = P;
+ }
+
+ /* Advance to new scatter location. */
+ ray.P += t * ray.D;
+
+ float3 transmittance;
+ float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
+ if (bounce > 0) {
+ /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
+ float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
+
+ if (have_opposite_interface) {
+ /* First step of MIS: Depending on geometry we might have two methods for guided
+ * sampling, so perform MIS between them. */
+ float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
+ guided_pdf = mix(
+ guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
+ }
+ else {
+ /* Just include phase sampling factor otherwise. */
+ guided_pdf *= forward_pdf_factor;
+ }
+
+ /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
+ pdf = mix(pdf, guided_pdf, guided_fraction);
+ }
+
+ /* Finally, we're applying MIS again to combine the three color channels.
+ * Altogether, the MIS computation combines up to nine different estimators:
+ * {classic, guided, backward_guided} x {r, g, b} */
+ throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
+
+ if (hit) {
+ /* If we hit the surface, we are done. */
+ break;
+ }
+ else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.y < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.z < VOLUME_THROUGHPUT_EPSILON) {
+ /* Avoid unnecessary work and precision issue when throughput gets really small. */
+ break;
+ }
+ }
+
+ if (hit) {
+ kernel_assert(isfinite3_safe(throughput));
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
+ }
+
+ return hit;
+}
+
+ccl_device_inline bool subsurface_scatter(INTEGRATOR_STATE_ARGS)
+{
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ Ray ray ccl_optional_struct_init;
+ LocalIntersection ss_isect ccl_optional_struct_init;
+
+ if (!subsurface_random_walk(INTEGRATOR_STATE_PASS, rng_state, ray, ss_isect)) {
+ return false;
+ }
+
+# ifdef __VOLUME__
+ /* Update volume stack if needed. */
+ if (kernel_data.integrator.use_volumes) {
+ const int object = intersection_get_object(kg, &ss_isect.hits[0]);
+ const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) {
+ float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const float3 offset_P = ray_offset(P, -Ng);
+
+ integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_PASS, offset_P, ray.P);
+ }
+ }
+# endif /* __VOLUME__ */
+
+ /* Pretend ray is coming from the outside towards the exit point. This ensures
+ * correct front/back facing normals.
+ * TODO: find a more elegant solution? */
+ ray.P += ray.D * ray.t * 2.0f;
+ ray.D = -ray.D;
+
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &ss_isect.hits[0]);
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Advanced random number offset for bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
+
+ const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ shader);
+ }
+
+ return true;
+}
+
+#endif /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h
new file mode 100644
index 00000000000..d53070095f0
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Volume Stack
+ *
+ * This is an array of object/shared ID's that the current segment of the path
+ * is inside of. */
+
+template<typename StackReadOp, typename StackWriteOp>
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ StackReadOp stack_read,
+ StackWriteOp stack_write)
+{
+ /* todo: we should have some way for objects to indicate if they want the
+ * world shader to work inside them. excluding it by default is problematic
+ * because non-volume objects can't be assumed to be closed manifolds */
+ if (!(sd->flag & SD_HAS_VOLUME)) {
+ return;
+ }
+
+ if (sd->flag & SD_BACKFACING) {
+ /* Exit volume object: remove from stack. */
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ if (entry.object == sd->object) {
+ /* Shift back next stack entries. */
+ do {
+ entry = stack_read(i + 1);
+ stack_write(i, entry);
+ i++;
+ } while (entry.shader != SHADER_NONE);
+
+ return;
+ }
+ }
+ }
+ else {
+ /* Enter volume object: add to stack. */
+ int i;
+ for (i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ /* Already in the stack? then we have nothing to do. */
+ if (entry.object == sd->object) {
+ return;
+ }
+ }
+
+ /* If we exceed the stack limit, ignore. */
+ if (i >= VOLUME_STACK_SIZE - 1) {
+ return;
+ }
+
+ /* Add to the end of the stack. */
+ const VolumeStack new_entry = {sd->object, sd->shader};
+ const VolumeStack empty_entry = {OBJECT_NONE, SHADER_NONE};
+ stack_write(i, new_entry);
+ stack_write(i + 1, empty_entry);
+ }
+}
+
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) { return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+ccl_device void shadow_volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+/* Clean stack after the last bounce.
+ *
+ * It is expected that all volumes are closed manifolds, so at the time when ray
+ * hits nothing (for example, it is a last bounce which goes to environment) the
+ * only expected volume in the stack is the world's one. All the rest volume
+ * entries should have been exited already.
+ *
+ * This isn't always true because of ray intersection precision issues, which
+ * could lead us to an infinite non-world volume in the stack, causing render
+ * artifacts.
+ *
+ * Use this function after the last bounce to get rid of all volumes apart from
+ * the world's one after the last bounce to avoid render artifacts.
+ */
+ccl_device_inline void volume_stack_clean(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ /* Keep the world's volume in stack. */
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
+ }
+ else {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = SHADER_NONE;
+ }
+}
+
+template<typename StackReadOp>
+ccl_device float volume_stack_step_size(INTEGRATOR_STATE_ARGS, StackReadOp stack_read)
+{
+ float step_size = FLT_MAX;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ bool heterogeneous = false;
+
+ if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
+ heterogeneous = true;
+ }
+ else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
+ /* We want to render world or objects without any volume grids
+ * as homogeneous, but can only verify this at run-time since other
+ * heterogeneous volume objects may be using the same shader. */
+ int object = entry.object;
+ if (object != OBJECT_NONE) {
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
+ heterogeneous = true;
+ }
+ }
+ }
+
+ if (heterogeneous) {
+ float object_step_size = object_volume_step_size(kg, entry.object);
+ object_step_size *= kernel_data.integrator.volume_step_rate;
+ step_size = fminf(object_step_size, step_size);
+ }
+ }
+
+ return step_size;
+}
+
+typedef enum VolumeSampleMethod {
+ VOLUME_SAMPLE_NONE = 0,
+ VOLUME_SAMPLE_DISTANCE = (1 << 0),
+ VOLUME_SAMPLE_EQUIANGULAR = (1 << 1),
+ VOLUME_SAMPLE_MIS = (VOLUME_SAMPLE_DISTANCE | VOLUME_SAMPLE_EQUIANGULAR),
+} VolumeSampleMethod;
+
+ccl_device VolumeSampleMethod volume_stack_sample_method(INTEGRATOR_STATE_ARGS)
+{
+ VolumeSampleMethod method = VOLUME_SAMPLE_NONE;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ if (shader_flag & SD_VOLUME_MIS) {
+ /* Multiple importance sampling. */
+ return VOLUME_SAMPLE_MIS;
+ }
+ else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_DISTANCE) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Only equiangular sampling. */
+ method = VOLUME_SAMPLE_EQUIANGULAR;
+ }
+ else {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_EQUIANGULAR) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Distance sampling only. */
+ method = VOLUME_SAMPLE_DISTANCE;
+ }
+ }
+
+ return method;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 61653d328f1..9e12d24dcf4 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -14,751 +14,501 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_adaptive_sampling.h"
+#include "kernel_random.h"
+#include "kernel_shadow_catcher.h"
+#include "kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-/* BSDF Eval
+/* --------------------------------------------------------------------
+ * BSDF Evaluation
*
- * BSDF evaluation result, split per BSDF type. This is used to accumulate
- * render passes separately. */
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd);
+ * BSDF evaluation result, split between diffuse and glossy. This is used to
+ * accumulate render passes separately. Note that reflection, transmission
+ * and volume scattering are written to different render passes, but we assume
+ * that only one of those can happen at a bounce, and so do not need to accumulate
+ * them separately. */
-ccl_device_inline void bsdf_eval_init(BsdfEval *eval,
- ClosureType type,
- float3 value,
- int use_light_pass)
+ccl_device_inline void bsdf_eval_init(BsdfEval *eval, const bool is_diffuse, float3 value)
{
-#ifdef __PASSES__
- eval->use_light_pass = use_light_pass;
-
- if (eval->use_light_pass) {
- eval->diffuse = zero_float3();
- eval->glossy = zero_float3();
- eval->transmission = zero_float3();
- eval->transparent = zero_float3();
- eval->volume = zero_float3();
-
- if (type == CLOSURE_BSDF_TRANSPARENT_ID)
- eval->transparent = value;
- else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse = value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy = value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission = value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume = value;
- }
- else
-#endif
- {
+ eval->diffuse = zero_float3();
+ eval->glossy = zero_float3();
+
+ if (is_diffuse) {
eval->diffuse = value;
}
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis = zero_float3();
-#endif
+ else {
+ eval->glossy = value;
+ }
}
ccl_device_inline void bsdf_eval_accum(BsdfEval *eval,
- ClosureType type,
+ const bool is_diffuse,
float3 value,
float mis_weight)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis += value;
-#endif
value *= mis_weight;
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse += value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy += value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission += value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume += value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse += value;
- }
-}
-ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
-{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) &&
- is_zero(eval->transparent) && is_zero(eval->volume);
+ if (is_diffuse) {
+ eval->diffuse += value;
}
- else
-#endif
- {
- return is_zero(eval->diffuse);
+ else {
+ eval->glossy += value;
}
}
-ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
+ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse *= value;
- }
+ return is_zero(eval->diffuse) && is_zero(eval->glossy);
}
ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
- bsdf_eval_mis(eval, value);
+ eval->diffuse *= value;
+ eval->glossy *= value;
}
ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
- eval->diffuse *= value;
-#else
eval->diffuse *= value;
-#endif
+ eval->glossy *= value;
}
ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return eval->diffuse + eval->glossy + eval->transmission + eval->volume;
- }
- else
-#endif
- return eval->diffuse;
+ return eval->diffuse + eval->glossy;
}
-/* Path Radiance
- *
- * We accumulate different render passes separately. After summing at the end
- * to get the combined result, it should be identical. We definite directly
- * visible as the first non-transparent hit, while indirectly visible are the
- * bounces after that. */
-
-ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L)
+ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(const BsdfEval *eval)
{
- /* clear all */
-#ifdef __PASSES__
- L->use_light_pass = kernel_data.film.use_light_pass;
-
- if (kernel_data.film.use_light_pass) {
- L->indirect = zero_float3();
- L->direct_emission = zero_float3();
-
- L->color_diffuse = zero_float3();
- L->color_glossy = zero_float3();
- L->color_transmission = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->transparent = 0.0f;
- L->emission = zero_float3();
- L->background = zero_float3();
- L->ao = zero_float3();
- L->shadow = zero_float3();
- L->mist = 0.0f;
-
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
- L->state.direct = zero_float3();
- }
- else
-#endif
- {
- L->transparent = 0.0f;
- L->emission = zero_float3();
- }
-
-#ifdef __SHADOW_TRICKS__
- L->path_total = zero_float3();
- L->path_total_shaded = zero_float3();
- L->shadow_background_color = zero_float3();
- L->shadow_throughput = 0.0f;
- L->shadow_transparency = 1.0f;
- L->has_shadow_catcher = 0;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- L->denoising_normal = zero_float3();
- L->denoising_albedo = zero_float3();
- L->denoising_depth = 0.0f;
-#endif
+ /* Ratio of diffuse and glossy to recover proportions for writing to render pass.
+ * We assume reflection, transmission and volume scatter to be exclusive. */
+ return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
}
-ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg,
- PathRadianceState *L_state,
- ccl_addr_space float3 *throughput,
- BsdfEval *bsdf_eval,
- float bsdf_pdf,
- int bounce,
- int bsdf_label)
-{
- float inverse_pdf = 1.0f / bsdf_pdf;
-
-#ifdef __PASSES__
- if (kernel_data.film.use_light_pass) {
- if (bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
- /* first on directly visible surface */
- float3 value = *throughput * inverse_pdf;
-
- L_state->diffuse = bsdf_eval->diffuse * value;
- L_state->glossy = bsdf_eval->glossy * value;
- L_state->transmission = bsdf_eval->transmission * value;
- L_state->volume = bsdf_eval->volume * value;
-
- *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume;
+/* --------------------------------------------------------------------
+ * Clamping
+ *
+ * Clamping is done on a per-contribution basis so that we can write directly
+ * to render buffers instead of using per-thread memory, and to avoid the
+ * impact of clamping on other contributions. */
- L_state->direct = *throughput;
- }
- else {
- /* transparent bounce before first hit, or indirectly visible through BSDF */
- float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf;
- *throughput *= sum;
- }
+ccl_device_forceinline void kernel_accum_clamp(const KernelGlobals *kg, float3 *L, int bounce)
+{
+#ifdef __KERNEL_DEBUG_NAN__
+ if (!isfinite3_safe(*L)) {
+ kernel_assert(!"Cycles sample with non-finite value detected");
}
- else
#endif
- {
- *throughput *= bsdf_eval->diffuse * inverse_pdf;
- }
-}
+ /* Make sure all components are finite, allowing the contribution to be usable by adaptive
+ * sampling convergence check, but also to make it so render result never causes issues with
+ * post-processing. */
+ *L = ensure_finite3(*L);
#ifdef __CLAMP_SAMPLE__
-ccl_device_forceinline void path_radiance_clamp(KernelGlobals *kg, float3 *L, int bounce)
-{
float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
kernel_data.integrator.sample_clamp_direct;
float sum = reduce_add(fabs(*L));
if (sum > limit) {
*L *= limit / sum;
}
+#endif
}
-ccl_device_forceinline void path_radiance_clamp_throughput(KernelGlobals *kg,
- float3 *L,
- float3 *throughput,
- int bounce)
-{
- float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
- kernel_data.integrator.sample_clamp_direct;
+/* --------------------------------------------------------------------
+ * Pass accumulation utilities.
+ */
- float sum = reduce_add(fabs(*L));
- if (sum > limit) {
- float clamp_factor = limit / sum;
- *L *= clamp_factor;
- *throughput *= clamp_factor;
- }
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
-#endif
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
-ccl_device_inline void path_radiance_accum_emission(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device_inline int kernel_accum_sample(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer,
+ int sample)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
+ if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
+ return sample;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0)
- L->emission += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
- }
+ return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
}
-ccl_device_inline void path_radiance_accum_ao(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 alpha,
- float3 bsdf,
- float3 ao)
+ccl_device void kernel_accum_adaptive_buffer(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __PASSES__
- /* Store AO pass. */
- if (L->use_light_pass && state->bounce == 0) {
- L->ao += alpha * throughput * ao;
- }
-#endif
-
-#ifdef __SHADOW_TRICKS__
- /* For shadow catcher, accumulate ratio. */
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+ /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
+ * criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
+ * Carlo global illumination" except that here it is applied per pixel and not in hierarchical
+ * tiles. */
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return;
}
-#endif
-
- float3 contribution = throughput * bsdf * ao;
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0) {
- /* Directly visible lighting. */
- L->direct_diffuse += contribution;
- }
- else {
- /* Indirectly visible lighting after BSDF bounce. */
- L->indirect += contribution;
- }
- }
- else
-#endif
- {
- L->emission += contribution;
+ const int sample = INTEGRATOR_STATE(path, sample);
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f));
}
}
-ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 bsdf)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf;
-#endif
-}
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+#ifdef __SHADOW_CATCHER__
-ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- BsdfEval *bsdf_eval,
- float3 shadow,
- float shadow_fac,
- bool is_lamp)
+/* Accumulate contribution to the Shadow Catcher pass.
+ *
+ * Returns truth if the contribution is fully handled here and is not to be added to the other
+ * passes (like combined, adaptive sampling). */
+
+ccl_device bool kernel_accum_shadow_catcher(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
}
-#endif
- float3 shaded_throughput = throughput * shadow;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- /* Compute the clamping based on the total contribution.
- * The resulting scale is then be applied to all individual components. */
- float3 full_contribution = shaded_throughput * bsdf_eval_sum(bsdf_eval);
-# ifdef __CLAMP_SAMPLE__
- path_radiance_clamp_throughput(kg, &full_contribution, &shaded_throughput, state->bounce);
-# endif
-
- if (state->bounce == 0) {
- /* directly visible lighting */
- L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse;
- L->direct_glossy += shaded_throughput * bsdf_eval->glossy;
- L->direct_transmission += shaded_throughput * bsdf_eval->transmission;
- L->direct_volume += shaded_throughput * bsdf_eval->volume;
-
- if (is_lamp) {
- L->shadow += shadow * shadow_fac;
- }
- }
- else {
- /* indirectly visible lighting after BSDF bounce */
- L->indirect += full_contribution;
- }
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution);
+ /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive
+ * sampling is based on how noisy the combined pass is as if there were no catchers in the
+ * scene. */
}
- else
-#endif
- {
- float3 contribution = shaded_throughput * bsdf_eval->diffuse;
- path_radiance_clamp(kg, &contribution, state->bounce);
- L->emission += contribution;
+
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-}
-ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- const BsdfEval *bsdf_eval)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf_eval->sum_no_mis;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf_eval;
-#endif
+ return false;
}
-ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device bool kernel_accum_shadow_catcher_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * value;
- L->path_total_shaded += throughput * value * L->shadow_transparency;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ return true;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_shadow_catcher_matte,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ /* NOTE: Accumulate the combined pass and to the samples count pass, so that the adaptive
+ * sampling is based on how noisy the combined pass is as if there were no catchers in the
+ * scene. */
+ }
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)
- L->background += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+ /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the
+ * calculation and the alpha channel of the pass contains numbers of samples contributed to a
+ * pixel of the pass. */
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-#ifdef __DENOISING_FEATURES__
- L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput *
- value;
-#endif /* __DENOISING_FEATURES__ */
+ return false;
}
-ccl_device_inline void path_radiance_accum_transparent(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device void kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
- L->transparent += average(throughput);
-}
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_accum_shadowcatcher(PathRadiance *L,
- float3 throughput,
- float3 background)
-{
- L->shadow_throughput += average(throughput);
- L->shadow_background_color += throughput * background;
- L->has_shadow_catcher = 1;
-}
-#endif
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- /* this division is a bit ugly, but means we only have to keep track of
- * only a single throughput further along the path, here we recover just
- * the indirect path that is not influenced by any particular BSDF type */
- if (L->use_light_pass) {
- L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
- L->direct_diffuse += L->state.diffuse * L->direct_emission;
- L->direct_glossy += L->state.glossy * L->direct_emission;
- L->direct_transmission += L->state.transmission * L->direct_emission;
- L->direct_volume += L->state.volume * L->direct_emission;
-
- L->indirect = safe_divide_color(L->indirect, L->state.direct);
- L->indirect_diffuse += L->state.diffuse * L->indirect;
- L->indirect_glossy += L->state.glossy * L->indirect;
- L->indirect_transmission += L->state.transmission * L->indirect;
- L->indirect_volume += L->state.volume * L->indirect;
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent);
}
-#endif
}
-ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
+#endif /* __SHADOW_CATCHER__ */
+
+/* --------------------------------------------------------------------
+ * Render passes.
+ */
- L->direct_emission = zero_float3();
- L->indirect = zero_float3();
+/* Write combined pass. */
+ccl_device_inline void kernel_accum_combined_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
+{
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher(INTEGRATOR_STATE_PASS, contribution, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution);
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, const PathRadiance *L_src)
+/* Write combined pass with transparency. */
+ccl_device_inline void kernel_accum_combined_transparent_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict
+ buffer)
{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state = L_src->state;
-
- L->direct_emission = L_src->direct_emission;
- L->indirect = L_src->indirect;
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher_transparent(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_combined,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
- PathRadiance *L,
- float3 *L_sum,
- float *alpha)
+/* Write background or emission to appropriate pass. */
+ccl_device_inline void kernel_accum_emission_or_background_pass(INTEGRATOR_STATE_CONST_ARGS,
+ float3 contribution,
+ ccl_global float *ccl_restrict
+ buffer,
+ const int pass)
{
- /* Calculate current shadow of the path. */
- float path_total = average(L->path_total);
- float shadow;
+ if (!(kernel_data.film.light_pass_flag & PASS_ANY)) {
+ return;
+ }
- if (UNLIKELY(!isfinite_safe(path_total))) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite total radiance along the path");
-# endif
- shadow = 0.0f;
+#ifdef __PASSES__
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ /* Denoising albedo. */
+# ifdef __DENOISING_FEATURES__
+ if (path_flag & PATH_RAY_DENOISING_FEATURES) {
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = denoising_feature_throughput * contribution;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
}
- else if (path_total == 0.0f) {
- shadow = L->shadow_transparency;
+# endif /* __DENOISING_FEATURES__ */
+
+ if (!(path_flag & PATH_RAY_ANY_PASS)) {
+ /* Directly visible, write to emission or background pass. */
+ pass_offset = pass;
+ }
+ else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+ /* Indirectly visible through reflection. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ }
}
- else {
- float path_total_shaded = average(L->path_total_shaded);
- shadow = path_total_shaded / path_total;
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- /* Calculate final light sum and transparency for shadow catcher object. */
- if (kernel_data.background.transparent) {
- *alpha -= L->shadow_throughput * shadow;
- }
- else {
- L->shadow_background_color *= shadow;
- *L_sum += L->shadow_background_color;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
}
+#endif /* __PASSES__ */
}
-#endif
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
- PathRadiance *L,
- float *alpha)
+/* Write light contribution to render buffer. */
+ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
{
- float3 L_sum;
- /* Light Passes are used */
+ /* The throughput for shadow paths already contains the light shader evaluation. */
+ float3 contribution = INTEGRATOR_STATE(shadow_path, throughput);
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1);
+
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
+
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+
#ifdef __PASSES__
- float3 L_direct, L_indirect;
- if (L->use_light_pass) {
- path_radiance_sum_indirect(L);
-
- L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume +
- L->emission;
- L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission +
- L->indirect_volume;
-
- if (!kernel_data.background.transparent)
- L_direct += L->background;
-
- L_sum = L_direct + L_indirect;
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-
- /* Reject invalid value */
- if (!isfinite_safe(sum)) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
-# endif
- L_sum = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->emission = zero_float3();
+ if (kernel_data.film.light_pass_flag & PASS_ANY) {
+ const int path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+ /* Indirectly visible through reflection. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ }
+ }
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- }
- /* No Light Passes */
- else
-#endif
- {
- L_sum = L->emission;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
+ }
- /* Reject invalid value */
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
- if (!isfinite_safe(sum)) {
-#ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
-#endif
- L_sum = zero_float3();
+ /* Write shadow pass. */
+ if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
+ (path_flag & PATH_RAY_CAMERA)) {
+ const float3 unshadowed_throughput = INTEGRATOR_STATE(shadow_path, unshadowed_throughput);
+ const float3 shadowed_throughput = INTEGRATOR_STATE(shadow_path, throughput);
+ const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) *
+ kernel_data.film.pass_shadow_scale;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow);
}
}
+#endif
+}
- /* Compute alpha. */
- *alpha = 1.0f - L->transparent;
+/* Write transparency to render buffer.
+ *
+ * Note that we accumulate transparency = 1 - alpha in the render buffer.
+ * Otherwise we'd have to write alpha on path termination, which happens
+ * in many places. */
+ccl_device_inline void kernel_accum_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- /* Add shadow catcher contributions. */
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent);
}
-#endif /* __SHADOW_TRICKS__ */
- return L_sum;
+ kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_PASS, transparent, buffer);
}
-ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
- PathRadiance *L,
- float3 *noisy,
- float3 *clean)
+/* Write background contribution to render buffer.
+ *
+ * Includes transparency, matching kernel_accum_transparent. */
+ccl_device_inline void kernel_accum_background(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 L,
+ const float transparent,
+ const bool is_transparent_background_ray,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __PASSES__
- kernel_assert(L->use_light_pass);
-
- *clean = L->emission + L->background;
- *noisy = L->direct_volume + L->indirect_volume;
-
-# define ADD_COMPONENT(flag, component) \
- if (kernel_data.film.denoising_flags & flag) \
- *clean += component; \
- else \
- *noisy += component;
-
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
-# undef ADD_COMPONENT
-#else
- *noisy = L->emission;
- *clean = zero_float3();
-#endif
+ float3 contribution = INTEGRATOR_STATE(path, throughput) * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- *noisy += L->shadow_background_color;
- }
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- *noisy = ensure_finite3(*noisy);
- *clean = ensure_finite3(*clean);
+ if (is_transparent_background_ray) {
+ kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
+ }
+ else {
+ kernel_accum_combined_transparent_pass(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer);
+ }
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_background);
}
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
+/* Write emission to render buffer. */
+ccl_device_inline void kernel_accum_emission(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 throughput,
+ const float3 L,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __SPLIT_KERNEL__
-# define safe_float3_add(f, v) \
- do { \
- ccl_global float *p = (ccl_global float *)(&(f)); \
- atomic_add_and_fetch_float(p + 0, (v).x); \
- atomic_add_and_fetch_float(p + 1, (v).y); \
- atomic_add_and_fetch_float(p + 2, (v).z); \
- } while (0)
-# define safe_float_add(f, v) atomic_add_and_fetch_float(&(f), (v))
-#else
-# define safe_float3_add(f, v) (f) += (v)
-# define safe_float_add(f, v) (f) += (v)
-#endif /* __SPLIT_KERNEL__ */
+ float3 contribution = throughput * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __PASSES__
- safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
- safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
- safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
- safe_float3_add(L->direct_volume, L_sample->direct_volume);
-
- safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
- safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
- safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
- safe_float3_add(L->indirect_volume, L_sample->indirect_volume);
-
- safe_float3_add(L->background, L_sample->background);
- safe_float3_add(L->ao, L_sample->ao);
- safe_float3_add(L->shadow, L_sample->shadow);
- safe_float_add(L->mist, L_sample->mist);
-#endif /* __PASSES__ */
- safe_float3_add(L->emission, L_sample->emission);
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#undef safe_float_add
-#undef safe_float3_add
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_emission);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
index 98b7bf7e7dc..2bee12f0473 100644
--- a/intern/cycles/kernel/kernel_adaptive_sampling.h
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -14,226 +14,146 @@
* limitations under the License.
*/
-#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
-#define __KERNEL_ADAPTIVE_SAMPLING_H__
+#pragma once
+
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+/* Check whether the pixel has converged and should not be sampled anymore. */
-ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample)
+ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *render_buffer)
{
- /* TODO Stefan: Is this better in linear, sRGB or something else? */
- float4 I = *((ccl_global float4 *)buffer);
- float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- /* The per pixel error as seen in section 2.1 of
- * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
- * A small epsilon is added to the divisor to prevent division by zero. */
- float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
- (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
- if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
- /* Set the fourth component to non-zero value to indicate that this pixel has converged. */
- buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return true;
}
-}
-
-/* Adjust the values of an adaptively sampled pixel. */
-
-ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_multiplier)
-{
- *(ccl_global float4 *)(buffer) *= sample_multiplier;
- /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */
- kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset;
-#ifdef __PASSES__
- int flag = kernel_data.film.pass_flag;
-
- if (flag & PASSMASK(NORMAL))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ return buffer[aux_w_offset] == 0.0f;
+}
- if (flag & PASSMASK(UV))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
- if (flag & PASSMASK(MOTION)) {
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
- *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+ const int render_pixel_index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer +
+ (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+ /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+ const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ if (!reset && A.w != 0.0f) {
+ /* If the pixel was considered converged, its state will not change in this kernel. Early
+ * output before doing any math.
+ *
+ * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */
+ return true;
}
- if (kernel_data.film.use_light_pass) {
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (light_flag & PASSMASK(MIST))
- *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
-
- /* Shadow pass omitted on purpose. It has its own scale parameter. */
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
-
- if (light_flag & PASSMASK(EMISSION))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
- if (light_flag & PASSMASK(BACKGROUND))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
- if (light_flag & PASSMASK(AO))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_color) *= sample_multiplier;
- }
-#endif
-
-#ifdef __DENOISING_FEATURES__
-
-# define scale_float3_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale; \
- *(buffer + offset + 3) *= scale * scale; \
- *(buffer + offset + 4) *= scale * scale; \
- *(buffer + offset + 5) *= scale * scale;
-
-# define scale_shadow_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale * scale;
-
- if (kernel_data.film.pass_denoising_data) {
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
- if (kernel_data.film.pass_denoising_clean) {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
- }
- else {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- }
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
- 1) *= sample_multiplier * sample_multiplier;
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Cryptomatte. */
- if (kernel_data.film.cryptomatte_passes) {
- int num_slots = 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
- num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
- ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
- kernel_data.film.pass_cryptomatte);
- for (int slot = 0; slot < num_slots; slot++) {
- id_buffer[slot].y *= sample_multiplier;
- }
- }
+ const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);
- /* AOVs. */
- for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) {
- *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier;
- }
- for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) {
- *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier;
- }
+ const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
+ const float inv_sample = 1.0f / sample;
+
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
+ const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
+ inv_sample;
+ const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+ /* A small epsilon is added to the divisor to prevent division by zero. */
+ const float error = error_difference / (0.0001f + error_normalize);
+ const bool did_converge = (error < threshold);
+
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ buffer[aux_w_offset] = did_converge;
+
+ return did_converge;
}
/* This is a simple box filter in two passes.
* When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
-ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
{
- bool any = false;
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- for (int x = tile->x; x < tile->x + tile->w; ++x) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (x > tile->x && !prev) {
+ for (int x = start_x; x < start_x + width; ++x) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (x > start_x && !prev) {
index = index - 1;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
-ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- bool any = false;
- for (int y = tile->y; y < tile->y + tile->h; ++y) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (y > tile->y && !prev) {
- index = index - tile->stride;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ for (int y = start_y; y < start_y + height; ++y) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (y > start_y && !prev) {
+ index = index - stride;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 7da890b908d..e025bcd6674 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -14,502 +14,62 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BAKING__
-
-ccl_device_noinline void compute_light_pass(
- KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample)
-{
- kernel_assert(kernel_data.film.use_light_pass);
-
- float3 throughput = one_float3();
-
- /* Emission and indirect shader data memory used by various functions. */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- /* Init radiance. */
- path_radiance_init(kg, L);
-
- /* Init path state. */
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL);
-
- /* Evaluate surface shader. */
- shader_eval_surface(kg, sd, &state, NULL, state.flag);
-
- /* TODO: disable more closures we don't need besides transparent. */
- shader_bsdf_disable_transparency(kg, sd);
-
- /* Init ray. */
- Ray ray;
- ray.P = sd->P + sd->Ng;
- ray.D = -sd->Ng;
- ray.t = FLT_MAX;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched) {
- /* regular path tracer */
-# endif
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd));
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
- bool is_sss_sample = false;
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) {
- while (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput);
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
- }
- is_sss_sample = true;
- }
- }
-# endif
-
- /* sample light and BSDF */
- if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
- kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L);
-
- if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) {
-# ifdef __LAMP_MIS__
- state.ray_t = 0.0f;
-# endif
- /* compute indirect light */
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
-
- /* sum and reset indirect light pass variables for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- /* branched path tracer */
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput);
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- kernel_branched_path_subsurface_scatter(
- kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif
-
- /* sample light and BSDF */
- if (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT)) {
-# if defined(__EMISSION__)
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = kernel_data.integrator.sample_all_lights_direct;
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, &state, throughput, 1.0f, L, all);
- }
-# endif
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L);
- }
- }
-# endif
-}
-
-/* this helps with AA but it's not the real solution as it does not AA the geometry
- * but it's better than nothing, thus committed */
-ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
-{
- /* use mirror repeat (like opengl texture) so that if the barycentric
- * coordinate goes past the end of the triangle it is not always clamped
- * to the same value, gives ugly patterns */
- u /= max;
- float fu = floorf(u);
- u = u - fu;
-
- return ((((int)fu) & 1) ? 1.0f - u : u) * max;
-}
-
-ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderEvalType type)
-{
- switch (type) {
- case SHADER_EVAL_DIFFUSE:
- return shader_bsdf_diffuse(kg, sd);
- case SHADER_EVAL_GLOSSY:
- return shader_bsdf_glossy(kg, sd);
- case SHADER_EVAL_TRANSMISSION:
- return shader_bsdf_transmission(kg, sd);
- default:
- kernel_assert(!"Unknown bake type passed to BSDF evaluate");
- return zero_float3();
- }
-}
-
-ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- float3 direct,
- float3 indirect,
- const ShaderEvalType type,
- const int pass_filter)
-{
- float3 color;
- const bool is_color = (pass_filter & BAKE_FILTER_COLOR) != 0;
- const bool is_direct = (pass_filter & BAKE_FILTER_DIRECT) != 0;
- const bool is_indirect = (pass_filter & BAKE_FILTER_INDIRECT) != 0;
- float3 out = zero_float3();
-
- if (is_color) {
- if (is_direct || is_indirect) {
- /* Leave direct and diffuse channel colored. */
- color = one_float3();
- }
- else {
- /* surface color of the pass only */
- shader_eval_surface(kg, sd, state, NULL, 0);
- return kernel_bake_shader_bsdf(kg, sd, type);
- }
- }
- else {
- shader_eval_surface(kg, sd, state, NULL, 0);
- color = kernel_bake_shader_bsdf(kg, sd, type);
- }
-
- if (is_direct) {
- out += safe_divide_even_color(direct, color);
- }
-
- if (is_indirect) {
- out += safe_divide_even_color(indirect, color);
- }
-
- return out;
-}
-
-ccl_device void kernel_bake_evaluate(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* Setup render buffers. */
- const int index = offset + x + y * stride;
- const int pass_stride = kernel_data.film.pass_stride;
- buffer += index * pass_stride;
-
- ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
- ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
- ccl_global float *output = buffer + kernel_data.film.pass_combined;
-
- int seed = __float_as_uint(primitive[0]);
- int prim = __float_as_uint(primitive[1]);
- if (prim == -1)
- return;
-
- prim += kernel_data.bake.tri_offset;
-
- /* Random number generator. */
- uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
- int num_samples = kernel_data.integrator.aa_samples;
-
- float filter_x, filter_y;
- if (sample == 0) {
- filter_x = filter_y = 0.5f;
- }
- else {
- path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
- }
-
- /* Barycentric UV with sub-pixel offset. */
- float u = primitive[2];
- float v = primitive[3];
-
- float dudx = differential[0];
- float dudy = differential[1];
- float dvdx = differential[2];
- float dvdy = differential[3];
-
- if (sample > 0) {
- u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
- v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
- 1.0f - u);
- }
-
- /* Shader data setup. */
- int object = kernel_data.bake.object_index;
- int shader;
- float3 P, Ng;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- ShaderData sd;
- shader_setup_from_sample(
- kg,
- &sd,
- P,
- Ng,
- Ng,
- shader,
- object,
- prim,
- u,
- v,
- 1.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
- sd.I = sd.N;
-
- /* Setup differentials. */
- sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx;
- sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy;
- sd.du.dx = dudx;
- sd.du.dy = dudy;
- sd.dv.dx = dvdx;
- sd.dv.dy = dvdy;
-
- /* Set RNG state for shaders that use sampling. */
- PathState state = {0};
- state.rng_hash = rng_hash;
- state.rng_offset = 0;
- state.sample = sample;
- state.num_samples = num_samples;
- state.min_ray_pdf = FLT_MAX;
-
- /* Light passes if we need more than color. */
- PathRadiance L;
- int pass_filter = kernel_data.bake.pass_filter;
-
- if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR)
- compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
-
- float3 out = zero_float3();
-
- ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type;
- switch (type) {
- /* data passes */
- case SHADER_EVAL_NORMAL:
- case SHADER_EVAL_ROUGHNESS:
- case SHADER_EVAL_EMISSION: {
- if (type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) {
- int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0;
- shader_eval_surface(kg, &sd, &state, NULL, path_flag);
- }
-
- if (type == SHADER_EVAL_NORMAL) {
- float3 N = sd.N;
- if (sd.flag & SD_HAS_BUMP) {
- N = shader_bsdf_average_normal(kg, &sd);
- }
+#pragma once
- /* encoding: normal = (2 * color) - 1 */
- out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
- }
- else if (type == SHADER_EVAL_ROUGHNESS) {
- float roughness = shader_bsdf_average_roughness(&sd);
- out = make_float3(roughness, roughness, roughness);
- }
- else {
- out = shader_emissive_eval(&sd);
- }
- break;
- }
- case SHADER_EVAL_UV: {
- out = primitive_uv(kg, &sd);
- break;
- }
-# ifdef __PASSES__
- /* light passes */
- case SHADER_EVAL_AO: {
- out = L.ao;
- break;
- }
- case SHADER_EVAL_COMBINED: {
- if ((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
- float alpha;
- out = path_radiance_clamp_and_sum(kg, &L, &alpha);
- break;
- }
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
- if ((pass_filter & BAKE_FILTER_DIFFUSE_DIRECT) == BAKE_FILTER_DIFFUSE_DIRECT)
- out += L.direct_diffuse;
- if ((pass_filter & BAKE_FILTER_DIFFUSE_INDIRECT) == BAKE_FILTER_DIFFUSE_INDIRECT)
- out += L.indirect_diffuse;
+#include "kernel/geom/geom.h"
- if ((pass_filter & BAKE_FILTER_GLOSSY_DIRECT) == BAKE_FILTER_GLOSSY_DIRECT)
- out += L.direct_glossy;
- if ((pass_filter & BAKE_FILTER_GLOSSY_INDIRECT) == BAKE_FILTER_GLOSSY_INDIRECT)
- out += L.indirect_glossy;
-
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_DIRECT) == BAKE_FILTER_TRANSMISSION_DIRECT)
- out += L.direct_transmission;
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT)
- out += L.indirect_transmission;
-
- if ((pass_filter & BAKE_FILTER_EMISSION) != 0)
- out += L.emission;
-
- break;
- }
- case SHADER_EVAL_SHADOW: {
- out = L.shadow;
- break;
- }
- case SHADER_EVAL_DIFFUSE: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_diffuse, L.indirect_diffuse, type, pass_filter);
- break;
- }
- case SHADER_EVAL_GLOSSY: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_glossy, L.indirect_glossy, type, pass_filter);
- break;
- }
- case SHADER_EVAL_TRANSMISSION: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter);
- break;
- }
-# endif
-
- /* extra */
- case SHADER_EVAL_ENVIRONMENT: {
- /* setup ray */
- Ray ray;
-
- ray.P = zero_float3();
- ray.D = normalize(P);
- ray.t = 0.0f;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-# endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
-
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- out = shader_background_eval(&sd);
- break;
- }
- default: {
- /* no real shader, returning the position of the verts for debugging */
- out = normalize(P);
- break;
- }
- }
-
- /* write output */
- const float4 result = make_float4(out.x, out.y, out.z, 1.0f);
- kernel_write_pass_float4(output, result);
-}
-
-#endif /* __BAKING__ */
+CCL_NAMESPACE_BEGIN
-ccl_device void kernel_displace_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_displace_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
+ /* Setup shader data. */
+ const KernelShaderEvalInput in = input[offset];
- /* setup shader data */
- int object = in.x;
- int prim = in.y;
- float u = __uint_as_float(in.z);
- float v = __uint_as_float(in.w);
-
- shader_setup_from_displace(kg, &sd, object, prim, u, v);
+ ShaderData sd;
+ shader_setup_from_displace(kg, &sd, in.object, in.prim, in.u, in.v);
- /* evaluate */
- float3 P = sd.P;
- shader_eval_displacement(kg, &sd, &state);
+ /* Evaluate displacement shader. */
+ const float3 P = sd.P;
+ shader_eval_displacement(INTEGRATOR_STATE_PASS_NULL, &sd);
float3 D = sd.P - P;
object_inverse_dir_transform(kg, &sd, &D);
- /* write output */
- output[i] += make_float4(D.x, D.y, D.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
}
-ccl_device void kernel_background_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_background_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
-
- /* setup ray */
- Ray ray;
- float u = __uint_as_float(in.x);
- float v = __uint_as_float(in.y);
-
- ray.P = zero_float3();
- ray.D = equirectangular_to_direction(u, v);
- ray.t = 0.0f;
-#ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-#endif
+ /* Setup ray */
+ const KernelShaderEvalInput in = input[offset];
+ const float3 ray_P = zero_float3();
+ const float3 ray_D = equirectangular_to_direction(in.u, in.v);
+ const float ray_time = 0.5f;
-#ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-#endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
+ /* Setup shader data. */
+ ShaderData sd;
+ shader_setup_from_background(kg, &sd, ray_P, ray_D, ray_time);
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- float3 color = shader_background_eval(&sd);
+ /* Evaluate shader.
+ * This is being evaluated for all BSDFs, so path flag does not contain a specific type. */
+ const int path_flag = PATH_RAY_EMISSION;
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag);
+ const float3 color = shader_background_eval(&sd);
- /* write output */
- output[i] += make_float4(color.x, color.y, color.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 1bfac37158d..7be5da8fe6d 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -14,6 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_differential.h"
+#include "kernel_lookup_table.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Perspective Camera */
@@ -39,7 +46,7 @@ ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u
return bokeh;
}
-ccl_device void camera_sample_perspective(KernelGlobals *kg,
+ccl_device void camera_sample_perspective(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -113,10 +120,14 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
float3 Dcenter = transform_direction(&cameratoworld, Pcamera);
-
- ray->dP = differential3_zero();
- ray->dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - normalize(Dcenter);
- ray->dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - normalize(Dcenter);
+ float3 Dcenter_normalized = normalize(Dcenter);
+
+ /* TODO: can this be optimized to give compact differentials directly? */
+ ray->dP = differential_zero_compact();
+ differential3 dD;
+ dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - Dcenter_normalized;
+ dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - Dcenter_normalized;
+ ray->dD = differential_make_compact(dD);
#endif
}
else {
@@ -143,8 +154,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dx = normalize(transform_direction(&cameratoworld, Dx));
spherical_stereo_transform(&kernel_data.cam, &Px, &Dx);
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = Pnostereo;
float3 Dy = transform_perspective(&rastertocamera,
@@ -152,8 +165,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dy = normalize(transform_direction(&cameratoworld, Dy));
spherical_stereo_transform(&kernel_data.cam, &Py, &Dy);
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
}
@@ -162,8 +177,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
float z_inv = 1.0f / normalize(Pcamera).z;
float nearclip = kernel_data.cam.nearclip * z_inv;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = kernel_data.cam.cliplength * z_inv;
#else
ray->t = FLT_MAX;
@@ -171,7 +185,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
}
/* Orthographic Camera */
-ccl_device void camera_sample_orthographic(KernelGlobals *kg,
+ccl_device void camera_sample_orthographic(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -220,10 +234,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
/* ray differential */
- ray->dP.dx = float4_to_float3(kernel_data.cam.dx);
- ray->dP.dy = float4_to_float3(kernel_data.cam.dy);
+ differential3 dP;
+ dP.dx = float4_to_float3(kernel_data.cam.dx);
+ dP.dy = float4_to_float3(kernel_data.cam.dx);
- ray->dD = differential3_zero();
+ ray->dP = differential_make_compact(dP);
+ ray->dD = differential_zero_compact();
#endif
#ifdef __CAMERA_CLIPPING__
@@ -323,8 +339,9 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Px, &Dx);
}
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
float3 Dy = panorama_to_direction(cam, Py.x, Py.y);
@@ -334,16 +351,17 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Py, &Dy);
}
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
#ifdef __CAMERA_CLIPPING__
/* clipping */
float nearclip = cam->nearclip;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = cam->cliplength;
#else
ray->t = FLT_MAX;
@@ -352,7 +370,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
/* Common */
-ccl_device_inline void camera_sample(KernelGlobals *kg,
+ccl_device_inline void camera_sample(const KernelGlobals *ccl_restrict kg,
int x,
int y,
float filter_u,
@@ -426,13 +444,13 @@ ccl_device_inline void camera_sample(KernelGlobals *kg,
/* Utilities */
-ccl_device_inline float3 camera_position(KernelGlobals *kg)
+ccl_device_inline float3 camera_position(const KernelGlobals *kg)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
return make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
}
-ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_distance(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
@@ -446,7 +464,7 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_z_depth(const KernelGlobals *kg, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
Transform worldtocamera = kernel_data.cam.worldtocamera;
@@ -459,7 +477,7 @@ ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P)
+ccl_device_inline float3 camera_direction_from_point(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
@@ -473,7 +491,7 @@ ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P
}
}
-ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
+ccl_device_inline float3 camera_world_to_ndc(const KernelGlobals *kg, ShaderData *sd, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
/* perspective / ortho */
diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h
index 5eb1bdad02e..960774e0741 100644
--- a/intern/cycles/kernel/kernel_color.h
+++ b/intern/cycles/kernel/kernel_color.h
@@ -14,25 +14,22 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COLOR_H__
-#define __KERNEL_COLOR_H__
+#pragma once
#include "util/util_color.h"
CCL_NAMESPACE_BEGIN
-ccl_device float3 xyz_to_rgb(KernelGlobals *kg, float3 xyz)
+ccl_device float3 xyz_to_rgb(const KernelGlobals *kg, float3 xyz)
{
return make_float3(dot(float4_to_float3(kernel_data.film.xyz_to_r), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_g), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_b), xyz));
}
-ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c)
+ccl_device float linear_rgb_to_gray(const KernelGlobals *kg, float3 c)
{
return dot(c, float4_to_float3(kernel_data.film.rgb_to_y));
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COLOR_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
deleted file mode 100644
index 4a9304a134c..00000000000
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_COMPAT_OPENCL_H__
-#define __KERNEL_COMPAT_OPENCL_H__
-
-#define __KERNEL_GPU__
-#define __KERNEL_OPENCL__
-
-/* no namespaces in opencl */
-#define CCL_NAMESPACE_BEGIN
-#define CCL_NAMESPACE_END
-
-#ifdef __CL_NOINLINE__
-# define ccl_noinline __attribute__((noinline))
-#else
-# define ccl_noinline
-#endif
-
-/* in opencl all functions are device functions, so leave this empty */
-#define ccl_device
-#define ccl_device_inline ccl_device
-#define ccl_device_forceinline ccl_device
-#define ccl_device_noinline ccl_device ccl_noinline
-#define ccl_device_noinline_cpu ccl_device
-#define ccl_may_alias
-#define ccl_static_constant static __constant
-#define ccl_constant __constant
-#define ccl_global __global
-#define ccl_local __local
-#define ccl_local_param __local
-#define ccl_private __private
-#define ccl_restrict restrict
-#define ccl_ref
-#define ccl_align(n) __attribute__((aligned(n)))
-#define ccl_optional_struct_init
-
-#if __OPENCL_VERSION__ >= 200 && !defined(__NV_CL_C_VERSION)
-# define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1)))
-#else
-# define ccl_loop_no_unroll
-#endif
-
-#ifdef __SPLIT_KERNEL__
-# define ccl_addr_space __global
-#else
-# define ccl_addr_space
-#endif
-
-#define ATTR_FALLTHROUGH
-
-#define ccl_local_id(d) get_local_id(d)
-#define ccl_global_id(d) get_global_id(d)
-
-#define ccl_local_size(d) get_local_size(d)
-#define ccl_global_size(d) get_global_size(d)
-
-#define ccl_group_id(d) get_group_id(d)
-#define ccl_num_groups(d) get_num_groups(d)
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
-/* no assert in opencl */
-#define kernel_assert(cond)
-
-/* make_type definitions with opencl style element initializers */
-#ifdef make_float2
-# undef make_float2
-#endif
-#ifdef make_float3
-# undef make_float3
-#endif
-#ifdef make_float4
-# undef make_float4
-#endif
-#ifdef make_int2
-# undef make_int2
-#endif
-#ifdef make_int3
-# undef make_int3
-#endif
-#ifdef make_int4
-# undef make_int4
-#endif
-#ifdef make_uchar4
-# undef make_uchar4
-#endif
-
-#define make_float2(x, y) ((float2)(x, y))
-#define make_float3(x, y, z) ((float3)(x, y, z))
-#define make_float4(x, y, z, w) ((float4)(x, y, z, w))
-#define make_int2(x, y) ((int2)(x, y))
-#define make_int3(x, y, z) ((int3)(x, y, z))
-#define make_int4(x, y, z, w) ((int4)(x, y, z, w))
-#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w))
-
-/* math functions */
-#define __uint_as_float(x) as_float(x)
-#define __float_as_uint(x) as_uint(x)
-#define __int_as_float(x) as_float(x)
-#define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)(x)), ((float)(y)))
-#define fabsf(x) fabs(((float)(x)))
-#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
-#define asinf(x) asin(((float)(x)))
-#define acosf(x) acos(((float)(x)))
-#define atanf(x) atan(((float)(x)))
-#define floorf(x) floor(((float)(x)))
-#define ceilf(x) ceil(((float)(x)))
-#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
-#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
-#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
-#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
-#define fmodf(x, y) fmod((float)(x), (float)(y))
-#define sinhf(x) sinh(((float)(x)))
-#define coshf(x) cosh(((float)(x)))
-#define tanhf(x) tanh(((float)(x)))
-
-/* Use native functions with possibly lower precision for performance,
- * no issues found so far. */
-#if 1
-# define sinf(x) native_sin(((float)(x)))
-# define cosf(x) native_cos(((float)(x)))
-# define tanf(x) native_tan(((float)(x)))
-# define expf(x) native_exp(((float)(x)))
-# define sqrtf(x) native_sqrt(((float)(x)))
-# define logf(x) native_log(((float)(x)))
-# define rcp(x) native_recip(x)
-#else
-# define sinf(x) sin(((float)(x)))
-# define cosf(x) cos(((float)(x)))
-# define tanf(x) tan(((float)(x)))
-# define expf(x) exp(((float)(x)))
-# define sqrtf(x) sqrt(((float)(x)))
-# define logf(x) log(((float)(x)))
-# define rcp(x) recip(x)
-#endif
-
-/* data lookup defines */
-#define kernel_data (*kg->data)
-#define kernel_tex_array(tex) \
- ((const ccl_global tex##_t *)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))
-#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)]
-
-/* define NULL */
-#ifndef NULL
-# define NULL ((void *)0)
-#endif
-
-/* enable extensions */
-#ifdef __KERNEL_CL_KHR_FP16__
-# pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif
-
-#include "util/util_half.h"
-#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPENCL_H__ */
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index 3ec0cdbaccc..db4e110bd10 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -14,26 +14,28 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* See "Tracing Ray Differentials", Homan Igehy, 1999. */
-ccl_device void differential_transfer(ccl_addr_space differential3 *dP_,
- const differential3 dP,
- float3 D,
- const differential3 dD,
- float3 Ng,
- float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *surface_dP,
+ const differential3 ray_dP,
+ float3 ray_D,
+ const differential3 ray_dD,
+ float3 surface_Ng,
+ float ray_t)
{
/* ray differential transfer through homogeneous medium, to
* compute dPdx/dy at a shading point from the incoming ray */
- float3 tmp = D / dot(D, Ng);
- float3 tmpx = dP.dx + t * dD.dx;
- float3 tmpy = dP.dy + t * dD.dy;
+ float3 tmp = ray_D / dot(ray_D, surface_Ng);
+ float3 tmpx = ray_dP.dx + ray_t * ray_dD.dx;
+ float3 tmpy = ray_dP.dy + ray_t * ray_dD.dy;
- dP_->dx = tmpx - dot(tmpx, Ng) * tmp;
- dP_->dy = tmpy - dot(tmpy, Ng) * tmp;
+ surface_dP->dx = tmpx - dot(tmpx, surface_Ng) * tmp;
+ surface_dP->dy = tmpy - dot(tmpy, surface_Ng) * tmp;
}
ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
@@ -112,4 +114,53 @@ ccl_device differential3 differential3_zero()
return d;
}
+/* Compact ray differentials that are just a scale to reduce memory usage and
+ * access cost in GPU.
+ *
+ * See above for more accurate reference implementations.
+ *
+ * TODO: also store the more compact version in ShaderData and recompute where
+ * needed? */
+
+ccl_device_forceinline float differential_zero_compact()
+{
+ return 0.0f;
+}
+
+ccl_device_forceinline float differential_make_compact(const differential3 D)
+{
+ return 0.5f * (len(D.dx) + len(D.dy));
+}
+
+ccl_device_forceinline void differential_transfer_compact(ccl_addr_space differential3 *surface_dP,
+ const float ray_dP,
+ const float3 /* ray_D */,
+ const float ray_dD,
+ const float3 surface_Ng,
+ const float ray_t)
+{
+ /* ray differential transfer through homogeneous medium, to
+ * compute dPdx/dy at a shading point from the incoming ray */
+ float scale = ray_dP + ray_t * ray_dD;
+
+ float3 dx, dy;
+ make_orthonormals(surface_Ng, &dx, &dy);
+ surface_dP->dx = dx * scale;
+ surface_dP->dy = dy * scale;
+}
+
+ccl_device_forceinline void differential_incoming_compact(ccl_addr_space differential3 *dI,
+ const float3 D,
+ const float dD)
+{
+ /* compute dIdx/dy at a shading point, we just need to negate the
+ * differential of the ray direction */
+
+ float3 dx, dy;
+ make_orthonormals(D, &dx, &dy);
+
+ dI->dx = dD * dx;
+ dI->dy = dD * dy;
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index aebf2ec8e28..d62285d173d 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -14,40 +14,36 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
CCL_NAMESPACE_BEGIN
-/* Direction Emission */
-ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- float3 I,
- differential3 dI,
- float t,
- float time)
+/* Evaluate shader on light. */
+ccl_device_noinline_cpu float3 light_sample_shader_eval(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict emission_sd,
+ LightSample *ccl_restrict ls,
+ float time)
{
/* setup shading at emitter */
float3 eval = zero_float3();
if (shader_constant_emission_eval(kg, ls->shader, &eval)) {
- if ((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
+ if ((ls->prim != PRIM_NONE) && dot(ls->Ng, ls->D) > 0.0f) {
ls->Ng = -ls->Ng;
}
}
else {
/* Setup shader data and call shader_eval_surface once, better
* for GPU coherence and compile times. */
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
#ifdef __BACKGROUND_MIS__
if (ls->type == LIGHT_BACKGROUND) {
- Ray ray;
- ray.D = ls->D;
- ray.P = ls->P;
- ray.t = 1.0f;
- ray.time = time;
- ray.dP = differential3_zero();
- ray.dD = dI;
-
- shader_setup_from_background(kg, emission_sd, &ray);
+ shader_setup_from_background(kg, emission_sd, ls->P, ls->D, time);
}
else
#endif
@@ -56,13 +52,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
emission_sd,
ls->P,
ls->Ng,
- I,
+ -ls->D,
ls->shader,
ls->object,
ls->prim,
ls->u,
ls->v,
- t,
+ ls->t,
time,
false,
ls->lamp);
@@ -70,11 +66,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
ls->Ng = emission_sd->Ng;
}
+ PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+ PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+
/* No proper path flag, we're evaluating this for all closures. that's
* weak but we'd have to do multiple evaluations otherwise. */
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, NULL, PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS, emission_sd, NULL, PATH_RAY_EMISSION);
/* Evaluate closures. */
#ifdef __BACKGROUND_MIS__
@@ -98,85 +96,129 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
return eval;
}
-ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- Ray *ray,
- BsdfEval *eval,
- bool *is_lamp,
- float rand_terminate)
+/* Test if light sample is from a light or emission from geometry. */
+ccl_device_inline bool light_sample_is_light(const LightSample *ccl_restrict ls)
{
- if (ls->pdf == 0.0f)
- return false;
-
- /* todo: implement */
- differential3 dD = differential3_zero();
+ /* return if it's a lamp for shadow pass */
+ return (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
+}
- /* evaluate closure */
+/* Early path termination of shadow rays. */
+ccl_device_inline bool light_sample_terminate(const KernelGlobals *ccl_restrict kg,
+ const LightSample *ccl_restrict ls,
+ BsdfEval *ccl_restrict eval,
+ const float rand_terminate)
+{
+ if (bsdf_eval_is_zero(eval)) {
+ return true;
+ }
- float3 light_eval = direct_emissive_eval(
- kg, emission_sd, ls, state, -ls->D, dD, ls->t, sd->time);
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ float probability = max3(fabs(bsdf_eval_sum(eval))) *
+ kernel_data.integrator.light_inv_rr_threshold;
+ if (probability < 1.0f) {
+ if (rand_terminate >= probability) {
+ return true;
+ }
+ bsdf_eval_mul(eval, 1.0f / probability);
+ }
+ }
- if (is_zero(light_eval))
- return false;
+ return false;
+}
- /* evaluate BSDF at shading point */
+/* This function should be used to compute a modified ray start position for
+ * rays leaving from a surface. The algorithm slightly distorts flat surface
+ * of a triangle. Surface is lifted by amount h along normal n in the incident
+ * point. */
-#ifdef __VOLUME__
- if (sd->prim != PRIM_NONE)
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
+ccl_device_inline float3 shadow_ray_smooth_surface_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 Ng)
+{
+ float3 V[3], N[3];
+ triangle_vertices_and_normals(kg, sd->prim, V, N);
+
+ const float u = sd->u, v = sd->v;
+ const float w = 1 - u - v;
+ float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
+ float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
+
+ object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
+
+ /* Parabolic approximation */
+ float a = dot(N[2] - N[0], V[0] - V[2]);
+ float b = dot(N[2] - N[1], V[1] - V[2]);
+ float c = dot(N[1] - N[0], V[1] - V[0]);
+ float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
+
+ /* Check flipped normals */
+ if (dot(n, Ng) > 0) {
+ /* Local linear envelope */
+ float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
+ float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
+ float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
+ h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
+ h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
+ h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
+ h = max(min(min(h0, h1), h2), h * 0.5f);
+ }
else {
- float bsdf_pdf;
- shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
- if (ls->shader & SHADER_USE_MIS) {
- /* Multiple importance sampling. */
- float mis_weight = power_heuristic(ls->pdf, bsdf_pdf);
- light_eval *= mis_weight;
- }
+ float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
+ float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
+ float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
+ h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
+ h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
+ h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
+ h = min(-min(min(h0, h1), h2), h * 0.5f);
}
-#else
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
-#endif
- bsdf_eval_mul3(eval, light_eval / ls->pdf);
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls->shader & SHADER_EXCLUDE_ANY) {
- if (ls->shader & SHADER_EXCLUDE_DIFFUSE)
- eval->diffuse = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_GLOSSY)
- eval->glossy = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_TRANSMIT)
- eval->transmission = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_SCATTER)
- eval->volume = zero_float3();
- }
-#endif
+ return n * h;
+}
- if (bsdf_eval_is_zero(eval))
- return false;
+/* Ray offset to avoid shadow terminator artifact. */
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f
-#ifdef __SHADOW_TRICKS__
- && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
-#endif
- ) {
- float probability = max3(fabs(bsdf_eval_sum(eval))) *
- kernel_data.integrator.light_inv_rr_threshold;
- if (probability < 1.0f) {
- if (rand_terminate >= probability) {
- return false;
+ccl_device_inline float3 shadow_ray_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 L)
+{
+ float NL = dot(sd->N, L);
+ bool transmit = (NL < 0.0f);
+ float3 Ng = (transmit ? -sd->Ng : sd->Ng);
+ float3 P = ray_offset(sd->P, Ng);
+
+ if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
+ const float offset_cutoff =
+ kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
+ /* Do ray offset (heavy stuff) only for close to be terminated triangles:
+ * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also
+ * make a smooth transition near the threshold. */
+ if (offset_cutoff > 0.0f) {
+ float NgL = dot(Ng, L);
+ float offset_amount = 0.0f;
+ if (NL < offset_cutoff) {
+ offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
+ }
+ else {
+ offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
+ }
+ if (offset_amount > 0.0f) {
+ P += shadow_ray_smooth_surface_offset(kg, sd, Ng) * offset_amount;
}
- bsdf_eval_mul(eval, 1.0f / probability);
}
}
+ return P;
+}
+
+ccl_device_inline void shadow_ray_setup(const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
+{
if (ls->shader & SHADER_CAST_SHADOW) {
/* setup ray */
- ray->P = ray_offset_shadow(kg, sd, ls->D);
+ ray->P = P;
if (ls->t == FLT_MAX) {
/* distant light */
@@ -185,160 +227,40 @@ ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
}
else {
/* other lights, avoid self-intersection */
- ray->D = ray_offset(ls->P, ls->Ng) - ray->P;
+ ray->D = ray_offset(ls->P, ls->Ng) - P;
ray->D = normalize_len(ray->D, &ray->t);
}
-
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
}
else {
/* signal to not cast shadow ray */
+ ray->P = zero_float3();
+ ray->D = zero_float3();
ray->t = 0.0f;
}
- /* return if it's a lamp for shadow pass */
- *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
-
- return true;
+ ray->dP = differential_make_compact(sd->dP);
+ ray->dD = differential_zero_compact();
+ ray->time = sd->time;
}
-/* Indirect Primitive Emission */
-
-ccl_device_noinline_cpu float3 indirect_primitive_emission(
- KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_surface_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ Ray *ray)
{
- /* evaluate emissive closure */
- float3 L = shader_emissive_eval(sd);
-
-#ifdef __HAIR__
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
- (sd->type & PRIMITIVE_ALL_TRIANGLE))
-#else
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
-#endif
- {
- /* multiple importance sampling, get triangle light pdf,
- * and compute weight with respect to BSDF pdf */
- float pdf = triangle_light_pdf(kg, sd, t);
- float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
- return L * mis_weight;
- }
-
- return L;
+ const float3 P = shadow_ray_offset(kg, sd, ls->D);
+ shadow_ray_setup(sd, ls, P, ray);
}
-/* Indirect Lamp Emission */
-
-ccl_device_noinline_cpu void indirect_lamp_emission(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- Ray *ray,
- float3 throughput)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_volume_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
{
- for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
- LightSample ls ccl_optional_struct_init;
-
- if (!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls))
- continue;
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls.shader & SHADER_EXCLUDE_ANY) {
- if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- continue;
- }
-#endif
-
- float3 lamp_L = direct_emissive_eval(
- kg, emission_sd, &ls, state, -ray->D, ray->dD, ls.t, ray->time);
-
-#ifdef __VOLUME__
- if (state->volume_stack[0].shader != SHADER_NONE) {
- /* shadow attenuation */
- Ray volume_ray = *ray;
- volume_ray.t = ls.t;
- float3 volume_tp = one_float3();
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp);
- lamp_L *= volume_tp;
- }
-#endif
-
- if (!(state->flag & PATH_RAY_MIS_SKIP)) {
- /* multiple importance sampling, get regular light pdf,
- * and compute weight with respect to BSDF pdf */
- float mis_weight = power_heuristic(state->ray_pdf, ls.pdf);
- lamp_L *= mis_weight;
- }
-
- path_radiance_accum_emission(kg, L, state, throughput, lamp_L);
- }
-}
-
-/* Indirect Background */
-
-ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ccl_addr_space Ray *ray)
-{
-#ifdef __BACKGROUND__
- int shader = kernel_data.background.surface_shader;
-
- /* Use visibility flag to skip lights. */
- if (shader & SHADER_EXCLUDE_ANY) {
- if (((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
- ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- return zero_float3();
- }
-
- /* Evaluate background shader. */
- float3 L = zero_float3();
- if (!shader_constant_emission_eval(kg, shader, &L)) {
-# ifdef __SPLIT_KERNEL__
- Ray priv_ray = *ray;
- shader_setup_from_background(kg, emission_sd, &priv_ray);
-# else
- shader_setup_from_background(kg, emission_sd, ray);
-# endif
-
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, buffer, state->flag | PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
-
- L = shader_background_eval(emission_sd);
- }
-
- /* Background MIS weights. */
-# ifdef __BACKGROUND_MIS__
- /* Check if background light exists or if we should skip pdf. */
- if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
- /* multiple importance sampling, get background light pdf for ray
- * direction, and compute weight with respect to BSDF pdf */
- float pdf = background_light_pdf(kg, ray->P, ray->D);
- float mis_weight = power_heuristic(state->ray_pdf, pdf);
-
- return L * mis_weight;
- }
-# endif
-
- return L;
-#else
- return make_float3(0.8f, 0.8f, 0.8f);
-#endif
+ shadow_ray_setup(sd, ls, P, ray);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index a6fd4f1dc7e..fa93f4830d1 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -14,119 +14,516 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
-ccl_device float4 film_get_pass_result(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_scale,
- int index,
- bool use_display_sample_scale)
-{
- float4 pass_result;
-
- int display_pass_stride = kernel_data.film.display_pass_stride;
- int display_pass_components = kernel_data.film.display_pass_components;
-
- if (display_pass_components == 4) {
- float4 in = *(ccl_global float4 *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- float alpha = use_display_sample_scale ?
- (kernel_data.film.use_display_pass_alpha ? in.w : 1.0f / sample_scale) :
- 1.0f;
-
- pass_result = make_float4(in.x, in.y, in.z, alpha);
-
- int display_divide_pass_stride = kernel_data.film.display_divide_pass_stride;
- if (display_divide_pass_stride != -1) {
- ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride +
- index * kernel_data.film.pass_stride);
- float3 divided = safe_divide_even_color(float4_to_float3(pass_result),
- float4_to_float3(*divide_in));
- pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w);
- }
+/* --------------------------------------------------------------------
+ * Common utilities.
+ */
- if (kernel_data.film.use_display_exposure) {
- float exposure = kernel_data.film.exposure;
- pass_result *= make_float4(exposure, exposure, exposure, 1.0f);
- }
+/* The input buffer contains transparency = 1 - alpha, this converts it to
+ * alpha. Also clamp since alpha might end up outside of 0..1 due to Russian
+ * roulette. */
+ccl_device_forceinline float film_transparency_to_alpha(float transparency)
+{
+ return saturate(1.0f - transparency);
+}
+
+ccl_device_inline float film_get_scale(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ return 1.0f / sample_count;
+ }
+
+ return 1.0f;
+}
+
+ccl_device_inline float film_get_scale_exposure(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale_exposure;
+ }
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ if (kfilm_convert->pass_use_exposure) {
+ return scale * kfilm_convert->exposure;
+ }
+
+ return scale;
+}
+
+ccl_device_inline bool film_get_scale_and_scale_exposure(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict scale,
+ float *ccl_restrict scale_exposure)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ *scale = kfilm_convert->scale;
+ *scale_exposure = kfilm_convert->scale_exposure;
+ return true;
+ }
+
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ if (!sample_count) {
+ *scale = 0.0f;
+ *scale_exposure = 0.0f;
+ return false;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ *scale = 1.0f / sample_count;
}
- else if (display_pass_components == 1) {
- ccl_global float *in = (ccl_global float *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- pass_result = make_float4(*in, *in, *in, 1.0f / sample_scale);
+ else {
+ *scale = 1.0f;
+ }
+
+ if (kfilm_convert->pass_use_exposure) {
+ *scale_exposure = *scale * kfilm_convert->exposure;
+ }
+ else {
+ *scale_exposure = *scale;
+ }
+
+ return true;
+}
+
+/* --------------------------------------------------------------------
+ * Float (scalar) passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_depth(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
+}
+
+ccl_device_inline void film_get_pass_pixel_mist(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ /* Note that we accumulate 1 - mist in the kernel to avoid having to
+ * track the mist values in the integrator state. */
+ pixel[0] = saturate(1.0f - f * scale_exposure);
+}
+
+ccl_device_inline void film_get_pass_pixel_sample_count(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ /* TODO(sergey): Consider normalizing into the [0..1] range, so that it is possible to see
+ * meaningful value when adaptive sampler stopped rendering image way before the maximum
+ * number of samples was reached (for examples when number of samples is set to 0 in
+ * viewport). */
+
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = __float_as_uint(f) * kfilm_convert->scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = f * scale_exposure;
+}
+
+/* --------------------------------------------------------------------
+ * Float 3 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_light_path(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ /* Read light pass. */
+ const float *in = buffer + kfilm_convert->pass_offset;
+ float3 f = make_float3(in[0], in[1], in[2]);
+
+ /* Optionally add indirect light pass. */
+ if (kfilm_convert->pass_indirect != PASS_UNUSED) {
+ const float *in_indirect = buffer + kfilm_convert->pass_indirect;
+ const float3 f_indirect = make_float3(in_indirect[0], in_indirect[1], in_indirect[2]);
+ f += f_indirect;
+ }
+
+ /* Optionally divide out color. */
+ if (kfilm_convert->pass_divide != PASS_UNUSED) {
+ const float *in_divide = buffer + kfilm_convert->pass_divide;
+ const float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
+ f = safe_divide_even_color(f, f_divide);
+
+ /* Exposure only, sample scale cancels out. */
+ f *= kfilm_convert->exposure;
+ }
+ else {
+ /* Sample scale and exposure. */
+ f *= film_get_scale_exposure(kfilm_convert, buffer);
+ }
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_float3(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 f = make_float3(in[0], in[1], in[2]) * scale_exposure;
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+/* --------------------------------------------------------------------
+ * Float4 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_motion(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_motion_weight != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float *in_weight = buffer + kfilm_convert->pass_motion_weight;
+
+ const float weight = in_weight[0];
+ const float weight_inv = (weight > 0.0f) ? 1.0f / weight : 0.0f;
+
+ const float4 motion = make_float4(in[0], in[1], in[2], in[3]) * weight_inv;
+
+ pixel[0] = motion.x;
+ pixel[1] = motion.y;
+ pixel[2] = motion.z;
+ pixel[3] = motion.w;
+}
+
+ccl_device_inline void film_get_pass_pixel_cryptomatte(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float4 f = make_float4(in[0], in[1], in[2], in[3]);
+
+ /* x and z contain integer IDs, don't rescale them.
+ * y and w contain matte weights, they get scaled. */
+ pixel[0] = f.x;
+ pixel[1] = f.y * scale;
+ pixel[2] = f.z;
+ pixel[3] = f.w * scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float4(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = alpha;
+}
+
+ccl_device_inline void film_get_pass_pixel_combined(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+
+ /* 3rd channel contains transparency = 1 - alpha for the combined pass. */
+
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ pixel[0] = 0.0f;
+ pixel[1] = 0.0f;
+ pixel[2] = 0.0f;
+ pixel[3] = 0.0f;
+ return;
}
- return pass_result;
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = film_transparency_to_alpha(alpha);
}
-ccl_device float4 film_map(KernelGlobals *kg, float4 rgba_in, float scale)
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_device_inline float3
+film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- float4 result;
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
- /* Conversion to SRGB. */
- result.x = color_linear_to_srgb(rgba_in.x * scale);
- result.y = color_linear_to_srgb(rgba_in.y * scale);
- result.z = color_linear_to_srgb(rgba_in.z * scale);
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
- /* Clamp since alpha might be > 1.0 due to Russian roulette. */
- result.w = saturate(rgba_in.w * scale);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
- return result;
+ const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure;
+
+ return pixel;
}
-ccl_device uchar4 film_float_to_byte(float4 color)
+ccl_device_inline float3 safe_divide_shadow_catcher(float3 a, float3 b)
{
- uchar4 result;
+ float x, y, z;
- /* simple float to byte conversion */
- result.x = (uchar)(saturate(color.x) * 255.0f);
- result.y = (uchar)(saturate(color.y) * 255.0f);
- result.z = (uchar)(saturate(color.z) * 255.0f);
- result.w = (uchar)(saturate(color.w) * 255.0f);
+ x = (b.x != 0.0f) ? a.x / b.x : 1.0f;
+ y = (b.y != 0.0f) ? a.y / b.y : 1.0f;
+ z = (b.z != 0.0f) ? a.z / b.z : 1.0f;
- return result;
+ return make_float3(x, y, z);
}
-ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float3
+film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+ /* For the shadow catcher pass we divide combined pass by the shadow catcher.
+ * Note that denoised shadow catcher pass contains value which only needs ot be scaled (but not
+ * to be calculated as division). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ if (kfilm_convert->is_denoised) {
+ return film_calculate_shadow_catcher_denoised(kfilm_convert, buffer);
+ }
- /* map colors */
- float4 float_result = film_map(kg, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
- uchar4 uchar_result = film_float_to_byte(float_result);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_sample_count != PASS_UNUSED);
- rgba += index;
- *rgba = uchar_result;
+ /* If there is no shadow catcher object in this pixel, there is no modification of the light
+ * needed, so return one. */
+ ccl_global const float *in_catcher_sample_count =
+ buffer + kfilm_convert->pass_shadow_catcher_sample_count;
+ const float num_samples = in_catcher_sample_count[0];
+ if (num_samples == 0.0f) {
+ return one_float3();
+ }
+
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
+
+ /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
+ * shadow catcher objects in the scene. In this case there will be no auxillary passes required
+ * for the devision (to save up memory). So delay the asserts to this point so that the number of
+ * samples check handles such configuration. */
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined;
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ /* No scaling needed. The integration works in way that number of samples in the combined and
+ * shadow catcher passes are the same, and exposure is cancelled during the division. */
+ const float3 color_catcher = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]);
+ const float3 color_combined = make_float3(in_combined[0], in_combined[1], in_combined[2]);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]);
+
+ /* Need to ignore contribution of the matte object when doing division (otherwise there will be
+ * artifacts caused by anti-aliasing). Since combined pass is used for adaptive sampling and need
+ * to contain matte objects, we subtrack matte objects contribution here. This is the same as if
+ * the matte objects were not accumulated to the combined pass. */
+ const float3 combined_no_matte = color_combined - color_matte;
+
+ const float3 shadow_catcher = safe_divide_shadow_catcher(combined_no_matte, color_catcher);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+ const float transparency = in_combined[3] * scale;
+ const float alpha = film_transparency_to_alpha(transparency);
+
+ /* Alpha-over on white using transparency of the combined pass. This allows to eliminate
+ * artifacts which are happenning on an edge of a shadow catcher when using transparent film.
+ * Note that we treat shadow catcher as straight alpha here because alpha got cancelled out
+ * during the division. */
+ const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher;
+
+ return pixel;
}
-ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+ /* The approximation of the shadow is 1 - average(shadow_catcher_pass). A better approximation
+ * is possible.
+ *
+ * The matte is alpha-overed onto the shadow (which is kind of alpha-overing shadow onto footage,
+ * and then alpha-overing synthetic objects on top). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure;
+
+ const float transparency = in_matte[3] * scale;
+ const float alpha = saturate(1.0f - transparency);
+
+ const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha;
+
+ if (kfilm_convert->use_approximate_shadow_catcher_background) {
+ kernel_assert(kfilm_convert->pass_background != PASS_UNUSED);
+
+ ccl_global const float *in_background = buffer + kfilm_convert->pass_background;
+ const float3 color_background = make_float3(
+ in_background[0], in_background[1], in_background[2]) *
+ scale_exposure;
+ const float3 alpha_over = color_matte + color_background * (1.0f - alpha_matte);
+ return make_float4(alpha_over.x, alpha_over.y, alpha_over.z, 1.0f);
+ }
- ccl_global half *out = (ccl_global half *)rgba + index * 4;
- float4_store_half(out, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
+ return make_float4(color_matte.x, color_matte.y, color_matte.z, alpha_matte);
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+
+ const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 3 || kfilm_convert->num_components == 4);
+
+ const float4 pixel_value = film_calculate_shadow_catcher_matte_with_shadow(kfilm_convert,
+ buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+ if (kfilm_convert->num_components == 4) {
+ pixel[3] = pixel_value.w;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Compositing and overlays.
+ */
+
+ccl_device_inline void film_apply_pass_pixel_overlays_rgba(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ if (kfilm_convert->show_active_pixels &&
+ kfilm_convert->pass_adaptive_aux_buffer != PASS_UNUSED) {
+ if (buffer[kfilm_convert->pass_adaptive_aux_buffer + 3] == 0.0f) {
+ const float3 active_rgb = make_float3(1.0f, 0.0f, 0.0f);
+ const float3 mix_rgb = interp(make_float3(pixel[0], pixel[1], pixel[2]), active_rgb, 0.5f);
+ pixel[0] = mix_rgb.x;
+ pixel[1] = mix_rgb.y;
+ pixel[2] = mix_rgb.z;
+ }
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
deleted file mode 100644
index 70aed6d54ed..00000000000
--- a/intern/cycles/kernel/kernel_globals.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Constant Globals */
-
-#ifndef __KERNEL_GLOBALS_H__
-#define __KERNEL_GLOBALS_H__
-
-#include "kernel/kernel_profiling.h"
-
-#ifdef __KERNEL_CPU__
-# include "util/util_map.h"
-# include "util/util_vector.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "util/util_atomic.h"
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
- * the kernel, to access constant data. These are all stored as "textures", but
- * these are really just standard arrays. We can't use actually globals because
- * multiple renders may be running inside the same process. */
-
-#ifdef __KERNEL_CPU__
-
-# ifdef __OSL__
-struct OSLGlobals;
-struct OSLThreadData;
-struct OSLShadingSystem;
-# endif
-
-typedef unordered_map<float, float> CoverageMap;
-
-struct Intersection;
-struct VolumeStep;
-
-typedef struct KernelGlobals {
-# define KERNEL_TEX(type, name) texture<type> name;
-# include "kernel/kernel_textures.h"
-
- KernelData __data;
-
-# ifdef __OSL__
- /* On the CPU, we also have the OSL globals here. Most data structures are shared
- * with SVM, the difference is in the shaders and object/mesh attributes. */
- OSLGlobals *osl;
- OSLShadingSystem *osl_ss;
- OSLThreadData *osl_tdata;
-# endif
-
- /* **** Run-time data **** */
-
- /* Heap-allocated storage for transparent shadows intersections. */
- Intersection *transparent_shadow_intersections;
-
- /* Storage for decoupled volume steps. */
- VolumeStep *decoupled_volume_steps[2];
- int decoupled_volume_steps_index;
-
- /* A buffer for storing per-pixel coverage for Cryptomatte. */
- CoverageMap *coverage_object;
- CoverageMap *coverage_material;
- CoverageMap *coverage_asset;
-
- /* split kernel */
- SplitData split_data;
- SplitParams split_param_data;
-
- int2 global_size;
- int2 global_id;
-
- ProfilingState profiler;
-} KernelGlobals;
-
-#endif /* __KERNEL_CPU__ */
-
-#ifdef __KERNEL_OPTIX__
-
-typedef struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-} ShaderParams;
-
-typedef struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-} KernelParams;
-
-typedef struct KernelGlobals {
-# ifdef __VOLUME__
- VolumeState volume_state;
-# endif
- Intersection hits_stack[64];
-} KernelGlobals;
-
-extern "C" __constant__ KernelParams __params;
-
-#else /* __KERNEL_OPTIX__ */
-
-/* For CUDA, constant memory textures must be globals, so we can't put them
- * into a struct. As a result we don't actually use this struct and use actual
- * globals and simply pass along a NULL pointer everywhere, which we hope gets
- * optimized out. */
-
-# ifdef __KERNEL_CUDA__
-
-__constant__ KernelData __data;
-typedef struct KernelGlobals {
- /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
- Intersection hits_stack[64];
-} KernelGlobals;
-
-# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
-# include "kernel/kernel_textures.h"
-
-# endif /* __KERNEL_CUDA__ */
-
-#endif /* __KERNEL_OPTIX__ */
-
-/* OpenCL */
-
-#ifdef __KERNEL_OPENCL__
-
-# define KERNEL_TEX(type, name) typedef type name##_t;
-# include "kernel/kernel_textures.h"
-
-typedef ccl_addr_space struct KernelGlobals {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-
-# ifdef __SPLIT_KERNEL__
- SplitData split_data;
- SplitParams split_param_data;
-# endif
-} KernelGlobals;
-
-# define KERNEL_BUFFER_PARAMS \
- ccl_global char *buffer0, ccl_global char *buffer1, ccl_global char *buffer2, \
- ccl_global char *buffer3, ccl_global char *buffer4, ccl_global char *buffer5, \
- ccl_global char *buffer6, ccl_global char *buffer7
-
-# define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
-
-ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- kg->buffers[0] = buffer0;
- kg->buffers[1] = buffer1;
- kg->buffers[2] = buffer2;
- kg->buffers[3] = buffer3;
- kg->buffers[4] = buffer4;
- kg->buffers[5] = buffer5;
- kg->buffers[6] = buffer6;
- kg->buffers[7] = buffer7;
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- ccl_global TextureInfo *info = (ccl_global TextureInfo *)kg->buffers[0];
-
-# define KERNEL_TEX(type, name) kg->name = *(info++);
-# include "kernel/kernel_textures.h"
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-#endif /* __KERNEL_OPENCL__ */
-
-/* Interpolated lookup table access */
-
-ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
-{
- x = saturate(x) * (size - 1);
-
- int index = min(float_to_int(x), size - 1);
- int nindex = min(index + 1, size - 1);
- float t = x - index;
-
- float data0 = kernel_tex_fetch(__lookup_table, index + offset);
- if (t == 0.0f)
- return data0;
-
- float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
- return (1.0f - t) * data0 + t * data1;
-}
-
-ccl_device float lookup_table_read_2D(
- KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
-{
- y = saturate(y) * (ysize - 1);
-
- int index = min(float_to_int(y), ysize - 1);
- int nindex = min(index + 1, ysize - 1);
- float t = y - index;
-
- float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
- if (t == 0.0f)
- return data0;
-
- float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
- return (1.0f - t) * data0 + t * data1;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
index 1ca42e933d1..ed01f494f98 100644
--- a/intern/cycles/kernel/kernel_id_passes.h
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -14,8 +14,18 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
+/* Element of ID pass stored in the render buffers.
+ * It is `float2` semantically, but it must be unaligned since the offset of ID passes in the
+ * render buffers might not meet expected by compiler alignment. */
+typedef struct IDPassBufferElement {
+ float x;
+ float y;
+} IDPassBufferElement;
+
ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
int num_slots,
float id,
@@ -27,7 +37,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
}
for (int slot = 0; slot < num_slots; slot++) {
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
#ifdef __ATOMIC_PASS_WRITE__
/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
if (id_buffer[slot].x == ID_NONE) {
@@ -65,7 +75,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
{
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
for (int slot = 1; slot < num_slots; ++slot) {
if (id_buffer[slot].x == ID_NONE) {
return;
@@ -73,7 +83,7 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
/* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
int i = slot;
while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
- float2 swap = id_buffer[i];
+ const IDPassBufferElement swap = id_buffer[i];
id_buffer[i] = id_buffer[i - 1];
id_buffer[i - 1] = swap;
--i;
@@ -81,19 +91,16 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
}
}
-#ifdef __KERNEL_GPU__
/* post-sorting for Cryptomatte */
-ccl_device void kernel_cryptomatte_post(
- KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride)
+ccl_device_inline void kernel_cryptomatte_post(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
{
- if (sample - 1 == kernel_data.integrator.aa_samples) {
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
- ccl_global float *cryptomatte_buffer = buffer + index * pass_stride +
- kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
+ const int pass_stride = kernel_data.film.pass_stride;
+ const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride;
+ ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset +
+ kernel_data.film.pass_cryptomatte;
+ kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
}
-#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index f4e60a807f7..354e8115538 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -14,93 +14,27 @@
* limitations under the License.
*/
-/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
-
+#pragma once
CCL_NAMESPACE_BEGIN
-/* "Correlated Multi-Jittered Sampling"
- * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */
-
-/* TODO: find good value, suggested 64 gives pattern on cornell box ceiling. */
-#define CMJ_RANDOM_OFFSET_LIMIT 4096
-
-ccl_device_inline bool cmj_is_pow2(int i)
+ccl_device_inline uint32_t laine_karras_permutation(uint32_t x, uint32_t seed)
{
- return (i > 1) && ((i & (i - 1)) == 0);
-}
+ x += seed;
+ x ^= (x * 0x6c50b47cu);
+ x ^= x * 0xb82f1e52u;
+ x ^= x * 0xc7afe638u;
+ x ^= x * 0x8d22f6e6u;
-ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
-{
- return (a & (b - 1));
+ return x;
}
-/* b must be > 1 */
-ccl_device_inline int cmj_fast_div_pow2(int a, int b)
+ccl_device_inline uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed)
{
- kernel_assert(b > 1);
- return a >> count_trailing_zeros(b);
-}
+ x = reverse_integer_bits(x);
+ x = laine_karras_permutation(x, seed);
+ x = reverse_integer_bits(x);
-ccl_device_inline uint cmj_w_mask(uint w)
-{
- kernel_assert(w > 1);
- return ((1 << (32 - count_leading_zeros(w))) - 1);
-}
-
-ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
-{
- uint w = l - 1;
-
- if ((l & w) == 0) {
- /* l is a power of two (fast) */
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
-
- return (i + p) & w;
- }
- else {
- /* l is not a power of two (slow) */
- w = cmj_w_mask(w);
-
- do {
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
- } while (i >= l);
-
- return (i + p) % l;
- }
+ return x;
}
ccl_device_inline uint cmj_hash(uint i, uint p)
@@ -133,99 +67,101 @@ ccl_device_inline float cmj_randfloat(uint i, uint p)
return cmj_hash(i, p) * (1.0f / 4294967808.0f);
}
-#ifdef __CMJ__
-ccl_device float cmj_sample_1D(int s, int N, int p)
+ccl_device_inline float cmj_randfloat_simple(uint i, uint p)
{
- kernel_assert(s < N);
-
- uint x = cmj_permute(s, N, p * 0x68bc21eb);
- float jx = cmj_randfloat(s, p * 0x967a889b);
-
- float invN = 1.0f / N;
- return (x + jx) * invN;
+ return cmj_hash_simple(i, p) * (1.0f / (float)0xFFFFFFFF);
}
-/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
-ccl_device_inline int cmj_isqrt(int value)
+ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension)
{
-# if defined(__KERNEL_CUDA__)
- return float_to_int(__fsqrt_ru(value));
-# elif defined(__KERNEL_GPU__)
- return float_to_int(sqrtf(value));
-# else
- /* This is a work around for fast-math on CPU which might replace sqrtf()
- * with am approximated version.
- */
- return float_to_int(sqrtf(value) + 1e-6f);
-# endif
-}
+ /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D
+ * the x part is used as the sample (TODO(@leesonw): Add using both x and y parts
+ * independently). */
+
+ /* Perform Owen shuffle of the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
-{
- kernel_assert(s < N);
+ /* Based on the sample number a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
+
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
- int m = cmj_isqrt(N);
- int n = (N - 1) / m + 1;
- float invN = 1.0f / N;
- float invm = 1.0f / m;
- float invn = 1.0f / n;
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+# else
+ /* Only jitter within the grid interval. */
+ float dx = cmj_randfloat(d, rng_hash);
+# endif
+ fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES);
+ fx = fx - floorf(fx);
- s = cmj_permute(s, N, p * 0x51633e2d);
+#else
+# warning "Not using Cranley-Patterson Rotation."
+#endif
- int sdivm, smodm;
+ return fx;
+}
- if (cmj_is_pow2(m)) {
- sdivm = cmj_fast_div_pow2(s, m);
- smodm = cmj_fast_mod_pow2(s, m);
- }
- else {
- /* Doing `s * inmv` gives precision issues here. */
- sdivm = s / m;
- smodm = s - sdivm * m;
- }
+ccl_device void pmj_sample_2D(
+ const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension, float *x, float *y)
+{
+ /* Perform a shuffle on the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
- uint sx = cmj_permute(smodm, m, p * 0x68bc21eb);
- uint sy = cmj_permute(sdivm, n, p * 0x02e5be93);
+ /* Based on the sample number a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
- float jx = cmj_randfloat(s, p * 0x967a889b);
- float jy = cmj_randfloat(s, p * 0x368cc8b7);
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
+ float fy = kernel_tex_fetch(__sample_pattern_lut, index + 1);
- *fx = (sx + (sy + jx) * invn) * invm;
- *fy = (s + jy) * invN;
-}
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+ float dy = cmj_randfloat_simple(d + 1, rng_hash);
+# else
+ float dx = cmj_randfloat(d, rng_hash);
+ float dy = cmj_randfloat(d + 1, rng_hash);
+# endif
+ /* Only jitter within the grid cells. */
+ fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS);
+ fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS);
+ fx = fx - floorf(fx);
+ fy = fy - floorf(fy);
+#else
+# warning "Not using Cranley Patterson Rotation."
#endif
-ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- /* Fallback to random */
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- return cmj_randfloat(sample, p);
- }
- else {
- const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f;
- }
-}
-
-ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- const float fx = cmj_randfloat(sample, p);
- const float fy = cmj_randfloat(sample, p + 1);
- return make_float2(fx, fy);
- }
- else {
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff;
- const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f;
- const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) -
- 1.0f;
- return make_float2(fx, fy);
- }
+ (*x) = fx;
+ (*y) = fy;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 42a834d2ce3..52f641634b9 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -14,7 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
+#include "geom/geom.h"
+
#include "kernel_light_background.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+#include "kernel_types.h"
CCL_NAMESPACE_BEGIN
@@ -37,10 +44,22 @@ typedef struct LightSample {
/* Regular Light */
-ccl_device_inline bool lamp_light_sample(
- KernelGlobals *kg, int lamp, float randu, float randv, float3 P, LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_inline bool light_sample(const KernelGlobals *kg,
+ const int lamp,
+ const float randu,
+ const float randv,
+ const float3 P,
+ const int path_flag,
+ LightSample *ls)
{
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ return false;
+ }
+ }
+
LightType type = (LightType)klight->type;
ls->type = type;
ls->shader = klight->shader_id;
@@ -50,6 +69,18 @@ ccl_device_inline bool lamp_light_sample(
ls->u = randu;
ls->v = randv;
+ if (in_volume_segment && (type == LIGHT_DISTANT || type == LIGHT_BACKGROUND)) {
+ /* Distant lights in a volume get a dummy sample, position will not actually
+ * be used in that case. Only when sampling from a specific scatter position
+ * do we actually need to evaluate these. */
+ ls->P = zero_float3();
+ ls->Ng = zero_float3();
+ ls->D = zero_float3();
+ ls->pdf = true;
+ ls->t = FLT_MAX;
+ return true;
+ }
+
if (type == LIGHT_DISTANT) {
/* distant light */
float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
@@ -123,13 +154,15 @@ ccl_device_inline bool lamp_light_sample(
float invarea = fabsf(klight->area.invarea);
bool is_round = (klight->area.invarea < 0.0f);
- if (dot(ls->P - P, Ng) > 0.0f) {
- return false;
+ if (!in_volume_segment) {
+ if (dot(ls->P - P, Ng) > 0.0f) {
+ return false;
+ }
}
float3 inplane;
- if (is_round) {
+ if (is_round || in_volume_segment) {
inplane = ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
ls->P += inplane;
ls->pdf = invarea;
@@ -176,79 +209,180 @@ ccl_device_inline bool lamp_light_sample(
return (ls->pdf > 0.0f);
}
-ccl_device bool lamp_light_eval(
- KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
+ccl_device bool lights_intersect(const KernelGlobals *ccl_restrict kg,
+ const Ray *ccl_restrict ray,
+ Intersection *ccl_restrict isect,
+ const int last_prim,
+ const int last_object,
+ const int last_type,
+ const int path_flag)
{
- const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- LightType type = (LightType)klight->type;
- ls->type = type;
- ls->shader = klight->shader_id;
- ls->object = PRIM_NONE;
- ls->prim = PRIM_NONE;
- ls->lamp = lamp;
- /* todo: missing texture coordinates */
- ls->u = 0.0f;
- ls->v = 0.0f;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- if (!(ls->shader & SHADER_USE_MIS))
- return false;
+ if (path_flag & PATH_RAY_CAMERA) {
+ if (klight->shader_id & SHADER_EXCLUDE_CAMERA) {
+ continue;
+ }
+ }
+ else {
+ if (!(klight->shader_id & SHADER_USE_MIS)) {
+ continue;
+ }
+ }
- if (type == LIGHT_DISTANT) {
- /* distant light */
- float radius = klight->distant.radius;
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ continue;
+ }
+ }
- if (radius == 0.0f)
- return false;
- if (t != FLT_MAX)
- return false;
+ LightType type = (LightType)klight->type;
+ float t = 0.0f, u = 0.0f, v = 0.0f;
- /* a distant light is infinitely far away, but equivalent to a disk
- * shaped light exactly 1 unit away from the current shading point.
- *
- * radius t^2/cos(theta)
- * <----------> t = sqrt(1^2 + tan(theta)^2)
- * tan(th) area = radius*radius*pi
- * <----->
- * \ | (1 + tan(theta)^2)/cos(theta)
- * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
- * t \th| 1 simplifies to
- * \-| 1/(cos(theta)^3)
- * \| magic!
- * P
- */
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ /* Sphere light. */
+ const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ const float radius = klight->spot.radius;
+ if (radius == 0.0f) {
+ continue;
+ }
- float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float costheta = dot(-lightD, D);
- float cosangle = klight->distant.cosangle;
+ float3 P;
+ if (!ray_aligned_disk_intersect(ray->P, ray->D, ray->t, lightP, radius, &P, &t)) {
+ continue;
+ }
+ }
+ else if (type == LIGHT_AREA) {
+ /* Area light. */
+ const float invarea = fabsf(klight->area.invarea);
+ const bool is_round = (klight->area.invarea < 0.0f);
+ if (invarea == 0.0f) {
+ continue;
+ }
- if (costheta < cosangle)
- return false;
+ const float3 axisu = make_float3(
+ klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+ const float3 axisv = make_float3(
+ klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+ const float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
- ls->P = -D;
- ls->Ng = -D;
- ls->D = D;
- ls->t = FLT_MAX;
+ /* One sided. */
+ if (dot(ray->D, Ng) >= 0.0f) {
+ continue;
+ }
- /* compute pdf */
- float invarea = klight->distant.invarea;
- ls->pdf = invarea / (costheta * costheta * costheta);
- ls->eval_fac = ls->pdf;
+ const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+
+ float3 P;
+ if (!ray_quad_intersect(
+ ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) {
+ continue;
+ }
+ }
+ else {
+ continue;
+ }
+
+ if (t < isect->t &&
+ !(last_prim == lamp && last_object == OBJECT_NONE && last_type == PRIMITIVE_LAMP)) {
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
+ isect->type = PRIMITIVE_LAMP;
+ isect->prim = lamp;
+ isect->object = OBJECT_NONE;
+ }
+ }
+
+ return isect->prim != PRIM_NONE;
+}
+
+ccl_device bool light_sample_from_distant_ray(const KernelGlobals *ccl_restrict kg,
+ const float3 ray_D,
+ const int lamp,
+ LightSample *ccl_restrict ls)
+{
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ const int shader = klight->shader_id;
+ const float radius = klight->distant.radius;
+ const LightType type = (LightType)klight->type;
+
+ if (type != LIGHT_DISTANT) {
+ return false;
+ }
+ if (!(shader & SHADER_USE_MIS)) {
+ return false;
+ }
+ if (radius == 0.0f) {
+ return false;
}
- else if (type == LIGHT_POINT || type == LIGHT_SPOT) {
- float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float radius = klight->spot.radius;
+ /* a distant light is infinitely far away, but equivalent to a disk
+ * shaped light exactly 1 unit away from the current shading point.
+ *
+ * radius t^2/cos(theta)
+ * <----------> t = sqrt(1^2 + tan(theta)^2)
+ * tan(th) area = radius*radius*pi
+ * <----->
+ * \ | (1 + tan(theta)^2)/cos(theta)
+ * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
+ * t \th| 1 simplifies to
+ * \-| 1/(cos(theta)^3)
+ * \| magic!
+ * P
+ */
+
+ float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ float costheta = dot(-lightD, ray_D);
+ float cosangle = klight->distant.cosangle;
+
+ if (costheta < cosangle)
+ return false;
- /* sphere light */
- if (radius == 0.0f)
- return false;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* todo: missing texture coordinates */
+ ls->u = 0.0f;
+ ls->v = 0.0f;
+ ls->t = FLT_MAX;
+ ls->P = -ray_D;
+ ls->Ng = -ray_D;
+ ls->D = ray_D;
+
+ /* compute pdf */
+ float invarea = klight->distant.invarea;
+ ls->pdf = invarea / (costheta * costheta * costheta);
+ ls->pdf *= kernel_data.integrator.pdf_lights;
+ ls->eval_fac = ls->pdf;
- if (!ray_aligned_disk_intersect(P, D, t, lightP, radius, &ls->P, &ls->t)) {
- return false;
- }
+ return true;
+}
- ls->Ng = -D;
- ls->D = D;
+ccl_device bool light_sample_from_intersection(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect,
+ const float3 ray_P,
+ const float3 ray_D,
+ LightSample *ccl_restrict ls)
+{
+ const int lamp = isect->prim;
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ LightType type = (LightType)klight->type;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* todo: missing texture coordinates */
+ ls->t = isect->t;
+ ls->P = ray_P + ray_D * ls->t;
+ ls->D = ray_D;
+
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ ls->Ng = -ray_D;
float invarea = klight->spot.invarea;
ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
@@ -260,8 +394,9 @@ ccl_device bool lamp_light_eval(
ls->eval_fac *= spot_light_attenuation(
dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
- if (ls->eval_fac == 0.0f)
+ if (ls->eval_fac == 0.0f) {
return false;
+ }
}
float2 uv = map_to_sphere(ls->Ng);
ls->u = uv.x;
@@ -274,31 +409,22 @@ ccl_device bool lamp_light_eval(
else if (type == LIGHT_AREA) {
/* area light */
float invarea = fabsf(klight->area.invarea);
- bool is_round = (klight->area.invarea < 0.0f);
- if (invarea == 0.0f)
- return false;
float3 axisu = make_float3(
klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
float3 axisv = make_float3(
klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-
- /* one sided */
- if (dot(D, Ng) >= 0.0f)
- return false;
-
float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- if (!ray_quad_intersect(
- P, D, 0.0f, t, light_P, axisu, axisv, Ng, &ls->P, &ls->t, &ls->u, &ls->v, is_round)) {
- return false;
- }
-
- ls->D = D;
+ ls->u = isect->u;
+ ls->v = isect->v;
+ ls->D = ray_D;
ls->Ng = Ng;
+
+ const bool is_round = (klight->area.invarea < 0.0f);
if (is_round) {
- ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t);
+ ls->pdf = invarea * lamp_light_pdf(kg, Ng, -ray_D, ls->t);
}
else {
float3 sample_axisu = axisu;
@@ -306,12 +432,12 @@ ccl_device bool lamp_light_eval(
if (klight->area.tan_spread > 0.0f) {
if (!light_spread_clamp_area_light(
- P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
+ ray_P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
return false;
}
}
- ls->pdf = rect_light_sample(P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
+ ls->pdf = rect_light_sample(ray_P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
}
ls->eval_fac = 0.25f * invarea;
@@ -325,6 +451,7 @@ ccl_device bool lamp_light_eval(
}
}
else {
+ kernel_assert(!"Invalid lamp type in light_sample_from_intersection");
return false;
}
@@ -337,7 +464,7 @@ ccl_device bool lamp_light_eval(
/* returns true if the triangle is has motion blur or an instancing transform applied */
ccl_device_inline bool triangle_world_space_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 V[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 V[3])
{
bool has_motion = false;
const int object_flag = kernel_tex_fetch(__object_flag, object);
@@ -365,7 +492,7 @@ ccl_device_inline bool triangle_world_space_vertices(
return has_motion;
}
-ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
+ccl_device_inline float triangle_light_pdf_area(const KernelGlobals *kg,
const float3 Ng,
const float3 I,
float t)
@@ -379,7 +506,9 @@ ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
return t * t * pdf / cos_pi;
}
-ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+ccl_device_forceinline float triangle_light_pdf(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float t)
{
/* A naive heuristic to decide between costly solid angle sampling
* and simple area sampling, comparing the distance to the triangle plane
@@ -448,7 +577,8 @@ ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *s
}
}
-ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
+template<bool in_volume_segment>
+ccl_device_forceinline void triangle_light_sample(const KernelGlobals *kg,
int prim,
int object,
float randu,
@@ -488,7 +618,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0));
- if (longest_edge_squared > distance_to_plane * distance_to_plane) {
+ if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) {
/* see James Arvo, "Stratified Sampling of Spherical Triangles"
* http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
@@ -617,7 +747,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
/* Light Distribution */
-ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
+ccl_device int light_distribution_sample(const KernelGlobals *kg, float *randu)
{
/* This is basically std::upper_bound as used by PBRT, to find a point light or
* triangle to emit from, proportional to area. a good improvement would be to
@@ -655,51 +785,93 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
/* Generic Light */
-ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+ccl_device_inline bool light_select_reached_max_bounces(const KernelGlobals *kg,
+ int index,
+ int bounce)
{
return (bounce > kernel_tex_fetch(__lights, index).max_bounces);
}
-ccl_device_noinline bool light_sample(KernelGlobals *kg,
- int lamp,
- float randu,
- float randv,
- float time,
- float3 P,
- int bounce,
- LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_noinline bool light_distribution_sample(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
{
- if (lamp < 0) {
- /* sample index */
- int index = light_distribution_sample(kg, &randu);
-
- /* fetch light data */
- const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(
- __light_distribution, index);
- int prim = kdistribution->prim;
-
- if (prim >= 0) {
- int object = kdistribution->mesh_light.object_id;
- int shader_flag = kdistribution->mesh_light.shader_flag;
-
- triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
- ls->shader |= shader_flag;
- return (ls->pdf > 0.0f);
+ /* Sample light index from distribution. */
+ const int index = light_distribution_sample(kg, &randu);
+ const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution,
+ index);
+ const int prim = kdistribution->prim;
+
+ if (prim >= 0) {
+ /* Mesh light. */
+ const int object = kdistribution->mesh_light.object_id;
+
+ /* Exclude synthetic meshes from shadow catcher pass. */
+ if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) {
+ return false;
}
- lamp = -prim - 1;
+ const int shader_flag = kdistribution->mesh_light.shader_flag;
+ triangle_light_sample<in_volume_segment>(kg, prim, object, randu, randv, time, ls, P);
+ ls->shader |= shader_flag;
+ return (ls->pdf > 0.0f);
}
+ const int lamp = -prim - 1;
+
if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
return false;
}
- return lamp_light_sample(kg, lamp, randu, randv, P, ls);
+ return light_sample<in_volume_segment>(kg, lamp, randu, randv, P, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_volume_segment(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<true>(kg, randu, randv, time, P, bounce, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_position(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<false>(kg, randu, randv, time, P, bounce, path_flag, ls);
}
-ccl_device_inline int light_select_num_samples(KernelGlobals *kg, int index)
+ccl_device_inline bool light_distribution_sample_new_position(const KernelGlobals *kg,
+ const float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ LightSample *ls)
{
- return kernel_tex_fetch(__lights, index).samples;
+ /* Sample a new position on the same light, for volume sampling. */
+ if (ls->type == LIGHT_TRIANGLE) {
+ triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P);
+ return (ls->pdf > 0.0f);
+ }
+ else {
+ return light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls);
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h
index f0f64ce8704..493ed560bc6 100644
--- a/intern/cycles/kernel/kernel_light_background.h
+++ b/intern/cycles/kernel/kernel_light_background.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
#include "kernel_light_common.h"
CCL_NAMESPACE_BEGIN
@@ -22,7 +24,10 @@ CCL_NAMESPACE_BEGIN
#ifdef __BACKGROUND_MIS__
-ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+ccl_device float3 background_map_sample(const KernelGlobals *kg,
+ float randu,
+ float randv,
+ float *pdf)
{
/* for the following, the CDF values are actually a pair of floats, with the
* function value as X and the actual CDF as Y. The last entry's function
@@ -104,7 +109,7 @@ ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float ra
/* TODO(sergey): Same as above, after the release we should consider using
* 'noinline' for all devices.
*/
-ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
+ccl_device float background_map_pdf(const KernelGlobals *kg, float3 direction)
{
float2 uv = direction_to_equirectangular(direction);
int res_x = kernel_data.background.map_res_x;
@@ -138,7 +143,7 @@ ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
}
ccl_device_inline bool background_portal_data_fetch_and_check_side(
- KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
+ const KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
{
int portal = kernel_data.background.portal_offset + index;
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
@@ -154,7 +159,7 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(
}
ccl_device_inline float background_portal_pdf(
- KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
+ const KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
{
float portal_pdf = 0.0f;
@@ -214,7 +219,7 @@ ccl_device_inline float background_portal_pdf(
return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
}
-ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+ccl_device int background_num_possible_portals(const KernelGlobals *kg, float3 P)
{
int num_possible_portals = 0;
for (int p = 0; p < kernel_data.background.num_portals; p++) {
@@ -225,7 +230,7 @@ ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
return num_possible_portals;
}
-ccl_device float3 background_portal_sample(KernelGlobals *kg,
+ccl_device float3 background_portal_sample(const KernelGlobals *kg,
float3 P,
float randu,
float randv,
@@ -280,7 +285,7 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
return zero_float3();
}
-ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
+ccl_device_inline float3 background_sun_sample(const KernelGlobals *kg,
float randu,
float randv,
float *pdf)
@@ -292,7 +297,7 @@ ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
return D;
}
-ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
+ccl_device_inline float background_sun_pdf(const KernelGlobals *kg, float3 D)
{
const float3 N = float4_to_float3(kernel_data.background.sun);
const float angle = kernel_data.background.sun.w;
@@ -300,7 +305,7 @@ ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
}
ccl_device_inline float3
-background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+background_light_sample(const KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
@@ -400,7 +405,7 @@ background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, f
return D;
}
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+ccl_device float background_light_pdf(const KernelGlobals *kg, float3 P, float3 direction)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h
index 4a683d36226..765d8f5338e 100644
--- a/intern/cycles/kernel/kernel_light_common.h
+++ b/intern/cycles/kernel/kernel_light_common.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Area light sampling */
@@ -210,7 +214,7 @@ ccl_device bool light_spread_clamp_area_light(const float3 P,
return true;
}
-ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+ccl_device float lamp_light_pdf(const KernelGlobals *kg, const float3 Ng, const float3 I, float t)
{
float cos_pi = dot(Ng, I);
diff --git a/intern/cycles/kernel/kernel_lookup_table.h b/intern/cycles/kernel/kernel_lookup_table.h
new file mode 100644
index 00000000000..33d9d5ae1f0
--- /dev/null
+++ b/intern/cycles/kernel/kernel_lookup_table.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Interpolated lookup table access */
+
+ccl_device float lookup_table_read(const KernelGlobals *kg, float x, int offset, int size)
+{
+ x = saturate(x) * (size - 1);
+
+ int index = min(float_to_int(x), size - 1);
+ int nindex = min(index + 1, size - 1);
+ float t = x - index;
+
+ float data0 = kernel_tex_fetch(__lookup_table, index + offset);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+ccl_device float lookup_table_read_2D(
+ const KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
+{
+ y = saturate(y) * (ysize - 1);
+
+ int index = min(float_to_int(y), ysize - 1);
+ int nindex = min(index + 1, ysize - 1);
+ float t = y - index;
+
+ float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 96391db7649..3c5ab95bbc8 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_MATH_H__
-#define __KERNEL_MATH_H__
+#pragma once
#include "util/util_color.h"
#include "util/util_math.h"
@@ -24,5 +23,3 @@
#include "util/util_projection.h"
#include "util/util_texture.h"
#include "util/util_transform.h"
-
-#endif /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index ce37bd0b15e..b158f4c4fd3 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_MONTECARLO_CL__
-#define __KERNEL_MONTECARLO_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -300,5 +299,3 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_MONTECARLO_CL__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 8f58b8c3079..67466b28170 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,61 +14,52 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/geom/geom.h"
+
#include "kernel/kernel_id_passes.h"
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-#ifdef __DENOISING_FEATURES__
-
-ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- float path_total,
- float path_total_shaded)
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
{
- if (kernel_data.film.pass_denoising_data == 0)
- return;
-
- buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
- DENOISING_PASS_SHADOW_B :
- DENOISING_PASS_SHADOW_A;
-
- path_total = ensure_finite(path_total);
- path_total_shaded = ensure_finite(path_total_shaded);
-
- kernel_write_pass_float(buffer, path_total);
- kernel_write_pass_float(buffer + 1, path_total_shaded);
-
- float value = path_total_shaded / max(path_total, 1e-7f);
- kernel_write_pass_float(buffer + 2, value * value);
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
-ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- PathRadiance *L)
+#ifdef __DENOISING_FEATURES__
+
+ccl_device_forceinline void kernel_write_denoising_features_surface(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (state->denoising_feature_weight == 0.0f) {
+ if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DENOISING_FEATURES)) {
return;
}
- L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
-
/* Skip implicitly transparent surfaces. */
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return;
}
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
float3 normal = zero_float3();
float3 diffuse_albedo = zero_float3();
float3 specular_albedo = zero_float3();
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
- if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
continue;
+ }
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
@@ -106,140 +97,208 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
normal /= sum_weight;
}
- /* Transform normal into camera space. */
- const Transform worldtocamera = kernel_data.cam.worldtocamera;
- normal = transform_direction(&worldtocamera, normal);
+ if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Transform normal into camera space. */
+ const Transform worldtocamera = kernel_data.cam.worldtocamera;
+ normal = transform_direction(&worldtocamera, normal);
+
+ const float3 denoising_normal = ensure_finite3(normal);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
- L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
- L->denoising_albedo += ensure_finite3(state->denoising_feature_weight *
- state->denoising_feature_throughput * diffuse_albedo);
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput *
+ diffuse_albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
- state->denoising_feature_weight = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
}
else {
- state->denoising_feature_throughput *= specular_albedo;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) *= specular_albedo;
+ }
+}
+
+ccl_device_forceinline void kernel_write_denoising_features_volume(INTEGRATOR_STATE_ARGS,
+ const float3 albedo,
+ const bool scatter,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, denoising_feature_throughput);
+
+ if (scatter && kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Assume scatter is sufficiently diffuse to stop writing denoising features. */
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+
+ /* Write view direction as normal. */
+ const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
+
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ /* Write albedo. */
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
}
}
#endif /* __DENOISING_FEATURES__ */
-#ifdef __KERNEL_CPU__
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name)
-ccl_device_inline size_t kernel_write_id_pass_cpu(
- float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map)
+#ifdef __SHADOW_CATCHER__
+
+/* Write shadow catcher passes on a bounce from the shadow catcher object. */
+ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (map) {
- (*map)[id] += matte_weight;
- return 0;
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, sd->object_flag)) {
+ return;
}
-#else /* __KERNEL_CPU__ */
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight)
-ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer,
- size_t depth,
- float id,
- float matte_weight)
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
+ /* Count sample for the shadow catcher object. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
+
+ /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
+ * transparency to the matte. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
+ average(throughput));
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+ccl_device_inline size_t kernel_write_id_pass(float *ccl_restrict buffer,
+ size_t depth,
+ float id,
+ float matte_weight)
{
-#endif /* __KERNEL_CPU__ */
- kernel_write_id_slots(buffer, depth, id, matte_weight);
- return depth * 2;
+ kernel_write_id_slots(buffer, depth * 2, id, matte_weight);
+ return depth * 4;
}
-ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device_inline void kernel_write_data_passes(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
{
#ifdef __PASSES__
- int path_flag = state->flag;
+ const int path_flag = INTEGRATOR_STATE(path, flag);
- if (!(path_flag & PATH_RAY_CAMERA))
+ if (!(path_flag & PATH_RAY_CAMERA)) {
return;
+ }
- int flag = kernel_data.film.pass_flag;
- int light_flag = kernel_data.film.light_pass_flag;
+ const int flag = kernel_data.film.pass_flag;
- if (!((flag | light_flag) & PASS_ANY))
+ if (!(flag & PASS_ANY)) {
return;
+ }
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f ||
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
- if (state->sample == 0) {
+ if (INTEGRATOR_STATE(path, sample) == 0) {
if (flag & PASSMASK(DEPTH)) {
- float depth = camera_z_depth(kg, sd->P);
+ const float depth = camera_z_depth(kg, sd->P);
kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
}
if (flag & PASSMASK(OBJECT_ID)) {
- float id = object_pass_id(kg, sd->object);
+ const float id = object_pass_id(kg, sd->object);
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
}
if (flag & PASSMASK(MATERIAL_ID)) {
- float id = shader_pass_id(kg, sd);
+ const float id = shader_pass_id(kg, sd);
kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
}
}
+ if (flag & PASSMASK(POSITION)) {
+ const float3 position = sd->P;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position);
+ }
if (flag & PASSMASK(NORMAL)) {
- float3 normal = shader_bsdf_average_normal(kg, sd);
+ const float3 normal = shader_bsdf_average_normal(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
}
+ if (flag & PASSMASK(ROUGHNESS)) {
+ const float roughness = shader_bsdf_average_roughness(sd);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness);
+ }
if (flag & PASSMASK(UV)) {
- float3 uv = primitive_uv(kg, sd);
+ const float3 uv = primitive_uv(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
}
if (flag & PASSMASK(MOTION)) {
- float4 speed = primitive_motion_vector(kg, sd);
+ const float4 speed = primitive_motion_vector(kg, sd);
kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
}
- state->flag |= PATH_RAY_SINGLE_PASS_DONE;
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SINGLE_PASS_DONE;
}
}
if (kernel_data.film.cryptomatte_passes) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
const float matte_weight = average(throughput) *
(1.0f - average(shader_bsdf_transparency(kg, sd)));
if (matte_weight > 0.0f) {
ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- float id = object_cryptomatte_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object);
+ const float id = object_cryptomatte_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- float id = shader_cryptomatte_id(kg, sd->shader);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material);
+ const float id = shader_cryptomatte_id(kg, sd->shader);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- float id = object_cryptomatte_asset_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset);
+ const float id = object_cryptomatte_asset_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
}
}
- if (light_flag & PASSMASK_COMPONENT(DIFFUSE))
- L->color_diffuse += shader_bsdf_diffuse(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(GLOSSY))
- L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(TRANSMISSION))
- L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput;
-
- if (light_flag & PASSMASK(MIST)) {
- /* bring depth into 0..1 range */
- float mist_start = kernel_data.film.mist_start;
- float mist_inv_depth = kernel_data.film.mist_inv_depth;
+ if (flag & PASSMASK(DIFFUSE_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color,
+ shader_bsdf_diffuse(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(GLOSSY_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color,
+ shader_bsdf_glossy(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(TRANSMISSION_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
+ shader_bsdf_transmission(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(MIST)) {
+ /* Bring depth into 0..1 range. */
+ const float mist_start = kernel_data.film.mist_start;
+ const float mist_inv_depth = kernel_data.film.mist_inv_depth;
- float depth = camera_distance(kg, sd->P);
+ const float depth = camera_distance(kg, sd->P);
float mist = saturate((depth - mist_start) * mist_inv_depth);
- /* falloff */
- float mist_falloff = kernel_data.film.mist_falloff;
+ /* Falloff */
+ const float mist_falloff = kernel_data.film.mist_falloff;
if (mist_falloff == 1.0f)
;
@@ -250,158 +309,17 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
else
mist = powf(mist, mist_falloff);
- /* modulate by transparency */
- float3 alpha = shader_bsdf_alpha(kg, sd);
- L->mist += (1.0f - mist) * average(throughput * alpha);
- }
-#endif
-}
+ /* Modulate by transparency */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float3 alpha = shader_bsdf_alpha(kg, sd);
+ const float mist_output = (1.0f - mist) * average(throughput * alpha);
-ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L)
-{
-#ifdef __PASSES__
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (!kernel_data.film.use_light_pass)
- return;
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse);
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect,
- L->indirect_transmission);
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume);
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct,
- L->direct_transmission);
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume);
-
- if (light_flag & PASSMASK(EMISSION))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
- if (light_flag & PASSMASK(BACKGROUND))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background);
- if (light_flag & PASSMASK(AO))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao);
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse);
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
- L->color_transmission);
- if (light_flag & PASSMASK(SHADOW)) {
- float3 shadow = L->shadow;
- kernel_write_pass_float4(
- buffer + kernel_data.film.pass_shadow,
- make_float4(shadow.x, shadow.y, shadow.z, kernel_data.film.pass_shadow_scale));
+ /* Note that the final value in the render buffer we want is 1 - mist_output,
+ * to avoid having to tracking this in the Integrator state we do the negation
+ * after rendering. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output);
}
- if (light_flag & PASSMASK(MIST))
- kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist);
#endif
}
-ccl_device_inline void kernel_write_result(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_WRITE_RESULT);
- PROFILING_OBJECT(PRIM_NONE);
-
- float alpha;
- float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
-
- if (kernel_data.film.pass_flag & PASSMASK(COMBINED)) {
- kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
- }
-
- kernel_write_light_passes(kg, buffer, L);
-
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
-# ifdef __SHADOW_TRICKS__
- kernel_write_denoising_shadow(kg,
- buffer + kernel_data.film.pass_denoising_data,
- sample,
- average(L->path_total),
- average(L->path_total_shaded));
-# else
- kernel_write_denoising_shadow(
- kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
-# endif
- if (kernel_data.film.pass_denoising_clean) {
- float3 noisy, clean;
- path_radiance_split_denoising(kg, L, &noisy, &clean);
- kernel_write_pass_float3_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, noisy);
- kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, clean);
- }
- else {
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_COLOR,
- ensure_finite3(L_sum));
- }
-
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_NORMAL,
- L->denoising_normal);
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_ALBEDO,
- L->denoising_albedo);
- kernel_write_pass_float_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, L->denoising_depth);
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
- criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
- Carlo global illumination" except that here it is applied per pixel and not in hierarchical
- tiles. */
- if (kernel_data.film.pass_adaptive_aux_buffer &&
- kernel_data.integrator.adaptive_threshold > 0.0f) {
- if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
- make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
- }
-#ifdef __KERNEL_CPU__
- if ((sample > kernel_data.integrator.adaptive_min_samples) &&
- kernel_data.integrator.adaptive_stop_per_sample) {
- const int step = kernel_data.integrator.adaptive_step;
-
- if ((sample & (step - 1)) == (step - 1)) {
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
-#endif
- }
-
- /* Write the sample count as negative numbers initially to mark the samples as in progress.
- * Once the tile has finished rendering, the sign gets flipped and all the pixel values
- * are scaled as if they were taken at a uniform sample count. */
- if (kernel_data.film.pass_sample_count) {
- /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
- * passes. */
-#ifdef __ATOMIC_PASS_WRITE__
- atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
- 0x80000000);
-#else
- if (buffer[kernel_data.film.pass_sample_count] > 0) {
- buffer[kernel_data.film.pass_sample_count] *= -1.0f;
- }
-#endif
- kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
- }
-}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
deleted file mode 100644
index 92a097de9e1..00000000000
--- a/intern/cycles/kernel/kernel_path.h
+++ /dev/null
@@ -1,709 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-// clang-format off
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_passes.h"
-
-#if defined(__VOLUME__) || defined(__SUBSURFACE__)
-# include "kernel/kernel_volume.h"
-#endif
-
-#ifdef __SUBSURFACE__
-# include "kernel/kernel_subsurface.h"
-#endif
-
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- Intersection *isect,
- PathRadiance *L,
- const int last_object)
-{
- PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT);
-
- uint visibility = path_state_ray_visibility(kg, state);
-
- if (path_state_ao_bounce(kg, state)) {
- ray->t = kernel_data.background.ao_distance;
- if (last_object != OBJECT_NONE) {
- const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
- if (object_ao_distance != 0.0f) {
- ray->t = object_ao_distance;
- }
- }
- }
-
- bool hit = scene_intersect(kg, ray, visibility, isect);
-
- return hit;
-}
-
-ccl_device_forceinline void kernel_path_lamp_emission(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 throughput,
- ccl_addr_space Intersection *isect,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION);
-
-#ifdef __LAMP_MIS__
- if (kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray ccl_optional_struct_init;
-
- light_ray.P = ray->P - state->ray_t * ray->D;
- state->ray_t += isect->t;
- light_ray.D = ray->D;
- light_ray.t = state->ray_t;
- light_ray.time = ray->time;
- light_ray.dD = ray->dD;
- light_ray.dP = ray->dP;
-
- /* intersect with lamp */
- indirect_lamp_emission(kg, emission_sd, state, L, &light_ray, throughput);
- }
-#endif /* __LAMP_MIS__ */
-}
-
-ccl_device_forceinline void kernel_path_background(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *sd,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* eval background shader if nothing hit */
- if (kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- L->transparent += average(throughput);
-
-#ifdef __PASSES__
- if (!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND)))
-#endif /* __PASSES__ */
- return;
- }
-
- /* When using the ao bounces approximation, adjust background
- * shader intensity with ao factor. */
- if (path_state_ao_bounce(kg, state)) {
- throughput *= kernel_data.background.ao_bounces_factor;
- }
-
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, sd, state, buffer, ray);
- path_radiance_accum_background(kg, L, state, throughput, L_background);
-#endif /* __BACKGROUND__ */
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_VOLUME);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return VOLUME_PATH_ATTENUATED;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
-# ifdef __VOLUME_DECOUPLED__
- int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
- bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
- bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method);
-
- if (decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- volume_segment.sampling_method = sampling_method;
-
- /* emission */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
-
- /* scattering */
- VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
- if (volume_segment.closure_flag & SD_SCATTER) {
- int all = kernel_data.integrator.sample_all_lights_indirect;
-
- /* direct light sampling */
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect sample. if we use distance sampling and take just
- * one sample for direct and indirect light, we could share
- * this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- result = kernel_volume_decoupled_scatter(
- kg, state, &volume_ray, sd, throughput, rphase, rscatter, &volume_segment, NULL, true);
- }
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
-
- if (result == VOLUME_PATH_SCATTERED) {
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
- else {
- *throughput *= volume_segment.accum_transmittance;
- }
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
-# endif /* __VOLUME__ */
-
-#endif /* __SPLIT_KERNEL__ */
-
-ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_global float *buffer)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_APPLY);
-
-#ifdef __SHADOW_TRICKS__
- if (sd->object_flag & SD_OBJECT_SHADOW_CATCHER) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_STORE_SHADOW_INFO);
-
- float3 bg = zero_float3();
- if (!kernel_data.background.transparent) {
- bg = indirect_background(kg, emission_sd, state, NULL, ray);
- }
- path_radiance_accum_shadowcatcher(L, throughput, bg);
- }
- }
- else if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- /* Only update transparency after shadow catcher bounce. */
- L->shadow_transparency *= average(shader_bsdf_transparency(kg, sd));
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* holdout */
-#ifdef __HOLDOUT__
- if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
- (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- const float3 holdout_weight = shader_holdout_apply(kg, sd);
- if (kernel_data.background.transparent) {
- L->transparent += average(holdout_weight * throughput);
- }
- if (isequal_float3(holdout_weight, one_float3())) {
- return false;
- }
- }
-#endif /* __HOLDOUT__ */
-
- /* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, L, sd, state, throughput);
-
- /* blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy) */
- if (kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy * state->min_ray_pdf;
-
- if (blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if (sd->flag & SD_EMISSION) {
- float3 emission = indirect_primitive_emission(
- kg, sd, sd->ray_length, state->flag, state->ray_pdf);
- path_radiance_accum_emission(kg, L, state, throughput, emission);
- }
-#endif /* __EMISSION__ */
-
- return true;
-}
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- void
- kernel_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 ao_alpha)
-{
- PROFILING_INIT(kg, PROFILING_AO);
-
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
-
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(kg, L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
- }
- }
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# if defined(__BRANCHED_PATH__) || defined(__BAKING__)
-
-ccl_device void kernel_path_indirect(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- Ray *ray,
- float3 throughput,
- PathState *state,
- PathRadiance *L,
- const int last_object)
-{
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, last_object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, sd, NULL, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- shader_prepare_closures(sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, NULL)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, zero_float3());
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd->flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# if defined(__EMISSION__)
- int all = (kernel_data.integrator.sample_all_lights_indirect) ||
- (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# endif /* defined(__EMISSION__) */
-
-# ifdef __VOLUME__
- }
-# endif
-
- if (!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-# endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
-
-ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
- PathState *state,
- float3 throughput,
- Ray *ray,
- PathRadiance *L,
- ccl_global float *buffer,
- ShaderData *emission_sd)
-{
- PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE);
-
- /* Shader data memory used for both volumes and surfaces, saves stack space. */
- ShaderData sd;
-
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, sd.object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, &sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, &sd, buffer, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, &sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, &sd, state, buffer, state->flag);
- shader_prepare_closures(&sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, state, ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd.flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, &sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# ifdef __EMISSION__
- /* direct lighting */
- kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
-# endif /* __EMISSION__ */
-
-# ifdef __VOLUME__
- }
-# endif
-
- /* compute direct lighting and next bounce */
- if (!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-ccl_device void kernel_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- PROFILING_INIT(kg, PROFILING_RAY_SETUP);
-
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* Initialize random numbers and sample ray. */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- if (ray.t == 0.0f) {
- return;
- }
-
- /* Initialize state. */
- float3 throughput = one_float3();
-
- PathRadiance L;
- path_radiance_init(kg, &L);
-
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
-# ifdef __KERNEL_OPTIX__
- /* Force struct into local memory to avoid costly spilling on trace calls. */
- if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */
- for (int i = 0; i < sizeof(L); ++i)
- reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0;
-# endif
-
- /* Integrate. */
- kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd);
-
- kernel_write_result(kg, buffer, sample, &L);
-}
-
-#endif /* __SPLIT_KERNEL__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
deleted file mode 100644
index a1ee1bc107e..00000000000
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
-{
- int num_samples = kernel_data.integrator.ao_samples;
- float num_samples_inv = 1.0f / num_samples;
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
- for (int j = 0; j < num_samples; j++) {
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(
- kg, L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput * num_samples_inv, ao_bsdf);
- }
- }
- }
-}
-
-# ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- const int object = sd->object;
-
-# ifdef __VOLUME_DECOUPLED__
- /* decoupled ray marching only supported on CPU */
- if (kernel_data.integrator.volume_decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- /* direct light sampling */
- if (volume_segment.closure_flag & SD_SCATTER) {
- volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
-
- int all = kernel_data.integrator.sample_all_lights_direct;
-
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect light sampling */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = *throughput;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- /* scatter sample. if we use distance sampling and take just one
- * sample for direct and indirect light, we could share this
- * computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(
- kg, &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
- if (result == VOLUME_PATH_SCATTERED &&
- kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &pray, tp * num_samples_inv, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
- }
-
- /* emission and transmittance */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
- *throughput *= volume_segment.accum_transmittance;
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- /* todo: we should cache the shader evaluations from stepping
- * through the volume, for now we redo them multiple times */
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = (*throughput) * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, sd, &volume_ray, L, &tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: support equiangular, MIS and all light sampling.
- * alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
-
- if (kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(kg, indirect_sd, emission_sd, &pray, tp, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- /* todo: avoid this calculation using decoupled ray marching */
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
- }
-}
-# endif /* __VOLUME__ */
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- float3 throughput,
- float num_samples_adjust,
- PathState *state,
- PathRadiance *L)
-{
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (state->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- float3 tp = throughput;
- Ray bsdf_ray;
-# ifdef __SHADOW_TRICKS__
- float shadow_transparency = L->shadow_transparency;
-# endif
-
- ps.rng_hash = cmj_hash(state->rng_hash, i);
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, &tp, &ps, &L->state, &bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps.rng_hash = state->rng_hash;
-
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &bsdf_ray, tp * num_samples_inv, &ps, L, sd->object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
-# ifdef __SHADOW_TRICKS__
- L->shadow_transparency = shadow_transparency;
-# endif
- }
- }
-}
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- PathState *state,
- Ray *ray,
- float3 throughput)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* set up random number generator */
- uint lcg_state = lcg_state_init(state, 0x68bc21eb);
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for (int j = 0; j < num_samples; j++) {
- PathState hit_state = *state;
- path_state_branch(&hit_state, j, num_samples);
- hit_state.rng_hash = bssrdf_rng_hash;
-
- LocalIntersection ss_isect;
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, &hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- hit_state.rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = *ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- ShaderData bssrdf_sd = *sd;
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
- subsurface_scatter_multi_setup(
- kg, &ss_isect, hit, &bssrdf_sd, &hit_state, bssrdf_type, bssrdf_roughness);
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state.volume_stack[k] = state->volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state.volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &bssrdf_sd, emission_sd, &hit_state, throughput, num_samples_inv, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &bssrdf_sd, indirect_sd, emission_sd, throughput, num_samples_inv, &hit_state, L);
- }
- }
- }
-}
-# endif /* __SUBSURFACE__ */
-
-ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- Ray ray,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* initialize */
- float3 throughput = one_float3();
-
- path_radiance_init(kg, L);
-
- /* shader data memory used for both volumes and surfaces, saves stack space */
- ShaderData sd;
- /* shader data used by emission, shadows, volume stacks, indirect path */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
- /* Main Loop
- * Here we only handle transparency intersections from the camera ray.
- * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
- */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L, sd.object);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- kernel_branched_path_volume(
- kg, &sd, &state, &ray, &throughput, &isect, hit, &indirect_sd, emission_sd, L);
-# endif /* __VOLUME__ */
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, &state, &ray, throughput, &sd, buffer, L);
- break;
- }
-
- /* Setup and evaluate shader. */
- shader_setup_from_ray(kg, &sd, &isect, &ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- shader_eval_surface(kg, &sd, &state, buffer, state.flag);
- shader_merge_closures(&sd);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, &state, &ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* transparency termination */
- if (state.flag & PATH_RAY_TRANSPARENT) {
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, &state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, &state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput);
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object */
- if (sd.flag & SD_BSSRDF) {
- kernel_branched_path_subsurface_scatter(
- kg, &sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif /* __SUBSURFACE__ */
-
- PathState hit_state = state;
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &sd, emission_sd, &hit_state, throughput, 1.0f, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L);
-
- /* continue in case of transparency */
- throughput *= shader_bsdf_transparency(kg, &sd);
-
- if (is_zero(throughput))
- break;
-
- /* Update Path State */
- path_state_next(kg, &state, LABEL_TRANSPARENT);
-
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, &state)) {
- break;
- }
- }
-# endif
-
- ray.P = ray_offset(sd.P, -sd.Ng);
- ray.t -= sd.ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
- ray.dD.dx = -sd.dI.dx;
- ray.dD.dy = -sd.dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-# endif /* __VOLUME__ */
- }
-}
-
-ccl_device void kernel_branched_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* initialize random numbers and ray */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- /* integrate */
- PathRadiance L;
-
- if (ray.t != 0.0f) {
- kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
- kernel_write_result(kg, buffer, sample, &L);
- }
-}
-
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
deleted file mode 100644
index 815767595a9..00000000000
--- a/intern/cycles/kernel/kernel_path_common.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_hash.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_path_trace_setup(
- KernelGlobals *kg, int sample, int x, int y, uint *rng_hash, ccl_addr_space Ray *ray)
-{
- float filter_u;
- float filter_v;
-
- int num_samples = kernel_data.integrator.aa_samples;
-
- path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
-
- /* sample camera ray */
-
- float lens_u = 0.0f, lens_v = 0.0f;
-
- if (kernel_data.cam.aperturesize > 0.0f)
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
- float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
- if (kernel_data.cam.shuttertime != -1.0f)
- time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
-#endif
-
- camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index bf601580cd0..ebb2c0df4f1 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -14,99 +14,116 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
+#pragma once
-ccl_device_inline void path_state_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space PathState *state,
- uint rng_hash,
- int sample,
- ccl_addr_space Ray *ray)
-{
- state->flag = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | PATH_RAY_TRANSPARENT_BACKGROUND;
+#include "kernel_random.h"
- state->rng_hash = rng_hash;
- state->rng_offset = PRNG_BASE_NUM;
- state->sample = sample;
- state->num_samples = kernel_data.integrator.aa_samples;
- state->branch_factor = 1.0f;
+CCL_NAMESPACE_BEGIN
- state->bounce = 0;
- state->diffuse_bounce = 0;
- state->glossy_bounce = 0;
- state->transmission_bounce = 0;
- state->transparent_bounce = 0;
+/* Initialize queues, so that the this path is considered terminated.
+ * Used for early outputs in the camera ray initialization, as well as initialization of split
+ * states for shadow catcher. */
+ccl_device_inline void path_state_init_queues(INTEGRATOR_STATE_ARGS)
+{
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
- state->flag |= PATH_RAY_STORE_SHADOW_INFO;
- state->denoising_feature_weight = 1.0f;
- state->denoising_feature_throughput = one_float3();
- }
- else {
- state->denoising_feature_weight = 0.0f;
- state->denoising_feature_throughput = zero_float3();
- }
-#endif /* __DENOISING_FEATURES__ */
+/* Minimalistic initialization of the path state, which is needed for early outputs in the
+ * integrator initialization to work. */
+ccl_device_inline void path_state_init(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ const int x,
+ const int y)
+{
+ const uint render_pixel_index = (uint)tile->offset + x + y * tile->stride;
- state->min_ray_pdf = FLT_MAX;
- state->ray_pdf = 0.0f;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
+ INTEGRATOR_STATE_WRITE(path, render_pixel_index) = render_pixel_index;
-#ifdef __VOLUME__
- state->volume_bounce = 0;
- state->volume_bounds_bounce = 0;
+ path_state_init_queues(INTEGRATOR_STATE_PASS);
+}
- if (kernel_data.integrator.use_volumes) {
- /* Initialize volume stack with volume we are inside of. */
- kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
+/* Initialize the rest of the path state needed to continue the path integration. */
+ccl_device_inline void path_state_init_integrator(INTEGRATOR_STATE_ARGS,
+ const int sample,
+ const uint rng_hash)
+{
+ INTEGRATOR_STATE_WRITE(path, sample) = sample;
+ INTEGRATOR_STATE_WRITE(path, bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, rng_hash) = rng_hash;
+ INTEGRATOR_STATE_WRITE(path, rng_offset) = PRNG_BASE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
+ PATH_RAY_TRANSPARENT_BACKGROUND;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = kernel_data.background.volume_shader;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
}
- else {
- state->volume_stack[0].shader = SHADER_NONE;
+
+#ifdef __DENOISING_FEATURES__
+ if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_DENOISING_FEATURES;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) = one_float3();
}
#endif
}
-ccl_device_inline void path_state_next(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- int label)
+ccl_device_inline void path_state_next(INTEGRATOR_STATE_ARGS, int label)
{
+ uint32_t flag = INTEGRATOR_STATE(path, flag);
+
/* ray through transparent keeps same flags from previous ray and is
* not counted as a regular bounce, transparent has separate max */
if (label & LABEL_TRANSPARENT) {
- state->flag |= PATH_RAY_TRANSPARENT;
- state->transparent_bounce++;
- if (state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_IMMEDIATE;
+ uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
+
+ flag |= PATH_RAY_TRANSPARENT;
+ if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+ flag |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
}
if (!kernel_data.integrator.transparent_shadows)
- state->flag |= PATH_RAY_MIS_SKIP;
-
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ flag |= PATH_RAY_MIS_SKIP;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
return;
}
- state->bounce++;
- if (state->bounce >= kernel_data.integrator.max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ uint32_t bounce = INTEGRATOR_STATE(path, bounce) + 1;
+ if (bounce >= kernel_data.integrator.max_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
- state->flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
+ flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
#ifdef __VOLUME__
if (label & LABEL_VOLUME_SCATTER) {
/* volume scatter */
- state->flag |= PATH_RAY_VOLUME_SCATTER;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_VOLUME_SCATTER;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ if (bounce == 1) {
+ flag |= PATH_RAY_VOLUME_PASS;
+ }
- state->volume_bounce++;
- if (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int volume_bounce = INTEGRATOR_STATE(path, volume_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = volume_bounce;
+ if (volume_bounce >= kernel_data.integrator.max_volume_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else
@@ -114,163 +131,237 @@ ccl_device_inline void path_state_next(KernelGlobals *kg,
{
/* surface reflection/transmission */
if (label & LABEL_REFLECT) {
- state->flag |= PATH_RAY_REFLECT;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_REFLECT;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
if (label & LABEL_DIFFUSE) {
- state->diffuse_bounce++;
- if (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int diffuse_bounce = INTEGRATOR_STATE(path, diffuse_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = diffuse_bounce;
+ if (diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else {
- state->glossy_bounce++;
- if (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int glossy_bounce = INTEGRATOR_STATE(path, glossy_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = glossy_bounce;
+ if (glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
}
else {
kernel_assert(label & LABEL_TRANSMIT);
- state->flag |= PATH_RAY_TRANSMIT;
+ flag |= PATH_RAY_TRANSMIT;
if (!(label & LABEL_TRANSMIT_TRANSPARENT)) {
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
}
- state->transmission_bounce++;
- if (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int transmission_bounce = INTEGRATOR_STATE(path, transmission_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = transmission_bounce;
+ if (transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
/* diffuse/glossy/singular */
if (label & LABEL_DIFFUSE) {
- state->flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
+ flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
}
else if (label & LABEL_GLOSSY) {
- state->flag |= PATH_RAY_GLOSSY;
+ flag |= PATH_RAY_GLOSSY;
}
else {
kernel_assert(label & LABEL_SINGULAR);
- state->flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ }
+
+ /* Render pass categories. */
+ if (bounce == 1) {
+ flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
}
}
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, bounce) = bounce;
-#ifdef __DENOISING_FEATURES__
- if ((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
- state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
- }
-#endif
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
#ifdef __VOLUME__
-ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state)
+ccl_device_inline bool path_state_volume_next(INTEGRATOR_STATE_ARGS)
{
/* For volume bounding meshes we pass through without counting transparent
* bounces, only sanity check in case self intersection gets us stuck. */
- state->volume_bounds_bounce++;
- if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
+ uint32_t volume_bounds_bounce = INTEGRATOR_STATE(path, volume_bounds_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = volume_bounds_bounce;
+ if (volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
return false;
}
/* Random number generator next bounce. */
- if (state->volume_bounds_bounce > 1) {
- state->rng_offset += PRNG_BOUNCE_NUM;
+ if (volume_bounds_bounce > 1) {
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
return true;
}
#endif
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg,
- ccl_addr_space PathState *state)
+ccl_device_inline uint path_state_ray_visibility(INTEGRATOR_STATE_CONST_ARGS)
{
- uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
- /* for visibility, diffuse/glossy are for reflection only */
- if (flag & PATH_RAY_TRANSMIT)
- flag &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
- /* todo: this is not supported as its own ray visibility yet */
- if (state->flag & PATH_RAY_VOLUME_SCATTER)
- flag |= PATH_RAY_DIFFUSE;
+ uint32_t visibility = path_flag & PATH_RAY_ALL_VISIBILITY;
- return flag;
+ /* For visibility, diffuse/glossy are for reflection only. */
+ if (visibility & PATH_RAY_TRANSMIT) {
+ visibility &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
+ }
+
+ /* todo: this is not supported as its own ray visibility yet. */
+ if (path_flag & PATH_RAY_VOLUME_SCATTER) {
+ visibility |= PATH_RAY_DIFFUSE;
+ }
+
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+
+ return visibility;
}
-ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(INTEGRATOR_STATE_CONST_ARGS,
+ const uint32_t path_flag)
{
- if (state->flag & PATH_RAY_TERMINATE_IMMEDIATE) {
- /* Ray is to be terminated immediately. */
- return 0.0f;
- }
- else if (state->flag & PATH_RAY_TRANSPARENT) {
+ if (path_flag & PATH_RAY_TRANSPARENT) {
+ const uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
/* Do at least specified number of bounces without RR. */
- if (state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
- return 1.0f;
- }
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+ if (transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
return 1.0f;
}
-#endif
}
else {
+ const uint32_t bounce = INTEGRATOR_STATE(path, bounce);
/* Do at least specified number of bounces without RR. */
- if (state->bounce <= kernel_data.integrator.min_bounce) {
+ if (bounce <= kernel_data.integrator.min_bounce) {
return 1.0f;
}
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
- return 1.0f;
- }
-#endif
}
/* Probabilistic termination: use sqrt() to roughly match typical view
* transform and do path termination a bit later on average. */
- return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
+ return min(sqrtf(max3(fabs(INTEGRATOR_STATE(path, throughput)))), 1.0f);
}
-/* TODO(DingTo): Find more meaningful name for this */
-ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, bool increase)
+ccl_device_inline bool path_state_ao_bounce(INTEGRATOR_STATE_CONST_ARGS)
{
- /* Modify bounce temporarily for shader eval */
- if (increase)
- state->bounce += 1;
- else
- state->bounce -= 1;
-}
-
-ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
-{
- if (state->bounce <= kernel_data.integrator.ao_bounces) {
+ if (!kernel_data.integrator.ao_bounces) {
return false;
}
- int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+ const int bounce = INTEGRATOR_STATE(path, bounce) - INTEGRATOR_STATE(path, transmission_bounce) -
+ (INTEGRATOR_STATE(path, glossy_bounce) > 0) + 1;
return (bounce > kernel_data.integrator.ao_bounces);
}
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
- int branch,
- int num_branches)
+/* Random Number Sampling Utility Functions
+ *
+ * For each random number in each step of the path we must have a unique
+ * dimension to avoid using the same sequence twice.
+ *
+ * For branches in the path we must be careful not to reuse the same number
+ * in a sequence and offset accordingly.
+ */
+
+/* RNG State loaded onto stack. */
+typedef struct RNGState {
+ uint rng_hash;
+ uint rng_offset;
+ int sample;
+} RNGState;
+
+ccl_device_inline void path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset);
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline void shadow_path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ const uint shadow_bounces = INTEGRATOR_STATE(shadow_path, transparent_bounce) -
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset) + PRNG_BOUNCE_NUM * shadow_bounces;
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline float path_state_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int dimension)
+{
+ return path_rng_1D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_state_rng_2D(
+ const KernelGlobals *kg, const RNGState *rng_state, int dimension, float *fx, float *fy)
+{
+ path_rng_2D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy);
+}
+
+ccl_device_inline float path_state_rng_1D_hash(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ uint hash)
+{
+ /* Use a hash instead of dimension, this is not great but avoids adding
+ * more dimensions to each bounce which reduces quality of dimensions we
+ * are already using. */
+ return path_rng_1D(
+ kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset);
+}
+
+ccl_device_inline float path_branched_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension)
+{
+ return path_rng_1D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_branched_rng_2D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension,
+ float *fx,
+ float *fy)
+{
+ path_rng_2D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension,
+ fx,
+ fy);
+}
+
+/* Utility functions to get light termination value,
+ * since it might not be needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(const KernelGlobals *kg,
+ const RNGState *state)
{
- if (num_branches > 1) {
- /* Path is splitting into a branch, adjust so that each branch
- * still gets a unique sample from the same sequence. */
- state->sample = state->sample * num_branches + branch;
- state->num_samples = state->num_samples * num_branches;
- state->branch_factor *= num_branches;
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
}
+ return 0.0f;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
deleted file mode 100644
index 97d3f292ca3..00000000000
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- bool
- kernel_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- ccl_addr_space float3 *throughput,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- PROFILING_INIT(kg, PROFILING_SUBSURFACE);
-
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
-
- /* do bssrdf scatter step if we picked a bssrdf closure */
- if (sc) {
- /* We should never have two consecutive BSSRDF bounces,
- * the second one should be converted to a diffuse BSDF to
- * avoid this.
- */
- kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
-
- uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
-
- LocalIntersection ss_isect;
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, state, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-# ifdef __VOLUME__
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- /* NOTE: We reuse the existing ShaderData, we assume the path
- * integration loop stops when this function returns true.
- */
- subsurface_scatter_multi_setup(kg, &ss_isect, hit, sd, state, bssrdf_type, bssrdf_roughness);
-
- kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
- ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
- ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
- PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
-
- *hit_state = *state;
- *hit_ray = *ray;
- *hit_tp = *throughput;
- *hit_L_state = L->state;
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
- if (kernel_path_surface_bounce(kg, sd, hit_tp, hit_state, hit_L_state, hit_ray)) {
-# ifdef __LAMP_MIS__
- hit_state->ray_t = 0.0f;
-# endif /* __LAMP_MIS__ */
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- Ray volume_ray = *ray;
- /* Setup ray from previous surface point to the new one. */
- volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, &volume_ray.t);
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
- ss_indirect->num_rays++;
- }
- }
- return true;
- }
- return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_setup_indirect(
- KernelGlobals *kg,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput)
-{
- /* Setup state, ray and throughput for indirect SSS rays. */
- ss_indirect->num_rays--;
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- *state = ss_indirect->state[ss_indirect->num_rays];
- *ray = ss_indirect->rays[ss_indirect->num_rays];
- L->state = ss_indirect->L_state[ss_indirect->num_rays];
- *throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
- state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif /* __SUBSURFACE__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
deleted file mode 100644
index ba48c0bdfc4..00000000000
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || \
- defined(__BAKING__)
-/* branched path tracing: connect path directly to position on one or more lights and add it to L
- */
-ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
- KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- float3 throughput,
- float num_samples_adjust,
- PathRadiance *L,
- int sample_all_lights)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 0;
- if (kernel_data.integrator.use_direct_light) {
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
- else {
- num_lights = 1;
- }
- }
-
- for (int i = 0; i < num_lights; i++) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = ceil_to_int(num_samples_adjust * light_select_num_samples(kg, i));
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = ceil_to_int(num_samples_adjust * kernel_data.integrator.mesh_light_samples);
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = num_samples_adjust / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_branched_rng_light_termination(
- kg, lamp_rng_hash, state, j, num_samples);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- /* The sampling probability returned by lamp_light_sample assumes that all lights were
- * sampled. However, this code only samples lamps, so if the scene also had mesh lights,
- * the real probability is twice as high. */
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg,
- L,
- state,
- throughput * num_samples_inv,
- &L_light,
- shadow,
- num_samples_inv,
- is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput * num_samples_inv, &L_light);
- }
- }
- }
- }
-# endif
-}
-
-/* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderClosure *sc,
- int sample,
- int num_samples,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray,
- float sum_sample_weight)
-{
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample_closure(
- kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
-# ifdef __DENOISING_FEATURES__
- state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
-# endif
-
- /* modify path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
- ray->t = FLT_MAX;
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-# endif
-# ifdef __OBJECT_MOTION__
- ray->time = sd->time;
-# endif
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif
-
- /* branch RNG state */
- path_state_branch(state, sample, num_samples);
-
- /* set MIS state */
- state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
- state->ray_pdf = bsdf_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
-
- return true;
-}
-
-#endif
-
-/* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT);
-
-#ifdef __EMISSION__
-# ifdef __SHADOW_TRICKS__
- int all = (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# else
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-# endif
-#endif
-}
-
-/* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE);
-
- /* no BSDF? we can stop here */
- if (sd->flag & SD_BSDF) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample(
- kg, sd, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
- /* set labels */
- if (!(label & LABEL_TRANSPARENT)) {
- state->ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
- state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
- }
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-#endif
- return true;
- }
-#ifdef __VOLUME__
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- if (!path_state_volume_next(kg, state)) {
- return false;
- }
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
- /* setup ray position, direction stays unchanged */
- ray->P = ray_offset(sd->P, -sd->Ng);
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
-# endif
-
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
- return true;
- }
-#endif
- else {
- /* no bsdf or volume? */
- return false;
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
deleted file mode 100644
index a787910e65c..00000000000
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME_SCATTER__
-
-ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- /* connect to light from given point where shader has been evaluated */
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
-# endif /* __EMISSION__ */
-}
-
-ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- /* sample phase function */
- float phase_pdf;
- BsdfEval phase_eval ccl_optional_struct_init;
- float3 phase_omega_in ccl_optional_struct_init;
- differential3 phase_domega_in ccl_optional_struct_init;
- float phase_u, phase_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
- int label;
-
- label = shader_volume_phase_sample(
- kg, sd, phase_u, phase_v, &phase_eval, &phase_omega_in, &phase_domega_in, &phase_pdf);
-
- if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
-
- /* set labels */
- state->ray_pdf = phase_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
- state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* Russian roulette termination of volume ray scattering. */
- float probability = path_state_continuation_probability(kg, state, *throughput);
-
- if (probability == 0.0f) {
- return false;
- }
- else if (probability != 1.0f) {
- /* Use dimension from the previous bounce, has not been used yet. */
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM);
-
- if (terminate >= probability) {
- return false;
- }
-
- *throughput /= probability;
- }
-
- /* setup ray */
- ray->P = sd->P;
- ray->D = phase_omega_in;
- ray->t = FLT_MAX;
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = phase_domega_in;
-# endif
-
- return true;
-}
-
-# if !defined(__SPLIT_KERNEL__) && (defined(__BRANCHED_PATH__) || defined(__VOLUME_DECOUPLED__))
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- bool sample_all_lights,
- Ray *ray,
- const VolumeSegment *segment)
-{
-# ifdef __EMISSION__
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 1;
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
-
- for (int i = 0; i < num_lights; ++i) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = light_select_num_samples(kg, i);
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = kernel_data.integrator.mesh_light_samples;
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = 1.0f / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- float3 tp = throughput;
-
- if (kernel_data.integrator.use_direct_light) {
- /* sample random position on random light/triangle */
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- light_sample(kg, lamp, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
-
- /* sample position on volume segment */
- float rphase = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
- float rscatter = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
- state,
- ray,
- sd,
- &tp,
- rphase,
- rscatter,
- segment,
- (ls.t != FLT_MAX) ? &ls.P :
- NULL,
- false);
-
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: split up light_sample so we don't have to call it again with new position */
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- /* sample random light */
- float terminate = path_branched_rng_light_termination(
- kg, state->rng_hash, state, j, num_samples);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(
- kg, L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
- }
- }
- }
-# endif /* __EMISSION__ */
-}
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __VOLUME_SCATTER__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h
index 780830879d8..db8644005ea 100644
--- a/intern/cycles/kernel/kernel_profiling.h
+++ b/intern/cycles/kernel/kernel_profiling.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_PROFILING_H__
-#define __KERNEL_PROFILING_H__
+#pragma once
#ifdef __KERNEL_CPU__
# include "util/util_profiling.h"
@@ -24,23 +23,18 @@
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_CPU__
-# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event)
+# define PROFILING_INIT(kg, event) \
+ ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event)
# define PROFILING_EVENT(event) profiling_helper.set_event(event)
-# define PROFILING_SHADER(shader) \
- if ((shader) != SHADER_NONE) { \
- profiling_helper.set_shader((shader)&SHADER_MASK); \
- }
-# define PROFILING_OBJECT(object) \
- if ((object) != PRIM_NONE) { \
- profiling_helper.set_object(object); \
- }
+# define PROFILING_INIT_FOR_SHADER(kg, event) \
+ ProfilingWithShaderHelper profiling_helper((ProfilingState *)&kg->profiler, event)
+# define PROFILING_SHADER(object, shader) \
+ profiling_helper.set_shader(object, (shader)&SHADER_MASK);
#else
# define PROFILING_INIT(kg, event)
# define PROFILING_EVENT(event)
-# define PROFILING_SHADER(shader)
-# define PROFILING_OBJECT(object)
+# define PROFILING_INIT_FOR_SHADER(kg, event)
+# define PROFILING_SHADER(object, shader)
#endif /* __KERNEL_CPU__ */
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROFILING_H__ */
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index c33d7150b5c..192bf7ca5aa 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_PROJECTION_CL__
-#define __KERNEL_PROJECTION_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -257,5 +256,3 @@ ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROJECTION_CL__ */
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
deleted file mode 100644
index d8cc08b3e85..00000000000
--- a/intern/cycles/kernel/kernel_queues.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_QUEUE_H__
-#define __KERNEL_QUEUE_H__
-
-CCL_NAMESPACE_BEGIN
-
-/*
- * Queue utility functions for split kernel
- */
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#endif
-
-/*
- * Enqueue ray index into the queue
- */
-ccl_device void enqueue_ray_index(
- int ray_index, /* Ray index to be enqueued. */
- int queue_number, /* Queue in which the ray index should be enqueued. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queue_size, /* Size of each queue. */
- ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
-{
- /* This thread's queue index. */
- int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint *)&queue_index[queue_number]) +
- (queue_number * queue_size);
- queues[my_queue_index] = ray_index;
-}
-
-/*
- * Get the ray index for this thread
- * Returns a positive ray_index for threads that have to do some work;
- * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work
- * i.e All ray's in the queue has been successfully allocated and there
- * is no more ray to allocate to other threads.
- */
-ccl_device int get_ray_index(
- KernelGlobals *kg,
- int thread_index, /* Global thread index. */
- int queue_number, /* Queue to operate on. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queuesize, /* Size of a queue. */
- int empty_queue) /* Empty the queue slot as soon as we fetch the ray index. */
-{
- int ray_index = queues[queue_number * queuesize + thread_index];
- if (empty_queue && ray_index != QUEUE_EMPTY_SLOT) {
- queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- return ray_index;
-}
-
-/* The following functions are to realize Local memory variant of enqueue ray index function. */
-
-/* All threads should call this function. */
-ccl_device void enqueue_ray_index_local(
- int ray_index, /* Ray index to enqueue. */
- int queue_number, /* Queue in which to enqueue ray index. */
- char enqueue_flag, /* True for threads whose ray index has to be enqueued. */
- int queuesize, /* queue size. */
- ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */
- ccl_global int *Queue_data, /* Queues. */
- ccl_global int *Queue_index) /* To do global queue atomics. */
-{
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
-
- /* Get local queue id. */
- unsigned int lqidx;
- if (enqueue_flag) {
- lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue offset. */
- if (lidx == 0) {
- *local_queue_atomics = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&Queue_index[queue_number], *local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue index and enqueue ray. */
- if (enqueue_flag) {
- unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
- Queue_data[my_gqidx] = ray_index;
- }
-}
-
-ccl_device unsigned int get_local_queue_index(
- int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
- ccl_local_param unsigned int *local_queue_atomics)
-{
- int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
- return my_lqidx;
-}
-
-ccl_device unsigned int get_global_per_queue_offset(
- int queue_number,
- ccl_local_param unsigned int *local_queue_atomics,
- ccl_global int *global_queue_atomics)
-{
- unsigned int queue_offset = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&global_queue_atomics[queue_number], local_queue_atomics[queue_number]);
- return queue_offset;
-}
-
-ccl_device unsigned int get_global_queue_index(
- int queue_number,
- int queuesize,
- unsigned int lqidx,
- ccl_local_param unsigned int *global_per_queue_offset)
-{
- int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
- return my_gqidx;
-}
-
-ccl_device int dequeue_ray_index(int queue_number,
- ccl_global int *queues,
- int queue_size,
- ccl_global int *queue_index)
-{
- int index = atomic_fetch_and_dec_uint32((ccl_global uint *)&queue_index[queue_number]) - 1;
-
- if (index < 0) {
- return QUEUE_EMPTY_SLOT;
- }
-
- return queues[index + queue_number * queue_size];
-}
-
-CCL_NAMESPACE_END
-
-#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 49e5e25c2e0..41b7d76230a 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#pragma once
#include "kernel/kernel_jitter.h"
#include "util/util_hash.h"
@@ -37,38 +38,34 @@ CCL_NAMESPACE_BEGIN
*/
# define SOBOL_SKIP 64
-ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
+ccl_device uint sobol_dimension(const KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
uint i = index + SOBOL_SKIP;
for (int j = 0, x; (x = find_first_set(i)); i >>= x) {
j += x;
- result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1);
+ result ^= __float_as_uint(kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1));
}
return result;
}
#endif /* __SOBOL__ */
-ccl_device_forceinline float path_rng_1D(
- KernelGlobals *kg, uint rng_hash, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(const KernelGlobals *kg,
+ uint rng_hash,
+ int sample,
+ int dimension)
{
#ifdef __DEBUG_CORRELATION__
return (float)drand48();
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- return pmj_sample_1D(kg, sample, rng_hash, dimension);
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- return cmj_sample_1D(sample, num_samples, p);
+ return pmj_sample_1D(kg, sample, rng_hash, dimension);
}
-#endif
#ifdef __SOBOL__
/* Sobol sequence value using direction vectors. */
@@ -88,68 +85,72 @@ ccl_device_forceinline float path_rng_1D(
#endif
}
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- int num_samples,
- int dimension,
- float *fx,
- float *fy)
+ccl_device_forceinline void path_rng_2D(
+ const KernelGlobals *kg, uint rng_hash, int sample, int dimension, float *fx, float *fy)
{
#ifdef __DEBUG_CORRELATION__
*fx = (float)drand48();
*fy = (float)drand48();
return;
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- const float2 f = pmj_sample_2D(kg, sample, rng_hash, dimension);
- *fx = f.x;
- *fy = f.y;
- return;
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- cmj_sample_2D(sample, num_samples, p, fx, fy);
+ pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy);
+
return;
}
-#endif
#ifdef __SOBOL__
/* Sobol. */
- *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
- *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
+ *fx = path_rng_1D(kg, rng_hash, sample, dimension);
+ *fy = path_rng_1D(kg, rng_hash, sample, dimension + 1);
#endif
}
-ccl_device_inline void path_rng_init(KernelGlobals *kg,
- int sample,
- int num_samples,
- uint *rng_hash,
- int x,
- int y,
- float *fx,
- float *fy)
+/**
+ * 1D hash recomended from "Hash Functions for GPU Rendering" JCGT Vol. 9, No. 3, 2020
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqint1(uint n)
+{
+ n = (n << 13U) ^ n;
+ n = n * (n * n * 15731U + 789221U) + 1376312589U;
+
+ return n;
+}
+
+/**
+ * 2D hash recomended from "Hash Functions for GPU Rendering" JCGT Vol. 9, No. 3, 2020
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqnt2d(const uint x, const uint y)
{
- /* load state */
- *rng_hash = hash_uint2(x, y);
- *rng_hash ^= kernel_data.integrator.seed;
+ const uint qx = 1103515245U * ((x >> 1U) ^ (y));
+ const uint qy = 1103515245U * ((y >> 1U) ^ (x));
+ const uint n = 1103515245U * ((qx) ^ (qy >> 3U));
+
+ return n;
+}
+
+ccl_device_inline uint path_rng_hash_init(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y)
+{
+ const uint rng_hash = hash_iqnt2d(x, y) ^ kernel_data.integrator.seed;
#ifdef __DEBUG_CORRELATION__
- srand48(*rng_hash + sample);
+ srand48(rng_hash + sample);
+#else
+ (void)sample;
#endif
- if (sample == 0) {
- *fx = 0.5f;
- *fy = 0.5f;
- }
- else {
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
- }
+ return rng_hash;
}
/* Linear Congruential Generator */
@@ -175,113 +176,12 @@ ccl_device uint lcg_init(uint seed)
return rng;
}
-/* Path Tracing Utility Functions
- *
- * For each random number in each step of the path we must have a unique
- * dimension to avoid using the same sequence twice.
- *
- * For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly.
- */
-
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- int dimension)
-{
- return path_rng_1D(
- kg, state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_state_rng_2D(
- KernelGlobals *kg, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
-{
- path_rng_2D(kg,
- state->rng_hash,
- state->sample,
- state->num_samples,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- uint hash)
-{
- /* Use a hash instead of dimension, this is not great but avoids adding
- * more dimensions to each bounce which reduces quality of dimensions we
- * are already using. */
- return path_rng_1D(kg,
- cmj_hash_simple(state->rng_hash, hash),
- state->sample,
- state->num_samples,
- state->rng_offset);
-}
-
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension)
-{
- return path_rng_1D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension,
- float *fx,
- float *fy)
-{
- path_rng_2D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-/* Utility functions to get light termination value,
- * since it might not be needed in many cases.
- */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg,
- const ccl_addr_space PathState *state)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D(kg, rng_hash, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline uint lcg_state_init(PathState *state, uint scramble)
-{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
-}
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(const uint rng_hash,
+ const uint rng_offset,
+ const uint sample,
+ const uint scramble)
{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
+ return lcg_init(rng_hash + rng_offset + sample * scramble);
}
ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
@@ -301,8 +201,6 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
#elif defined(__NVCC__)
return __popc(sample & 0xaaaaaaaa) & 1;
-#elif defined(__KERNEL_OPENCL__)
- return popcount(sample & 0xaaaaaaaa) & 1;
#else
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
int i = sample & 0xaaaaaaaa;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 7f02e6fc7b3..3052bb53040 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -14,14 +14,9 @@
* limitations under the License.
*/
-/*
- * ShaderData, used in four steps:
- *
- * Setup from incoming ray, sampled position and background.
- * Execute for surface, volume or displacement.
- * Evaluate one or more closures.
- * Release.
- */
+/* Functions to evaluate shaders and use the resulting shader closures. */
+
+#pragma once
// clang-format off
#include "kernel/closure/alloc.h"
@@ -30,479 +25,39 @@
#include "kernel/closure/emissive.h"
// clang-format on
+#include "kernel/kernel_accumulate.h"
#include "kernel/svm/svm.h"
-CCL_NAMESPACE_BEGIN
-
-/* ShaderData setup from incoming ray */
-
-#ifdef __OBJECT_MOTION__
-ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
-{
- if (sd->object_flag & SD_OBJECT_MOTION) {
- sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
- sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
- }
- else {
- sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
- sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- }
-}
-#endif
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
- void
- shader_setup_from_ray(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
- isect->object;
- sd->lamp = LAMP_NONE;
-
- sd->type = isect->type;
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
-
- /* matrices and time */
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, ray->time);
-#endif
- sd->time = ray->time;
-
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->ray_length = isect->t;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
-#ifdef __HAIR__
- if (sd->type & PRIMITIVE_ALL_CURVE) {
- /* curve */
- curve_shader_setup(kg, sd, isect, ray);
- }
- else
-#endif
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* static triangle */
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* vectors */
- sd->P = triangle_refine(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-#endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, false);
- }
-
- sd->I = -ray->D;
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-#ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-#endif
- }
-
- /* backfacing test */
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
- differential_incoming(&sd->dI, ray->dD);
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from BSSRDF scatter */
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- void
- shader_setup_from_subsurface(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- const bool backfacing = sd->flag & SD_BACKFACING;
-
- /* object, matrices, time, ray_length stay the same */
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->type = isect->type;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
- /* fetch triangle data */
- if (sd->type == PRIMITIVE_TRIANGLE) {
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* static triangle */
- sd->P = triangle_refine_local(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-# endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, true);
- }
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-# ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-# endif
- }
-
- /* backfacing test */
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-# ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-# endif
- }
-
- /* should not get used in principle as the shading will only use a diffuse
- * BSDF, but the shader might still access it */
- sd->I = sd->N;
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
- /* don't modify dP and dI */
-# endif
-
- PROFILING_SHADER(sd->shader);
-}
-#endif
-
-/* ShaderData setup from position sampled on mesh */
-
-ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
- ShaderData *sd,
- const float3 P,
- const float3 Ng,
- const float3 I,
- int shader,
- int object,
- int prim,
- float u,
- float v,
- float t,
- float time,
- bool object_space,
- int lamp)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = P;
- sd->N = Ng;
- sd->Ng = Ng;
- sd->I = I;
- sd->shader = shader;
- if (prim != PRIM_NONE)
- sd->type = PRIMITIVE_TRIANGLE;
- else if (lamp != LAMP_NONE)
- sd->type = PRIMITIVE_LAMP;
- else
- sd->type = PRIMITIVE_NONE;
-
- /* primitive */
- sd->object = object;
- sd->lamp = LAMP_NONE;
- /* Currently no access to bvh prim index for strand sd->prim. */
- sd->prim = prim;
- sd->u = u;
- sd->v = v;
- sd->time = time;
- sd->ray_length = t;
-
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- if (sd->object != OBJECT_NONE) {
- sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
-
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, time);
- }
- else if (lamp != LAMP_NONE) {
- sd->ob_tfm = lamp_fetch_transform(kg, lamp, false);
- sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
- sd->lamp = lamp;
-#else
- }
- else if (lamp != LAMP_NONE) {
- sd->lamp = lamp;
-#endif
- }
-
- /* transform into world space */
- if (object_space) {
- object_position_transform_auto(kg, sd, &sd->P);
- object_normal_transform_auto(kg, sd, &sd->Ng);
- sd->N = sd->Ng;
- object_dir_transform_auto(kg, sd, &sd->I);
- }
-
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_normal_transform_auto(kg, sd, &sd->N);
- }
- }
-
- /* dPdu/dPdv */
-#ifdef __DPDU__
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
- }
-#endif
- }
- else {
-#ifdef __DPDU__
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
- }
-
- /* backfacing test */
- if (sd->prim != PRIM_NONE) {
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* no ray differentials here yet */
- sd->dP = differential3_zero();
- sd->dI = differential3_zero();
- sd->du = differential_zero();
- sd->dv = differential_zero();
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup for displacement */
-
-ccl_device void shader_setup_from_displace(
- KernelGlobals *kg, ShaderData *sd, int object, int prim, float u, float v)
-{
- float3 P, Ng, I = zero_float3();
- int shader;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- /* force smooth shading for displacement */
- shader |= SHADER_SMOOTH_NORMAL;
-
- shader_setup_from_sample(
- kg,
- sd,
- P,
- Ng,
- I,
- shader,
- object,
- prim,
- u,
- v,
- 0.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
-}
-
-/* ShaderData setup from ray into background */
-
-ccl_device_inline void shader_setup_from_background(KernelGlobals *kg,
- ShaderData *sd,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->D;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = kernel_data.background.surface_shader;
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f;
-
- sd->object = OBJECT_NONE;
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
+#ifdef __OSL__
+# include "kernel/osl/osl_shader.h"
#endif
- /* for NDC coordinates */
- sd->ray_P = ray->P;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from point inside volume */
-
-#ifdef __VOLUME__
-ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->P;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = SHADER_NONE;
- sd->flag = 0;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
-
- sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->type = PRIMITIVE_NONE;
-
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
-# endif
-
- /* for NDC coordinates */
- sd->ray_P = ray->P;
- sd->ray_dP = ray->dP;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-#endif /* __VOLUME__ */
+CCL_NAMESPACE_BEGIN
/* Merging */
-#if defined(__BRANCHED_PATH__) || defined(__VOLUME__)
-ccl_device_inline void shader_merge_closures(ShaderData *sd)
+#if defined(__VOLUME__)
+ccl_device_inline void shader_merge_volume_closures(ShaderData *sd)
{
- /* merge identical closures, better when we sample a single closure at a time */
+ /* Merge identical closures to save closure space with stacked volumes. */
for (int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sci = &sd->closure[i];
+ if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ continue;
+ }
+
for (int j = i + 1; j < sd->num_closure; j++) {
ShaderClosure *scj = &sd->closure[j];
-
- if (sci->type != scj->type)
+ if (sci->type != scj->type) {
continue;
- if (!bsdf_merge(sci, scj))
+ }
+
+ const HenyeyGreensteinVolume *hgi = (const HenyeyGreensteinVolume *)sci;
+ const HenyeyGreensteinVolume *hgj = (const HenyeyGreensteinVolume *)scj;
+ if (!(hgi->g == hgj->g)) {
continue;
+ }
sci->weight += scj->weight;
sci->sample_weight += scj->sample_weight;
@@ -520,16 +75,40 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
}
}
}
-#endif /* __BRANCHED_PATH__ || __VOLUME__ */
-/* Defensive sampling. */
+ccl_device_inline void shader_copy_volume_phases(ShaderVolumePhases *ccl_restrict phases,
+ const ShaderData *ccl_restrict sd)
+{
+ phases->num_closure = 0;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *from_sc = &sd->closure[i];
+ const HenyeyGreensteinVolume *from_hg = (const HenyeyGreensteinVolume *)from_sc;
+
+ if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
+
+ to_sc->weight = from_sc->weight;
+ to_sc->sample_weight = from_sc->sample_weight;
+ to_sc->g = from_hg->g;
+ phases->num_closure++;
+ if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
+ break;
+ }
+ }
+ }
+}
+#endif /* __VOLUME__ */
-ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space PathState *state)
+ccl_device_inline void shader_prepare_surface_closures(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
- /* We can likely also do defensive sampling at deeper bounces, particularly
+ /* Defensive sampling.
+ *
+ * We can likely also do defensive sampling at deeper bounces, particularly
* for cases like a perfect mirror but possibly also others. This will need
* a good heuristic. */
- if (state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+ if (INTEGRATOR_STATE(path, bounce) + INTEGRATOR_STATE(path, transparent_bounce) == 0 &&
+ sd->num_closure > 1) {
float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
@@ -546,98 +125,119 @@ ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space Pa
}
}
}
+
+ /* Filter glossy.
+ *
+ * Blurring of bsdf after bounces, for rays that have a small likelihood
+ * of following this particular path (diffuse, rough glossy) */
+ if (kernel_data.integrator.filter_glossy != FLT_MAX) {
+ float blur_pdf = kernel_data.integrator.filter_glossy * INTEGRATOR_STATE(path, min_ray_pdf);
+
+ if (blur_pdf < 1.0f) {
+ float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if (CLOSURE_IS_BSDF(sc->type)) {
+ bsdf_blur(kg, sc, blur_roughness);
+ }
+ }
+ }
+ }
}
/* BSDF */
-ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- const ShaderClosure *skip_sc,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline bool shader_bsdf_is_transmission(const ShaderData *sd, const float3 omega_in)
+{
+ return dot(sd->N, omega_in) < 0.0f;
+}
+
+ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags)
+{
+ if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
+ return false;
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
+ if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
+ if (CLOSURE_IS_BSDF_GLOSSY(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
+ if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 omega_in,
+ const bool is_transmission,
+ const ShaderClosure *skip_sc,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight,
+ const uint light_shader_flags)
{
/* this is the veach one-sample model with balance heuristic, some pdf
* factors drop out when using balance heuristic weighting */
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if (sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
+ if (sc == skip_sc) {
+ continue;
+ }
+
+ if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) {
+ float bsdf_pdf = 0.0f;
+ float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, 1.0f);
- sum_pdf += bsdf_pdf * sc->sample_weight;
+ if (bsdf_pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
+ sum_pdf += bsdf_pdf * sc->sample_weight;
+ }
}
sum_sample_weight += sc->sample_weight;
}
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- BsdfEval *result_eval,
- float light_pdf,
- bool use_mis)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
- if (CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- float mis_weight = use_mis ? power_heuristic(light_pdf, bsdf_pdf) : 1.0f;
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, mis_weight);
- }
- }
- }
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
-#endif /* __BRANCHED_PATH__ */
#ifndef __KERNEL_CUDA__
ccl_device
#else
ccl_device_inline
#endif
- void
- shader_bsdf_eval(KernelGlobals *kg,
+ float
+ shader_bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const float3 omega_in,
- BsdfEval *eval,
- float light_pdf,
- bool use_mis)
+ const bool is_transmission,
+ BsdfEval *bsdf_eval,
+ const uint light_shader_flags)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL);
-
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
+ bsdf_eval_init(bsdf_eval, false, zero_float3());
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched)
- _shader_bsdf_multi_eval_branched(kg, sd, omega_in, eval, light_pdf, use_mis);
- else
-#endif
- {
- float pdf;
- _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
- if (use_mis) {
- float weight = power_heuristic(light_pdf, pdf);
- bsdf_eval_mis(eval, weight);
- }
- }
+ return _shader_bsdf_multi_eval(
+ kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
}
-ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *randu)
+/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
+ccl_device_inline const ShaderClosure *shader_bsdf_bssrdf_pick(const ShaderData *ccl_restrict sd,
+ float *randu)
{
- /* Note the sampling here must match shader_bssrdf_pick,
- * since we reuse the same random number. */
int sampled = 0;
if (sd->num_closure > 1) {
@@ -674,106 +274,33 @@ ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *r
}
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSDF(sc->type) ? sc : NULL;
+ return &sd->closure[sampled];
}
-ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
- ccl_addr_space float3 *throughput,
- float *randu)
+/* Return weight for picked BSSRDF. */
+ccl_device_inline float3 shader_bssrdf_sample_weight(const ShaderData *ccl_restrict sd,
+ const ShaderClosure *ccl_restrict bssrdf_sc)
{
- /* Note the sampling here must match shader_bsdf_pick,
- * since we reuse the same random number. */
- int sampled = 0;
+ float3 weight = bssrdf_sc->weight;
if (sd->num_closure > 1) {
- /* Pick a BSDF or BSSRDF or based on sample weights. */
- float sum_bsdf = 0.0f;
- float sum_bssrdf = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type)) {
- sum_bsdf += sc->sample_weight;
- }
- else if (CLOSURE_IS_BSSRDF(sc->type)) {
- sum_bssrdf += sc->sample_weight;
- }
- }
-
- float r = (*randu) * (sum_bsdf + sum_bssrdf);
- float partial_sum = 0.0f;
-
+ float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r < next_sum) {
- if (CLOSURE_IS_BSDF(sc->type)) {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
- return NULL;
- }
- else {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
- sampled = i;
-
- /* Rescale to reuse for direction sample, to better preserve stratification. */
- *randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
- }
-
- partial_sum = next_sum;
+ sum += sc->sample_weight;
}
}
+ weight *= sum / bssrdf_sc->sample_weight;
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL;
-}
-
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
- ShaderData *sd,
- float randu,
- float randv,
- BsdfEval *bsdf_eval,
- float3 *omega_in,
- differential3 *domega_in,
- float *pdf)
-{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
-
- const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
- if (sc == NULL) {
- *pdf = 0.0f;
- return LABEL_NONE;
- }
-
- /* BSSRDF should already have been handled elsewhere. */
- kernel_assert(CLOSURE_IS_BSDF(sc->type));
-
- int label;
- float3 eval = zero_float3();
-
- *pdf = 0.0f;
- label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
- if (*pdf != 0.0f) {
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
-
- if (sd->num_closure > 1) {
- float sweight = sc->sample_weight;
- _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf * sweight, sweight);
- }
- }
-
- return label;
+ return weight;
}
-ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
+/* Sample direction for picked BSDF, and return evaluation and pdf for all
+ * BSDFs combined using MIS. */
+ccl_device int shader_bsdf_sample_closure(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -783,7 +310,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+ /* BSSRDF should already have been handled elsewhere. */
+ kernel_assert(CLOSURE_IS_BSDF(sc->type));
int label;
float3 eval = zero_float3();
@@ -791,19 +319,29 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
*pdf = 0.0f;
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
- if (*pdf != 0.0f)
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
+ if (*pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
+
+ if (sd->num_closure > 1) {
+ const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
+ float sweight = sc->sample_weight;
+ *pdf = _shader_bsdf_multi_eval(
+ kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
+ }
+ }
return label;
}
-ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
+ccl_device float shader_bsdf_average_roughness(const ShaderData *sd)
{
float roughness = 0.0f;
float sum_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF(sc->type)) {
/* sqrt once to undo the squaring from multiplying roughness on the
@@ -817,17 +355,7 @@ ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
}
-ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type))
- bsdf_blur(kg, sc, roughness);
- }
-}
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return one_float3();
@@ -840,7 +368,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *
}
}
-ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device void shader_bsdf_disable_transparency(const KernelGlobals *kg, ShaderData *sd)
{
if (sd->flag & SD_TRANSPARENT) {
for (int i = 0; i < sd->num_closure; i++) {
@@ -856,7 +384,7 @@ ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *
}
}
-ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_alpha(const KernelGlobals *kg, const ShaderData *sd)
{
float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd);
@@ -866,12 +394,12 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
return alpha;
}
-ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_diffuse(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) ||
CLOSURE_IS_BSDF_BSSRDF(sc->type))
@@ -881,12 +409,12 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_glossy(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
@@ -895,12 +423,12 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transmission(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
@@ -909,12 +437,12 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_average_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
N += sc->N * fabsf(average(sc->weight));
}
@@ -922,59 +450,44 @@ ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
return (is_zero(N)) ? sd->N : normalize(N);
}
-ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
+ccl_device float3 shader_bsdf_ao_normal(const KernelGlobals *kg, const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
const DiffuseBsdf *bsdf = (const DiffuseBsdf *)sc;
- eval += sc->weight * ao_factor;
N += bsdf->N * fabsf(average(sc->weight));
}
}
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#ifdef __SUBSURFACE__
-ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
+ccl_device float3 shader_bssrdf_normal(const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
- float texture_blur = 0.0f, weight_sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSSRDF(sc->type)) {
const Bssrdf *bssrdf = (const Bssrdf *)sc;
float avg_weight = fabsf(average(sc->weight));
N += bssrdf->N * avg_weight;
- eval += sc->weight;
- texture_blur += bssrdf->texture_blur * avg_weight;
- weight_sum += avg_weight;
}
}
- if (N_)
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
-
- if (texture_blur_)
- *texture_blur_ = safe_divide(texture_blur, weight_sum);
-
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#endif /* __SUBSURFACE__ */
/* Constant emission optimization */
-ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, float3 *eval)
+ccl_device bool shader_constant_emission_eval(const KernelGlobals *kg, int shader, float3 *eval)
{
int shader_index = shader & SHADER_MASK;
int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags;
@@ -992,7 +505,7 @@ ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, flo
/* Background */
-ccl_device float3 shader_background_eval(ShaderData *sd)
+ccl_device float3 shader_background_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return sd->closure_emission_background;
@@ -1004,7 +517,7 @@ ccl_device float3 shader_background_eval(ShaderData *sd)
/* Emission */
-ccl_device float3 shader_emissive_eval(ShaderData *sd)
+ccl_device float3 shader_emissive_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
@@ -1016,7 +529,7 @@ ccl_device float3 shader_emissive_eval(ShaderData *sd)
/* Holdout */
-ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_holdout_apply(const KernelGlobals *kg, ShaderData *sd)
{
float3 weight = zero_float3();
@@ -1041,7 +554,7 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
}
else {
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_HOLDOUT(sc->type)) {
weight += sc->weight;
}
@@ -1053,14 +566,12 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
/* Surface Evaluation */
-ccl_device void shader_eval_surface(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
+template<uint node_feature_mask>
+ccl_device void shader_eval_surface(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ ccl_global float *ccl_restrict buffer,
int path_flag)
{
- PROFILING_INIT(kg, PROFILING_SHADER_EVAL);
-
/* If path is being terminated, we are tracing a shadow ray or evaluating
* emission, then we don't need to store closures. The emission and shadow
* shader data also do not have a closure array to save GPU memory. */
@@ -1069,7 +580,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
sd->num_closure = 0;
@@ -1078,17 +589,18 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __OSL__
if (kg->osl) {
if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
- OSLShader::eval_background(kg, sd, state, path_flag);
+ OSLShader::eval_background(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else {
- OSLShader::eval_surface(kg, sd, state, path_flag);
+ OSLShader::eval_surface(INTEGRATOR_STATE_PASS, sd, path_flag);
}
}
else
#endif
{
#ifdef __SVM__
- svm_eval_nodes(kg, sd, state, buffer, SHADER_TYPE_SURFACE, path_flag);
+ svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(
+ INTEGRATOR_STATE_PASS, sd, buffer, path_flag);
#else
if (sd->object == OBJECT_NONE) {
sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
@@ -1105,8 +617,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#endif
}
- if (sd->flag & SD_BSDF_NEEDS_LCG) {
- sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
+ if (KERNEL_NODES_FEATURE(BSDF) && (sd->flag & SD_BSDF_NEEDS_LCG)) {
+ sd->lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0xb4bc3953);
}
}
@@ -1114,48 +629,47 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __VOLUME__
-ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- int skip_phase,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline float _shader_volume_phase_multi_eval(const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ int skip_phase,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight)
{
- for (int i = 0; i < sd->num_closure; i++) {
+ for (int i = 0; i < phases->num_closure; i++) {
if (i == skip_phase)
continue;
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_PHASE(sc->type)) {
- float phase_pdf = 0.0f;
- float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
+ const ShaderVolumeClosure *svc = &phases->closure[i];
+ float phase_pdf = 0.0f;
+ float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
- if (phase_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
- sum_pdf += phase_pdf * sc->sample_weight;
- }
-
- sum_sample_weight += sc->sample_weight;
+ if (phase_pdf != 0.0f) {
+ bsdf_eval_accum(result_eval, false, eval, 1.0f);
+ sum_pdf += phase_pdf * svc->sample_weight;
}
+
+ sum_sample_weight += svc->sample_weight;
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
-ccl_device void shader_volume_phase_eval(
- KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf)
+ccl_device float shader_volume_phase_eval(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ BsdfEval *phase_eval)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL);
+ bsdf_eval_init(phase_eval, false, zero_float3());
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
-
- _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
+ return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
}
-ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
+ccl_device int shader_volume_phase_sample(const KernelGlobals *kg,
const ShaderData *sd,
+ const ShaderVolumePhases *phases,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1163,41 +677,34 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int sampled = 0;
- if (sd->num_closure > 1) {
+ if (phases->num_closure > 1) {
/* pick a phase closure based on sample weights */
float sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
- if (CLOSURE_IS_PHASE(sc->type))
- sum += sc->sample_weight;
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ sum += svc->sample_weight;
}
float r = randu * sum;
float partial_sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ float next_sum = partial_sum + svc->sample_weight;
- if (CLOSURE_IS_PHASE(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r <= next_sum) {
- /* Rescale to reuse for BSDF direction sample. */
- randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
-
- partial_sum = next_sum;
+ if (r <= next_sum) {
+ /* Rescale to reuse for BSDF direction sample. */
+ randu = (r - partial_sum) / svc->sample_weight;
+ break;
}
+
+ partial_sum = next_sum;
}
- if (sampled == sd->num_closure) {
+ if (sampled == phases->num_closure) {
*pdf = 0.0f;
return LABEL_NONE;
}
@@ -1205,23 +712,23 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
/* todo: this isn't quite correct, we don't weight anisotropy properly
* depending on color channels, even if this is perhaps not a common case */
- const ShaderClosure *sc = &sd->closure[sampled];
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
int label;
float3 eval = zero_float3();
*pdf = 0.0f;
- label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+ label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f) {
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
}
return label;
}
-ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
+ccl_device int shader_phase_sample_closure(const KernelGlobals *kg,
const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *sc,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1229,8 +736,6 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int label;
float3 eval = zero_float3();
@@ -1238,18 +743,18 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f)
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
return label;
}
/* Volume Evaluation */
-ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space VolumeStack *stack,
- int path_flag)
+template<typename StackReadOp>
+ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ const int path_flag,
+ StackReadOp stack_read)
{
/* If path is being terminated, we are tracing a shadow ray or evaluating
* emission, then we don't need to store closures. The emission and shadow
@@ -1259,7 +764,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
/* reset closures once at the start, we will be accumulating the closures
@@ -1268,14 +773,18 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
sd->num_closure_left = max_closures;
sd->flag = 0;
sd->object_flag = 0;
- sd->type = PRIMITIVE_VOLUME;
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
+ for (int i = 0;; i++) {
+ const VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
/* setup shaderdata from stack. it's mostly setup already in
* shader_setup_from_volume, this switching should be quick */
- sd->object = stack[i].object;
+ sd->object = entry.object;
sd->lamp = LAMP_NONE;
- sd->shader = stack[i].shader;
+ sd->shader = entry.shader;
sd->flag &= ~SD_SHADER_FLAGS;
sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
@@ -1295,18 +804,19 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
# ifdef __SVM__
# ifdef __OSL__
if (kg->osl) {
- OSLShader::eval_volume(kg, sd, state, path_flag);
+ OSLShader::eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_VOLUME, path_flag);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
+ INTEGRATOR_STATE_PASS, sd, NULL, path_flag);
}
# endif
- /* merge closures to avoid exceeding number of closures limit */
+ /* Merge closures to avoid exceeding number of closures limit. */
if (i > 0)
- shader_merge_closures(sd);
+ shader_merge_volume_closures(sd);
}
}
@@ -1314,9 +824,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
/* Displacement Evaluation */
-ccl_device void shader_eval_displacement(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state)
+ccl_device void shader_eval_displacement(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
sd->num_closure = 0;
sd->num_closure_left = 0;
@@ -1325,11 +833,12 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
#ifdef __SVM__
# ifdef __OSL__
if (kg->osl)
- OSLShader::eval_displacement(kg, sd, state);
+ OSLShader::eval_displacement(INTEGRATOR_STATE_PASS, sd);
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_DISPLACEMENT, 0);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
+ INTEGRATOR_STATE_PASS, sd, NULL, 0);
}
#endif
}
@@ -1337,29 +846,13 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
/* Transparent Shadows */
#ifdef __TRANSPARENT_SHADOWS__
-ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect)
+ccl_device bool shader_transparent_shadow(const KernelGlobals *kg, Intersection *isect)
{
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- int shader = 0;
-
-# ifdef __HAIR__
- if (isect->type & PRIMITIVE_ALL_TRIANGLE) {
-# endif
- shader = kernel_tex_fetch(__tri_shader, prim);
-# ifdef __HAIR__
- }
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-# endif
- int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
- return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
+ return (intersection_get_shader_flags(kg, isect) & SD_HAS_TRANSPARENT_SHADOW) != 0;
}
#endif /* __TRANSPARENT_SHADOWS__ */
-ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader)
+ccl_device float shader_cryptomatte_id(const KernelGlobals *kg, int shader)
{
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
}
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
deleted file mode 100644
index 3b124122fba..00000000000
--- a/intern/cycles/kernel/kernel_shadow.h
+++ /dev/null
@@ -1,466 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME__
-/* Get PathState ready for use for volume stack evaluation. */
-# ifdef __SPLIT_KERNEL__
-ccl_addr_space
-# endif
- ccl_device_inline PathState *
- shadow_blocked_volume_path_state(KernelGlobals *kg,
- VolumeState *volume_state,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray)
-{
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space PathState *ps =
- &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-# else
- PathState *ps = &volume_state->ps;
-# endif
- *ps = *state;
- /* We are checking for shadow on the "other" side of the surface, so need
- * to discard volume we are currently at.
- */
- if (dot(sd->Ng, ray->D) < 0.0f) {
- kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
- }
- return ps;
-}
-#endif /* __VOLUME__ */
-
-/* Attenuate throughput accordingly to the given intersection event.
- * Returns true if the throughput is zero and traversal can be aborted.
- */
-ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
-#ifdef __VOLUME__
- ccl_addr_space PathState *volume_state,
-#endif
- Intersection *isect,
- Ray *ray,
- float3 *throughput)
-{
-#ifdef __VOLUME__
- /* Attenuation between last surface and next surface. */
- if (volume_state->volume_stack[0].shader != SHADER_NONE) {
- Ray segment_ray = *ray;
- segment_ray.t = isect->t;
- kernel_volume_shadow(kg, shadow_sd, volume_state, &segment_ray, throughput);
- }
-#endif
- /* Setup shader data at surface. */
- shader_setup_from_ray(kg, shadow_sd, isect, ray);
- /* Attenuation from transparent surface. */
- if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, shadow_sd, state, NULL, PATH_RAY_SHADOW);
- path_state_modify_bounce(state, false);
- *throughput *= shader_bsdf_transparency(kg, shadow_sd);
- }
- /* Stop if all light is blocked. */
- if (is_zero(*throughput)) {
- return true;
- }
-#ifdef __VOLUME__
- /* Exit/enter volume. */
- kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
-#endif
- return false;
-}
-
-/* Special version which only handles opaque shadows. */
-ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
-#ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
- kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
- }
-#endif
- return blocked;
-}
-
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
-/* Shadow function to compute how much light is blocked,
- *
- * We trace a single ray. If it hits any opaque surface, or more than a given
- * number of transparent surfaces is hit, then we consider the geometry to be
- * entirely blocked. If not, all transparent surfaces will be recorded and we
- * will shade them one by one to determine how much light is blocked. This all
- * happens in one scene intersection function.
- *
- * Recording all hits works well in some cases but may be slower in others. If
- * we have many semi-transparent hairs, one intersection may be faster because
- * you'd be reinteresecting the same hairs a lot with each step otherwise. If
- * however there is mostly binary transparency then we may be recording many
- * unnecessary intersections when one of the first surfaces blocks all light.
- *
- * From tests in real scenes it seems the performance loss is either minimal,
- * or there is a performance increase anyway due to avoiding the need to send
- * two rays with transparent shadows.
- *
- * On CPU it'll handle all transparent bounces (by allocating storage for
- * intersections when they don't fit into the stack storage).
- *
- * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
- * is something to be kept an eye on.
- */
-
-# define SHADOW_STACK_MAX_HITS 64
-
-/* Actual logic with traversal loop implementation which is free from device
- * specific tweaks.
- *
- * Note that hits array should be as big as max_hits+1.
- */
-ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *hits,
- uint max_hits,
- float3 *shadow)
-{
- /* Intersect to find an opaque surface, or record all transparent
- * surface hits.
- */
- uint num_hits;
- const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits);
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- /* If no opaque surface found but we did find transparent hits,
- * shade them.
- */
- if (!blocked && num_hits > 0) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- float last_t = 0.0f;
- int bounce = state->transparent_bounce;
- Intersection *isect = hits;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- sort_intersections(hits, num_hits);
- for (int hit = 0; hit < num_hits; hit++, isect++) {
- /* Adjust intersection distance for moving ray forward. */
- float new_t = isect->t;
- isect->t -= last_t;
- /* Skip hit if we did not move forward, step by step raytracing
- * would have skipped it as well then.
- */
- if (last_t == new_t) {
- continue;
- }
- last_t = new_t;
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = shadow_sd->P;
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow = throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-/* Here we do all device specific trickery before invoking actual traversal
- * loop to help readability of the actual logic.
- */
-ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- uint max_hits,
- float3 *shadow)
-{
-# ifdef __SPLIT_KERNEL__
- Intersection hits_[SHADOW_STACK_MAX_HITS];
- Intersection *hits = &hits_[0];
-# elif defined(__KERNEL_CUDA__)
- Intersection *hits = kg->hits_stack;
-# else
- Intersection hits_stack[SHADOW_STACK_MAX_HITS];
- Intersection *hits = hits_stack;
-# endif
-# ifndef __KERNEL_GPU__
- /* Prefer to use stack but use dynamic allocation if too deep max hits
- * we need max_hits + 1 storage space due to the logic in
- * scene_intersect_shadow_all which will first store and then check if
- * the limit is exceeded.
- *
- * Ignore this on GPU because of slow/unavailable malloc().
- */
- if (max_hits + 1 > SHADOW_STACK_MAX_HITS) {
- if (kg->transparent_shadow_intersections == NULL) {
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- kg->transparent_shadow_intersections = (Intersection *)malloc(sizeof(Intersection) *
- (transparent_max_bounce + 1));
- }
- hits = kg->transparent_shadow_intersections;
- }
-# endif /* __KERNEL_GPU__ */
- /* Invoke actual traversal. */
- return shadow_blocked_transparent_all_loop(
- kg, sd, shadow_sd, state, visibility, ray, hits, max_hits, shadow);
-}
-# endif /* __SHADOW_RECORD_ALL__ */
-
-# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
-/* Shadow function to compute how much light is blocked,
- *
- * Here we raytrace from one transparent surface to the next step by step.
- * To minimize overhead in cases where we don't need transparent shadows, we
- * first trace a regular shadow ray. We check if the hit primitive was
- * potentially transparent, and only in that case start marching. this gives
- * one extra ray cast for the cases were we do want transparency.
- */
-
-/* This function is only implementing device-independent traversal logic
- * which requires some precalculation done.
- */
-ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- const bool blocked,
- const bool is_transparent_isect,
- float3 *shadow)
-{
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- if (blocked && is_transparent_isect) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- int bounce = state->transparent_bounce;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- for (;;) {
- if (bounce >= kernel_data.integrator.transparent_max_bounce) {
- return true;
- }
- if (!scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect)) {
- break;
- }
- if (!shader_transparent_shadow(kg, isect)) {
- return true;
- }
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow *= throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-ccl_device bool shadow_blocked_transparent_stepped(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
- bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, isect) : false;
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, isect, blocked, is_transparent_isect, shadow);
-}
-
-# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *shadow)
-{
- *shadow = one_float3();
-#if !defined(__KERNEL_OPTIX__)
- /* Some common early checks.
- * Avoid conditional trace call in OptiX though, since those hurt performance there.
- */
- if (ray->t == 0.0f) {
- return false;
- }
-#endif
-#ifdef __SHADOW_TRICKS__
- const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER :
- PATH_RAY_SHADOW;
-#else
- const uint visibility = PATH_RAY_SHADOW;
-#endif
- /* Do actual shadow shading.
- * First of all, we check if integrator requires transparent shadows.
- * if not, we use simplest and fastest ever way to calculate occlusion.
- * Do not do this in OptiX to avoid the additional trace call.
- */
-#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__)
- Intersection isect;
-# ifdef __TRANSPARENT_SHADOWS__
- if (!kernel_data.integrator.transparent_shadows)
-# endif
- {
- return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow);
- }
-#endif
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
- /* For the transparent shadows we try to use record-all logic on the
- * devices which supports this.
- */
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- /* Check transparent bounces here, for volume scatter which can do
- * lighting before surface path termination is checked.
- */
- if (state->transparent_bounce >= transparent_max_bounce) {
- return true;
- }
- uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-# if defined(__KERNEL_OPTIX__)
- /* Always use record-all behavior in OptiX, but ensure there are no out of bounds
- * accesses to the hit stack.
- */
- max_hits = min(max_hits, SHADOW_STACK_MAX_HITS - 1);
-# elif defined(__KERNEL_GPU__)
- /* On GPU we do tricky with tracing opaque ray first, this avoids speed
- * regressions in some files.
- *
- * TODO(sergey): Check why using record-all behavior causes slowdown in such
- * cases. Could that be caused by a higher spill pressure?
- */
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect);
- const bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, &isect) : false;
- if (!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) {
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, &isect, blocked, is_transparent_isect, shadow);
- }
-# endif /* __KERNEL_GPU__ */
- return shadow_blocked_transparent_all(
- kg, sd, shadow_sd, state, visibility, ray, max_hits, shadow);
-# else /* __SHADOW_RECORD_ALL__ */
- /* Fallback to a slowest version which works on all devices. */
- return shadow_blocked_transparent_stepped(
- kg, sd, shadow_sd, state, visibility, ray, &isect, shadow);
-# endif /* __SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-}
-
-#undef SHADOW_STACK_MAX_HITS
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shadow_catcher.h b/intern/cycles/kernel/kernel_shadow_catcher.h
new file mode 100644
index 00000000000..824749818a4
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shadow_catcher.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state_util.h"
+#include "kernel/kernel_path_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Check whether current surface bounce is where path is to be split for the shadow catcher. */
+ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_ARGS,
+ const int object_flag)
+{
+#ifdef __SHADOW_CATCHER__
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
+
+  /* Check the flag first, avoiding fetches from global memory. */
+ if ((object_flag & SD_OBJECT_SHADOW_CATCHER) == 0) {
+ return false;
+ }
+ if (object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if ((path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) == 0) {
+ /* Split only on primary rays, secondary bounces are to treat shadow catcher as a regular
+ * object. */
+ return false;
+ }
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ return false;
+ }
+
+ return true;
+#else
+ (void)object_flag;
+ return false;
+#endif
+}
+
+/* Check whether the current path can still split. */
+ccl_device_inline bool kernel_shadow_catcher_path_can_split(INTEGRATOR_STATE_CONST_ARGS)
+{
+ if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
+ /* Shadow catcher was already hit and the state was split. No further split is allowed. */
+ return false;
+ }
+
+ return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
+}
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. */
+ccl_device_inline bool kernel_shadow_catcher_split(INTEGRATOR_STATE_ARGS, const int object_flags)
+{
+#ifdef __SHADOW_CATCHER__
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, object_flags)) {
+ return false;
+ }
+
+ /* The split is to be done. Mark the current state as such, so that it stops contributing to the
+ * shadow catcher matte pass, but keeps contributing to the combined pass. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+ /* Split new state from the current one. This new state will only track contribution of shadow
+ * catcher objects ignoring non-catcher objects. */
+ integrator_state_shadow_catcher_split(INTEGRATOR_STATE_PASS);
+
+ return true;
+#else
+ (void)object_flags;
+ return false;
+#endif
+}
+
+#ifdef __SHADOW_CATCHER__
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_HIT) == 0;
+}
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_PASS;
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
deleted file mode 100644
index 677504a4045..00000000000
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* BSSRDF using disk based importance sampling.
- *
- * BSSRDF Importance Sampling, SIGGRAPH 2013
- * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
- */
-
-ccl_device_inline float3
-subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, float r, bool all)
-{
- /* This is the Veach one-sample model with balance heuristic, some pdf
- * factors drop out when using balance heuristic weighting. For branched
- * path tracing (all) we sample all closure and don't use MIS. */
- float3 eval_sum = zero_float3();
- float pdf_sum = 0.0f;
- float sample_weight_inv = 0.0f;
-
- if (!all) {
- float sample_weight_sum = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- sample_weight_sum += sc->sample_weight;
- }
- }
-
- sample_weight_inv = 1.0f / sample_weight_sum;
- }
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- /* in case of branched path integrate we sample all bssrdf's once,
- * for path trace we pick one, so adjust pdf for that */
- float sample_weight = (all) ? 1.0f : sc->sample_weight * sample_weight_inv;
-
- /* compute pdf */
- float3 eval = bssrdf_eval(sc, r);
- float pdf = bssrdf_pdf(sc, disk_r);
-
- eval_sum += sc->weight * eval;
- pdf_sum += sample_weight * pdf;
- }
- }
-
- return (pdf_sum > 0.0f) ? eval_sum / pdf_sum : zero_float3();
-}
-
-ccl_device_inline float3 subsurface_scatter_walk_eval(ShaderData *sd,
- const ShaderClosure *sc,
- float3 throughput,
- bool all)
-{
- /* This is the Veach one-sample model with balance heuristic, some pdf
- * factors drop out when using balance heuristic weighting. For branched
- * path tracing (all) we sample all closure and don't use MIS. */
- if (!all) {
- float bssrdf_weight = 0.0f;
- float weight = sc->sample_weight;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSSRDF(sc->type)) {
- bssrdf_weight += sc->sample_weight;
- }
- }
- throughput *= bssrdf_weight / weight;
- }
- return throughput;
-}
-
-/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(
- KernelGlobals *kg, ShaderData *sd, ClosureType type, float roughness, float3 weight, float3 N)
-{
- sd->flag &= ~SD_CLOSURE_FLAGS;
- sd->num_closure = 0;
- sd->num_closure_left = kernel_data.integrator.max_closures;
-
-#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
- sd, sizeof(PrincipledDiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- bsdf->roughness = roughness;
- sd->flag |= bsdf_principled_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular Disney principled diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
- }
- }
- else if (CLOSURE_IS_BSDF_BSSRDF(type) || CLOSURE_IS_BSSRDF(type))
-#endif /* __PRINCIPLED__ */
- {
- DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- sd->flag |= bsdf_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
- }
- }
-}
-
-/* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */
-ccl_device float3 subsurface_color_pow(float3 color, float exponent)
-{
- color = max(color, zero_float3());
-
- if (exponent == 1.0f) {
- /* nothing to do */
- }
- else if (exponent == 0.5f) {
- color.x = sqrtf(color.x);
- color.y = sqrtf(color.y);
- color.z = sqrtf(color.z);
- }
- else {
- color.x = powf(color.x, exponent);
- color.y = powf(color.y, exponent);
- color.z = powf(color.z, exponent);
- }
-
- return color;
-}
-
-ccl_device void subsurface_color_bump_blur(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float3 *eval, float3 *N)
-{
- /* average color and texture blur at outgoing point */
- float texture_blur;
- float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur);
-
- /* do we have bump mapping? */
- bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0;
-
- if (bump || texture_blur > 0.0f) {
- /* average color and normal at incoming point */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- float3 in_color = shader_bssrdf_sum(sd, (bump) ? N : NULL, NULL);
-
- /* we simply divide out the average color and multiply with the average
- * of the other one. we could try to do this per closure but it's quite
- * tricky to match closures between shader evaluations, their number and
- * order may change, this is simpler */
- if (texture_blur > 0.0f) {
- out_color = subsurface_color_pow(out_color, texture_blur);
- in_color = subsurface_color_pow(in_color, texture_blur);
-
- *eval *= safe_divide_color(in_color, out_color);
- }
- }
-}
-
-/* Subsurface scattering step, from a point on the surface to other
- * nearby points on the same object.
- */
-ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- const ShaderClosure *sc,
- uint *lcg_state,
- float disk_u,
- float disk_v,
- bool all)
-{
- /* pick random axis in local frame and point on disk */
- float3 disk_N, disk_T, disk_B;
- float pick_pdf_N, pick_pdf_T, pick_pdf_B;
-
- disk_N = sd->Ng;
- make_orthonormals(disk_N, &disk_T, &disk_B);
-
- if (disk_v < 0.5f) {
- pick_pdf_N = 0.5f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.25f;
- disk_v *= 2.0f;
- }
- else if (disk_v < 0.75f) {
- float3 tmp = disk_N;
- disk_N = disk_T;
- disk_T = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.5f;
- pick_pdf_B = 0.25f;
- disk_v = (disk_v - 0.5f) * 4.0f;
- }
- else {
- float3 tmp = disk_N;
- disk_N = disk_B;
- disk_B = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.5f;
- disk_v = (disk_v - 0.75f) * 4.0f;
- }
-
- /* sample point on disk */
- float phi = M_2PI_F * disk_v;
- float disk_height, disk_r;
-
- bssrdf_sample(sc, disk_u, &disk_r, &disk_height);
-
- float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
-
- /* create ray */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = sd->P + disk_N * disk_height + disk_P;
- ray->D = -disk_N;
- ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
- ray->time = sd->time;
-
- /* intersect with the same object. if multiple intersections are found it
- * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
- scene_intersect_local(kg, ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS);
- int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
-
- for (int hit = 0; hit < num_eval_hits; hit++) {
- /* Quickly retrieve P and Ng without setting up ShaderData. */
- float3 hit_P;
- if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray);
- }
-#ifdef __OBJECT_MOTION__
- else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
- float3 verts[3];
- motion_triangle_vertices(kg,
- sd->object,
- kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
- sd->time,
- verts);
- hit_P = motion_triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray, verts);
- }
-#endif /* __OBJECT_MOTION__ */
- else {
- ss_isect->weight[hit] = zero_float3();
- continue;
- }
-
- float3 hit_Ng = ss_isect->Ng[hit];
- if (ss_isect->hits[hit].object != OBJECT_NONE) {
- object_normal_transform(kg, sd, &hit_Ng);
- }
-
- /* Probability densities for local frame axes. */
- float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
- float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
- float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
-
- /* Multiple importance sample between 3 axes, power heuristic
- * found to be slightly better than balance heuristic. pdf_N
- * in the MIS weight and denominator cancelled out. */
- float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
- if (ss_isect->num_hits > BSSRDF_MAX_HITS) {
- w *= ss_isect->num_hits / (float)BSSRDF_MAX_HITS;
- }
-
- /* Real distance to sampled point. */
- float r = len(hit_P - sd->P);
-
- /* Evaluate profiles. */
- float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
-
- ss_isect->weight[hit] = eval;
- }
-
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return num_eval_hits;
-}
-
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
- optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness);
-}
-extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup(
-#else
-ccl_device_noinline void subsurface_scatter_multi_setup(
-#endif
- KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
-
- /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
-#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
- kernel_split_params.dummy_sd_flag = sd->flag;
-#endif
-
- /* Setup new shading point. */
- shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
-
- /* Optionally blur colors and bump mapping. */
- float3 weight = ss_isect->weight[hit];
- float3 N = sd->N;
- subsurface_color_bump_blur(kg, sd, state, &weight, &N);
-
- /* Setup diffuse BSDF. */
- subsurface_scatter_setup_diffuse_bsdf(kg, sd, type, roughness, weight, N);
-}
-
-/* Random walk subsurface scattering.
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
-
-ccl_device void subsurface_random_walk_remap(const float A,
- const float d,
- float *sigma_t,
- float *alpha)
-{
- /* Compute attenuation and scattering coefficients from albedo. */
- *alpha = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f)));
- const float s = 1.9f - A + 3.5f * sqr(A - 0.8f);
-
- *sigma_t = 1.0f / fmaxf(d * s, 1e-16f);
-}
-
-ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc,
- float3 *sigma_t,
- float3 *alpha,
- float3 *weight)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- const float3 A = bssrdf->albedo;
- const float3 d = bssrdf->radius;
- float sigma_t_x, sigma_t_y, sigma_t_z;
- float alpha_x, alpha_y, alpha_z;
-
- subsurface_random_walk_remap(A.x, d.x, &sigma_t_x, &alpha_x);
- subsurface_random_walk_remap(A.y, d.y, &sigma_t_y, &alpha_y);
- subsurface_random_walk_remap(A.z, d.z, &sigma_t_z, &alpha_z);
-
- *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
- *alpha = make_float3(alpha_x, alpha_y, alpha_z);
-
- /* Closure mixing and Fresnel weights separate from albedo. */
- *weight = safe_divide_color(bssrdf->weight, A);
-}
-
-/* References for Dwivedi sampling:
- *
- * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
- * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
- * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
- *
- * [2] "Improving the Dwivedi Sampling Scheme"
- * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
- * https://cg.ivd.kit.edu/1951.php
- *
- * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
- * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
- * https://iliyan.com/publications/RenderingCourse2020
- */
-
-ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
-{
- /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
- return 1.0f / ((v - cos_theta) * phase_log);
-}
-
-ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
-{
- /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
- * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
- * we can implement the power function like this. */
- return v - (v + 1) * expf(-rand * phase_log);
-}
-
-ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
-{
- /* Eq. 67 from [3] */
- return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
-}
-
-ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
-{
- float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
- float phi = M_2PI_F * randv;
- float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
-
- float3 T, B;
- make_orthonormals(D, &T, &B);
- return dir.x * T + dir.y * B + dir.z * D;
-}
-
-ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
- float t,
- bool hit,
- float3 *transmittance)
-{
- float3 T = volume_color_transmittance(sigma_t, t);
- if (transmittance) {
- *transmittance = T;
- }
- return hit ? T : sigma_t * T;
-}
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- bool
- subsurface_random_walk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- const float bssrdf_u,
- const float bssrdf_v,
- bool all)
-{
- /* Sample diffuse surface scatter into the object. */
- float3 D;
- float pdf;
- sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf);
- if (dot(-sd->Ng, D) <= 0.0f) {
- return 0;
- }
-
- /* Convert subsurface to volume coefficients.
- * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
- float3 sigma_t, alpha;
- float3 throughput = one_float3();
- subsurface_random_walk_coefficients(sc, &sigma_t, &alpha, &throughput);
- float3 sigma_s = sigma_t * alpha;
-
- /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
- * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
- * for making the code significantly more complex and slower (if direction sampling depends on
- * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
- *
- * Since the strength of the guided sampling increases as alpha gets lower, using a value that
- * is too low results in fireflies while one that's too high just gives a bit more noise.
- * Therefore, the code here uses the highest of the three albedos to be safe. */
- float diffusion_length = diffusion_length_dwivedi(max3(alpha));
- /* Precompute term for phase sampling. */
- float phase_log = logf((diffusion_length + 1) / (diffusion_length - 1));
-
- /* Setup ray. */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->D = D;
- ray->t = FLT_MAX;
- ray->time = sd->time;
-
- /* Modify state for RNGs, decorrelated from other paths. */
- uint prev_rng_offset = state->rng_offset;
- uint prev_rng_hash = state->rng_hash;
- state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef);
-
- /* Random walk until we hit the surface again. */
- bool hit = false;
- bool have_opposite_interface = false;
- float opposite_distance = 0.0f;
-
- /* Todo: Disable for alpha>0.999 or so? */
- const float guided_fraction = 0.75f;
-
- for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
- /* Advance random number offset. */
- state->rng_offset += PRNG_BOUNCE_NUM;
-
- /* Sample color channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float randt = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- /* We need the result of the raycast to compute the full guided PDF, so just remember the
- * relevant terms to avoid recomputing them later. */
- float backward_fraction = 0.0f;
- float forward_pdf_factor = 0.0f;
- float forward_stretching = 1.0f;
- float backward_pdf_factor = 0.0f;
- float backward_stretching = 1.0f;
-
- /* For the initial ray, we already know the direction, so just do classic distance sampling. */
- if (bounce > 0) {
- /* Decide whether we should use guided or classic sampling. */
- bool guided = (path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE) < guided_fraction);
-
- /* Determine if we want to sample away from the incoming interface.
- * This only happens if we found a nearby opposite interface, and the probability for it
- * depends on how close we are to it already.
- * This probability term comes from the recorded presentation of [3]. */
- bool guide_backward = false;
- if (have_opposite_interface) {
- /* Compute distance of the random walk between the tangent plane at the starting point
- * and the assumed opposite interface (the parallel plane that contains the point we
- * found in our ray query for the opposite side). */
- float x = clamp(dot(ray->P - sd->P, -sd->N), 0.0f, opposite_distance);
- backward_fraction = 1.0f / (1.0f + expf((opposite_distance - 2 * x) / diffusion_length));
- guide_backward = path_state_rng_1D(kg, state, PRNG_TERMINATE) < backward_fraction;
- }
-
- /* Sample scattering direction. */
- float scatter_u, scatter_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v);
- float cos_theta;
- if (guided) {
- cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
- /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
- * sign here is enough to sample from that instead. */
- if (guide_backward) {
- cos_theta = -cos_theta;
- }
- }
- else {
- cos_theta = 2.0f * scatter_u - 1.0f;
- }
- ray->D = direction_from_cosine(sd->N, cos_theta, scatter_v);
-
- /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
- * Since phase sampling is channel-independent, we can get away with applying a factor
- * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
- * it cancel with an equivalent term in the numerator of the full estimator.
- * For the backward PDF, we again reuse the same probability distribution with a sign swap.
- */
- forward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta);
- backward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta);
-
- /* Prepare distance sampling.
- * For the backwards case, this also needs the sign swapped since now directions against
- * sd->N (and therefore with negative cos_theta) are preferred. */
- forward_stretching = (1.0f - cos_theta / diffusion_length);
- backward_stretching = (1.0f + cos_theta / diffusion_length);
- if (guided) {
- sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
- }
- }
-
- /* Sample direction along ray. */
- float t = -logf(1.0f - randt) / sample_sigma_t;
-
- /* On the first bounce, we use the raycast to check if the opposite side is nearby.
- * If yes, we will later use backwards guided sampling in order to have a decent
- * chance of connecting to it.
- * Todo: Maybe use less than 10 times the mean free path? */
- ray->t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
- scene_intersect_local(kg, ray, ss_isect, sd->object, NULL, 1);
- hit = (ss_isect->num_hits > 0);
-
- if (hit) {
-#ifdef __KERNEL_OPTIX__
- /* t is always in world space with OptiX. */
- ray->t = ss_isect->hits[0].t;
-#else
- /* Compute world space distance to surface hit. */
- float3 D = ray->D;
- object_inverse_dir_transform(kg, sd, &D);
- D = normalize(D) * ss_isect->hits[0].t;
- object_dir_transform(kg, sd, &D);
- ray->t = len(D);
-#endif
- }
-
- if (bounce == 0) {
- /* Check if we hit the opposite side. */
- if (hit) {
- have_opposite_interface = true;
- opposite_distance = dot(ray->P + ray->t * ray->D - sd->P, -sd->N);
- }
- /* Apart from the opposite side check, we were supposed to only trace up to distance t,
- * so check if there would have been a hit in that case. */
- hit = ray->t < t;
- }
-
- /* Use the distance to the exit point for the throughput update if we found one. */
- if (hit) {
- t = ray->t;
- }
- else if (bounce == 0) {
- /* Restore original position if nothing was hit after the first bounce,
- * without the ray_offset() that was added to avoid self-intersection.
- * Otherwise if that offset is relatively large compared to the scattering
- * radius, we never go back up high enough to exit the surface. */
- ray->P = sd->P;
- }
-
- /* Advance to new scatter location. */
- ray->P += t * ray->D;
-
- float3 transmittance;
- float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
- if (bounce > 0) {
- /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
- float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
-
- if (have_opposite_interface) {
- /* First step of MIS: Depending on geometry we might have two methods for guided
- * sampling, so perform MIS between them. */
- float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
- guided_pdf = mix(
- guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
- }
- else {
- /* Just include phase sampling factor otherwise. */
- guided_pdf *= forward_pdf_factor;
- }
-
- /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
- pdf = mix(pdf, guided_pdf, guided_fraction);
- }
-
- /* Finally, we're applying MIS again to combine the three color channels.
- * Altogether, the MIS computation combines up to nine different estimators:
- * {classic, guided, backward_guided} x {r, g, b} */
- throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
-
- if (hit) {
- /* If we hit the surface, we are done. */
- break;
- }
- else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
- throughput.y < VOLUME_THROUGHPUT_EPSILON &&
- throughput.z < VOLUME_THROUGHPUT_EPSILON) {
- /* Avoid unnecessary work and precision issue when throughput gets really small. */
- break;
- }
- }
-
- kernel_assert(isfinite_safe(throughput.x) && isfinite_safe(throughput.y) &&
- isfinite_safe(throughput.z));
-
- state->rng_offset = prev_rng_offset;
- state->rng_hash = prev_rng_hash;
-
- /* Return number of hits in ss_isect. */
- if (!hit) {
- return 0;
- }
-
- /* TODO: gain back performance lost from merging with disk BSSRDF. We
- * only need to return on hit so this indirect ray push/pop overhead
- * is not actually needed, but it does keep the code simpler. */
- ss_isect->weight[0] = subsurface_scatter_walk_eval(sd, sc, throughput, all);
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return 1;
-}
-
-ccl_device_inline int subsurface_scatter_multi_intersect(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- uint *lcg_state,
- float bssrdf_u,
- float bssrdf_v,
- bool all)
-{
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- return subsurface_scatter_disk(kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all);
- }
- else {
- return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v, all);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index c8e01677d09..bf9b94c1753 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -78,7 +78,7 @@ KERNEL_TEX(KernelShader, __shaders)
KERNEL_TEX(float, __lookup_table)
/* sobol */
-KERNEL_TEX(uint, __sample_pattern_lut)
+KERNEL_TEX(float, __sample_pattern_lut)
/* image textures */
KERNEL_TEX(TextureInfo, __texture_info)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 7cbe18acf28..927e60e8729 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_TYPES_H__
-#define __KERNEL_TYPES_H__
+#pragma once
#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
# include <embree3/rtcore.h>
@@ -60,27 +59,9 @@ CCL_NAMESPACE_BEGIN
#define PRIM_NONE (~0)
#define LAMP_NONE (~0)
#define ID_NONE (0.0f)
+#define PASS_UNUSED (~0)
-#define VOLUME_STACK_SIZE 32
-
-/* Split kernel constants */
-#define WORK_POOL_SIZE_GPU 64
-#define WORK_POOL_SIZE_CPU 1
-#ifdef __KERNEL_GPU__
-# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU
-#else
-# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
-#endif
-
-#define SHADER_SORT_BLOCK_SIZE 2048
-
-#ifdef __KERNEL_OPENCL__
-# define SHADER_SORT_LOCAL_SIZE 64
-#elif defined(__KERNEL_CUDA__)
-# define SHADER_SORT_LOCAL_SIZE 32
-#else
-# define SHADER_SORT_LOCAL_SIZE 1
-#endif
+#define VOLUME_STACK_SIZE 4
/* Kernel features */
#define __SOBOL__
@@ -93,7 +74,7 @@ CCL_NAMESPACE_BEGIN
#define __INTERSECTION_REFINE__
#define __CLAMP_SAMPLE__
#define __PATCH_EVAL__
-#define __SHADOW_TRICKS__
+#define __SHADOW_CATCHER__
#define __DENOISING_FEATURES__
#define __SHADER_RAYTRACE__
#define __AO__
@@ -102,7 +83,6 @@ CCL_NAMESPACE_BEGIN
#define __SVM__
#define __EMISSION__
#define __HOLDOUT__
-#define __MULTI_CLOSURE__
#define __TRANSPARENT_SHADOWS__
#define __BACKGROUND_MIS__
#define __LAMP_MIS__
@@ -112,7 +92,6 @@ CCL_NAMESPACE_BEGIN
#define __PRINCIPLED__
#define __SUBSURFACE__
#define __VOLUME__
-#define __VOLUME_SCATTER__
#define __CMJ__
#define __SHADOW_RECORD_ALL__
#define __BRANCHED_PATH__
@@ -122,106 +101,60 @@ CCL_NAMESPACE_BEGIN
# ifdef WITH_OSL
# define __OSL__
# endif
-# define __VOLUME_DECOUPLED__
# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
-#ifdef __KERNEL_CUDA__
-# ifdef __SPLIT_KERNEL__
-# undef __BRANCHED_PATH__
-# endif
-#endif /* __KERNEL_CUDA__ */
-
#ifdef __KERNEL_OPTIX__
# undef __BAKING__
-# undef __BRANCHED_PATH__
#endif /* __KERNEL_OPTIX__ */
-#ifdef __KERNEL_OPENCL__
-#endif /* __KERNEL_OPENCL__ */
-
/* Scene-based selective features compilation. */
-#ifdef __NO_CAMERA_MOTION__
-# undef __CAMERA_MOTION__
-#endif
-#ifdef __NO_OBJECT_MOTION__
-# undef __OBJECT_MOTION__
-#endif
-#ifdef __NO_HAIR__
-# undef __HAIR__
-#endif
-#ifdef __NO_VOLUME__
-# undef __VOLUME__
-# undef __VOLUME_SCATTER__
-#endif
-#ifdef __NO_SUBSURFACE__
-# undef __SUBSURFACE__
-#endif
-#ifdef __NO_BAKING__
-# undef __BAKING__
-#endif
-#ifdef __NO_BRANCHED_PATH__
-# undef __BRANCHED_PATH__
-#endif
-#ifdef __NO_PATCH_EVAL__
-# undef __PATCH_EVAL__
-#endif
-#ifdef __NO_TRANSPARENT__
-# undef __TRANSPARENT_SHADOWS__
-#endif
-#ifdef __NO_SHADOW_TRICKS__
-# undef __SHADOW_TRICKS__
-#endif
-#ifdef __NO_PRINCIPLED__
-# undef __PRINCIPLED__
-#endif
-#ifdef __NO_DENOISING__
-# undef __DENOISING_FEATURES__
-#endif
-#ifdef __NO_SHADER_RAYTRACE__
-# undef __SHADER_RAYTRACE__
+#ifdef __KERNEL_FEATURES__
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_CAMERA_MOTION)
+# undef __CAMERA_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_OBJECT_MOTION)
+# undef __OBJECT_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_HAIR)
+# undef __HAIR__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_VOLUME)
+# undef __VOLUME__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE)
+# undef __SUBSURFACE__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING)
+# undef __BAKING__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION)
+# undef __PATCH_EVAL__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_TRANSPARENT)
+# undef __TRANSPARENT_SHADOWS__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SHADOW_CATCHER)
+# undef __SHADOW_CATCHER__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PRINCIPLED)
+# undef __PRINCIPLED__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_DENOISING)
+# undef __DENOISING_FEATURES__
+# endif
#endif
#ifdef WITH_CYCLES_DEBUG_NAN
# define __KERNEL_DEBUG_NAN__
#endif
+/* Features that enable others */
+
#if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__)
# define __BVH_LOCAL__
#endif
-/* Shader Evaluation */
-
-typedef enum ShaderEvalType {
- SHADER_EVAL_DISPLACE,
- SHADER_EVAL_BACKGROUND,
- /* bake types */
- SHADER_EVAL_BAKE, /* no real shade, it's used in the code to
- * differentiate the type of shader eval from the above
- */
- /* data passes */
- SHADER_EVAL_NORMAL,
- SHADER_EVAL_UV,
- SHADER_EVAL_ROUGHNESS,
- SHADER_EVAL_DIFFUSE_COLOR,
- SHADER_EVAL_GLOSSY_COLOR,
- SHADER_EVAL_TRANSMISSION_COLOR,
- SHADER_EVAL_EMISSION,
- SHADER_EVAL_AOV_COLOR,
- SHADER_EVAL_AOV_VALUE,
-
- /* light passes */
- SHADER_EVAL_AO,
- SHADER_EVAL_COMBINED,
- SHADER_EVAL_SHADOW,
- SHADER_EVAL_DIFFUSE,
- SHADER_EVAL_GLOSSY,
- SHADER_EVAL_TRANSMISSION,
-
- /* extra */
- SHADER_EVAL_ENVIRONMENT,
-} ShaderEvalType;
-
/* Path Tracing
* note we need to keep the u/v pairs at even values */
@@ -252,8 +185,7 @@ enum PathTraceDimension {
enum SamplingPattern {
SAMPLING_PATTERN_SOBOL = 0,
- SAMPLING_PATTERN_CMJ = 1,
- SAMPLING_PATTERN_PMJ = 2,
+ SAMPLING_PATTERN_PMJ = 1,
SAMPLING_NUM_PATTERNS,
};
@@ -261,7 +193,12 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- /* Ray visibility. */
+ /* --------------------------------------------------------------------
+ * Ray visibility.
+ *
+ * NOTE: Recalculated after a surface bounce.
+ */
+
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
@@ -269,57 +206,106 @@ enum PathRayFlag {
PATH_RAY_GLOSSY = (1 << 4),
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
+ PATH_RAY_VOLUME_SCATTER = (1 << 7),
/* Shadow ray visibility. */
- PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7),
- PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8),
- PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | PATH_RAY_SHADOW_OPAQUE_CATCHER),
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9),
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10),
- PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
- PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+ PATH_RAY_SHADOW_OPAQUE = (1 << 8),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 9),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE | PATH_RAY_SHADOW_TRANSPARENT),
- /* Unused, free to reuse. */
- PATH_RAY_UNUSED = (1 << 11),
+ /* Special flag to tag unaligned BVH nodes.
+ * Only set and used in BVH nodes to distinguish how to interpret bounding box information stored
+ * in the node (either it should be intersected as AABB or as OBB). */
+ PATH_RAY_NODE_UNALIGNED = (1 << 10),
- /* Ray visibility for volume scattering. */
- PATH_RAY_VOLUME_SCATTER = (1 << 12),
-
- /* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = (1 << 13),
+ /* Subset of flags used for ray visibility for intersection.
+ *
+ * NOTE: SHADOW_CATCHER macros below assume there are no more than
+ * 16 visibility bits. */
+ PATH_RAY_ALL_VISIBILITY = ((1 << 11) - 1),
- PATH_RAY_ALL_VISIBILITY = ((1 << 14) - 1),
+ /* --------------------------------------------------------------------
+ * Path flags.
+ */
/* Don't apply multiple importance sampling weights to emission from
* lamp or surface hits, because they were not direct light sampled. */
- PATH_RAY_MIS_SKIP = (1 << 14),
+ PATH_RAY_MIS_SKIP = (1 << 11),
+
/* Diffuse bounce earlier in the path, skip SSS to improve performance
* and avoid branching twice with disk sampling SSS. */
- PATH_RAY_DIFFUSE_ANCESTOR = (1 << 15),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 12),
+
/* Single pass has been written. */
- PATH_RAY_SINGLE_PASS_DONE = (1 << 16),
- /* Ray is behind a shadow catcher. */
- PATH_RAY_SHADOW_CATCHER = (1 << 17),
- /* Store shadow data for shadow catcher or denoising. */
- PATH_RAY_STORE_SHADOW_INFO = (1 << 18),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 13),
+
/* Zero background alpha, for camera or transparent glass rays. */
- PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 19),
+ PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 14),
+
/* Terminate ray immediately at next bounce. */
- PATH_RAY_TERMINATE_IMMEDIATE = (1 << 20),
+ PATH_RAY_TERMINATE_ON_NEXT_SURFACE = (1 << 15),
+ PATH_RAY_TERMINATE_IN_NEXT_VOLUME = (1 << 16),
+
/* Ray is to be terminated, but continue with transparent bounces and
* emission as long as we encounter them. This is required to make the
* MIS between direct and indirect light rays match, as shadow rays go
* through transparent surfaces to reach emission too. */
- PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21),
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 17),
+
+ /* Terminate ray immediately after volume shading. */
+ PATH_RAY_TERMINATE_AFTER_VOLUME = (1 << 18),
+
/* Ray is to be terminated. */
- PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE | PATH_RAY_TERMINATE_AFTER_TRANSPARENT),
+ PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_ON_NEXT_SURFACE | PATH_RAY_TERMINATE_IN_NEXT_VOLUME |
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT | PATH_RAY_TERMINATE_AFTER_VOLUME),
+
/* Path and shader is being evaluated for direct lighting emission. */
- PATH_RAY_EMISSION = (1 << 22)
+ PATH_RAY_EMISSION = (1 << 19),
+
+ /* Perform subsurface scattering. */
+ PATH_RAY_SUBSURFACE = (1 << 20),
+
+ /* Contribute to denoising features. */
+ PATH_RAY_DENOISING_FEATURES = (1 << 21),
+
+ /* Render pass categories. */
+ PATH_RAY_REFLECT_PASS = (1 << 22),
+ PATH_RAY_TRANSMISSION_PASS = (1 << 23),
+ PATH_RAY_VOLUME_PASS = (1 << 24),
+ PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
+
+ /* Shadow ray is for a light or surface. */
+ PATH_RAY_SHADOW_FOR_LIGHT = (1 << 25),
+
+ /* A shadow catcher object was hit and the path was split into two. */
+ PATH_RAY_SHADOW_CATCHER_HIT = (1 << 26),
+
+ /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
+ * their dedicated pass for later division.
+ *
+ * NOTE: Is not covered with `PATH_RAY_ANY_PASS` because shadow catcher does special handling
+ * which is separate from the light passes. */
+ PATH_RAY_SHADOW_CATCHER_PASS = (1 << 27),
+
+ /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
+ PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1 << 28),
};
+/* Configure ray visibility bits for rays and objects respectively,
+ * to make shadow catchers work.
+ *
+ * On shadow catcher paths we want to ignore any intersections with non-catchers,
+ * whereas on regular paths we want to intersect all objects. */
+
+#define SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) ((visibility) << 16)
+
+#define SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility) \
+ (((path_flag)&PATH_RAY_SHADOW_CATCHER_PASS) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : \
+ (visibility))
+
+#define SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility) \
+ (((is_shadow_catcher) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : 0) | (visibility))
+
/* Closure Label */
typedef enum ClosureLabel {
@@ -332,6 +318,7 @@ typedef enum ClosureLabel {
LABEL_TRANSPARENT = 32,
LABEL_VOLUME_SCATTER = 64,
LABEL_TRANSMIT_TRANSPARENT = 128,
+ LABEL_SUBSURFACE_SCATTER = 256,
} ClosureLabel;
/* Render Passes */
@@ -339,17 +326,35 @@ typedef enum ClosureLabel {
#define PASS_NAME_JOIN(a, b) a##_##b
#define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32))
-#define PASSMASK_COMPONENT(comp) \
- (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) | PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \
- PASSMASK(PASS_NAME_JOIN(comp, COLOR)))
-
+// NOTE: Keep in sync with `Pass::get_type_enum()`.
typedef enum PassType {
PASS_NONE = 0,
- /* Main passes */
+ /* Light Passes */
PASS_COMBINED = 1,
- PASS_DEPTH,
+ PASS_EMISSION,
+ PASS_BACKGROUND,
+ PASS_AO,
+ PASS_SHADOW,
+ PASS_DIFFUSE,
+ PASS_DIFFUSE_DIRECT,
+ PASS_DIFFUSE_INDIRECT,
+ PASS_GLOSSY,
+ PASS_GLOSSY_DIRECT,
+ PASS_GLOSSY_INDIRECT,
+ PASS_TRANSMISSION,
+ PASS_TRANSMISSION_DIRECT,
+ PASS_TRANSMISSION_INDIRECT,
+ PASS_VOLUME,
+ PASS_VOLUME_DIRECT,
+ PASS_VOLUME_INDIRECT,
+ PASS_CATEGORY_LIGHT_END = 31,
+
+ /* Data passes */
+ PASS_DEPTH = 32,
+ PASS_POSITION,
PASS_NORMAL,
+ PASS_ROUGHNESS,
PASS_UV,
PASS_OBJECT_ID,
PASS_MATERIAL_ID,
@@ -361,31 +366,35 @@ typedef enum PassType {
PASS_AOV_VALUE,
PASS_ADAPTIVE_AUX_BUFFER,
PASS_SAMPLE_COUNT,
- PASS_CATEGORY_MAIN_END = 31,
-
- PASS_MIST = 32,
- PASS_EMISSION,
- PASS_BACKGROUND,
- PASS_AO,
- PASS_SHADOW,
- PASS_LIGHT, /* no real pass, used to force use_light_pass */
- PASS_DIFFUSE_DIRECT,
- PASS_DIFFUSE_INDIRECT,
PASS_DIFFUSE_COLOR,
- PASS_GLOSSY_DIRECT,
- PASS_GLOSSY_INDIRECT,
PASS_GLOSSY_COLOR,
- PASS_TRANSMISSION_DIRECT,
- PASS_TRANSMISSION_INDIRECT,
PASS_TRANSMISSION_COLOR,
- PASS_VOLUME_DIRECT = 50,
- PASS_VOLUME_INDIRECT,
/* No Scatter color since it's tricky to define what it would even mean. */
- PASS_CATEGORY_LIGHT_END = 63,
+ PASS_MIST,
+ PASS_DENOISING_NORMAL,
+ PASS_DENOISING_ALBEDO,
+
+ /* PASS_SHADOW_CATCHER accumulates contribution of shadow catcher object which is not affected by
+ * any other object. The pass accessor will divide the combined pass by the shadow catcher. The
+ * result of this division is then to be multiplied with the backdrop. The alpha channel of this
+ * pass contains number of samples which contributed to the color components of the pass.
+ *
+ * PASS_SHADOW_CATCHER_SAMPLE_COUNT contains number of samples for which the path split
+ * happenned.
+ *
+ * PASS_SHADOW_CATCHER_MATTE contains pass which contains non-catcher objects. This pass is to be
+ * alpha-overed onto the backdrop (after multiplication). */
+ PASS_SHADOW_CATCHER,
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ PASS_CATEGORY_DATA_END = 63,
PASS_BAKE_PRIMITIVE,
PASS_BAKE_DIFFERENTIAL,
- PASS_CATEGORY_BAKE_END = 95
+ PASS_CATEGORY_BAKE_END = 95,
+
+ PASS_NUM,
} PassType;
#define PASS_ANY (~0)
@@ -398,158 +407,9 @@ typedef enum CryptomatteType {
CRYPT_ACCURATE = (1 << 3),
} CryptomatteType;
-typedef enum DenoisingPassOffsets {
- DENOISING_PASS_NORMAL = 0,
- DENOISING_PASS_NORMAL_VAR = 3,
- DENOISING_PASS_ALBEDO = 6,
- DENOISING_PASS_ALBEDO_VAR = 9,
- DENOISING_PASS_DEPTH = 12,
- DENOISING_PASS_DEPTH_VAR = 13,
- DENOISING_PASS_SHADOW_A = 14,
- DENOISING_PASS_SHADOW_B = 17,
- DENOISING_PASS_COLOR = 20,
- DENOISING_PASS_COLOR_VAR = 23,
- DENOISING_PASS_CLEAN = 26,
-
- DENOISING_PASS_PREFILTERED_DEPTH = 0,
- DENOISING_PASS_PREFILTERED_NORMAL = 1,
- DENOISING_PASS_PREFILTERED_SHADOWING = 4,
- DENOISING_PASS_PREFILTERED_ALBEDO = 5,
- DENOISING_PASS_PREFILTERED_COLOR = 8,
- DENOISING_PASS_PREFILTERED_VARIANCE = 11,
- DENOISING_PASS_PREFILTERED_INTENSITY = 14,
-
- DENOISING_PASS_SIZE_BASE = 26,
- DENOISING_PASS_SIZE_CLEAN = 3,
- DENOISING_PASS_SIZE_PREFILTERED = 15,
-} DenoisingPassOffsets;
-
-typedef enum eBakePassFilter {
- BAKE_FILTER_NONE = 0,
- BAKE_FILTER_DIRECT = (1 << 0),
- BAKE_FILTER_INDIRECT = (1 << 1),
- BAKE_FILTER_COLOR = (1 << 2),
- BAKE_FILTER_DIFFUSE = (1 << 3),
- BAKE_FILTER_GLOSSY = (1 << 4),
- BAKE_FILTER_TRANSMISSION = (1 << 5),
- BAKE_FILTER_EMISSION = (1 << 6),
- BAKE_FILTER_AO = (1 << 7),
-} eBakePassFilter;
-
-typedef enum BakePassFilterCombos {
- BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE |
- BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION |
- BAKE_FILTER_AO),
- BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION),
- BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION),
-} BakePassFilterCombos;
-
-typedef enum DenoiseFlag {
- DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
- DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
- DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
- DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
- DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
- DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
- DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1,
-} DenoiseFlag;
-
-typedef ccl_addr_space struct PathRadianceState {
-#ifdef __PASSES__
- float3 diffuse;
- float3 glossy;
- float3 transmission;
- float3 volume;
-
- float3 direct;
-#endif
-} PathRadianceState;
-
-typedef ccl_addr_space struct PathRadiance {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
- float transparent;
- float3 emission;
-#ifdef __PASSES__
- float3 background;
- float3 ao;
-
- float3 indirect;
- float3 direct_emission;
-
- float3 color_diffuse;
- float3 color_glossy;
- float3 color_transmission;
-
- float3 direct_diffuse;
- float3 direct_glossy;
- float3 direct_transmission;
- float3 direct_volume;
-
- float3 indirect_diffuse;
- float3 indirect_glossy;
- float3 indirect_transmission;
- float3 indirect_volume;
-
- float3 shadow;
- float mist;
-#endif
-
- struct PathRadianceState state;
-
-#ifdef __SHADOW_TRICKS__
- /* Total light reachable across the path, ignoring shadow blocked queries. */
- float3 path_total;
- /* Total light reachable across the path with shadow blocked queries
- * applied here.
- *
- * Dividing this figure by path_total will give estimate of shadow pass.
- */
- float3 path_total_shaded;
-
- /* Color of the background on which shadow is alpha-overed. */
- float3 shadow_background_color;
-
- /* Path radiance sum and throughput at the moment when ray hits shadow
- * catcher object.
- */
- float shadow_throughput;
-
- /* Accumulated transparency along the path after shadow catcher bounce. */
- float shadow_transparency;
-
- /* Indicate if any shadow catcher data is set. */
- int has_shadow_catcher;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- float3 denoising_normal;
- float3 denoising_albedo;
- float denoising_depth;
-#endif /* __DENOISING_FEATURES__ */
-} PathRadiance;
-
typedef struct BsdfEval {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
float3 diffuse;
-#ifdef __PASSES__
float3 glossy;
- float3 transmission;
- float3 transparent;
- float3 volume;
-#endif
-#ifdef __SHADOW_TRICKS__
- float3 sum_no_mis;
-#endif
} BsdfEval;
/* Shader Flag */
@@ -564,8 +424,10 @@ typedef enum ShaderFlag {
SHADER_EXCLUDE_TRANSMIT = (1 << 25),
SHADER_EXCLUDE_CAMERA = (1 << 24),
SHADER_EXCLUDE_SCATTER = (1 << 23),
+ SHADER_EXCLUDE_SHADOW_CATCHER = (1 << 22),
SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE | SHADER_EXCLUDE_GLOSSY | SHADER_EXCLUDE_TRANSMIT |
- SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER),
+ SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER |
+ SHADER_EXCLUDE_SHADOW_CATCHER),
SHADER_MASK = ~(SHADER_SMOOTH_NORMAL | SHADER_CAST_SHADOW | SHADER_AREA_LIGHT | SHADER_USE_MIS |
SHADER_EXCLUDE_ANY)
@@ -612,29 +474,14 @@ typedef struct differential {
/* Ray */
typedef struct Ray {
-/* TODO(sergey): This is only needed because current AMD
- * compiler has hard time building the kernel with this
- * reshuffle. And at the same time reshuffle will cause
- * less optimal CPU code in certain places.
- *
- * We'll get rid of this nasty exception once AMD compiler
- * is fixed.
- */
-#ifndef __KERNEL_OPENCL_AMD__
float3 P; /* origin */
float3 D; /* direction */
float t; /* length of the ray */
float time; /* time (for motion blur) */
-#else
- float t; /* length of the ray */
- float time; /* time (for motion blur) */
- float3 P; /* origin */
- float3 D; /* direction */
-#endif
#ifdef __RAY_DIFFERENTIALS__
- differential3 dP;
- differential3 dD;
+ float dP;
+ float dD;
#endif
} Ray;
@@ -661,9 +508,6 @@ typedef enum PrimitiveType {
PRIMITIVE_CURVE_RIBBON = (1 << 4),
PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5),
PRIMITIVE_VOLUME = (1 << 6),
- /* Lamp primitive is not included below on purpose,
- * since it is no real traceable primitive.
- */
PRIMITIVE_LAMP = (1 << 7),
PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE),
@@ -672,16 +516,14 @@ typedef enum PrimitiveType {
PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME),
PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK |
PRIMITIVE_MOTION_CURVE_RIBBON),
- PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME),
+ PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME |
+ PRIMITIVE_LAMP),
- /* Total number of different traceable primitives.
- * NOTE: This is an actual value, not a bitflag.
- */
- PRIMITIVE_NUM_TOTAL = 7,
+ PRIMITIVE_NUM = 8,
} PrimitiveType;
-#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type))
-#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL)
+#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM) | (type))
+#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM)
typedef enum CurveShapeType {
CURVE_RIBBON = 0,
@@ -760,20 +602,14 @@ typedef struct AttributeDescriptor {
/* Closure data */
-#ifdef __MULTI_CLOSURE__
-# ifdef __SPLIT_KERNEL__
-# define MAX_CLOSURE 1
-# else
-# ifndef __MAX_CLOSURE__
-# define MAX_CLOSURE 64
-# else
-# define MAX_CLOSURE __MAX_CLOSURE__
-# endif
-# endif
+#ifndef __MAX_CLOSURE__
+# define MAX_CLOSURE 64
#else
-# define MAX_CLOSURE 1
+# define MAX_CLOSURE __MAX_CLOSURE__
#endif
+#define MAX_VOLUME_CLOSURE 8
+
/* This struct is the base class for all closures. The common members are
* duplicated in all derived classes since we don't have C++ in the kernel
* yet, and because it lets us lay out the members to minimize padding. The
@@ -866,11 +702,14 @@ enum ShaderDataFlag {
SD_NEED_VOLUME_ATTRIBUTES = (1 << 28),
/* Shader has emission */
SD_HAS_EMISSION = (1 << 29),
+ /* Shader has raytracing */
+ SD_HAS_RAYTRACE = (1 << 30),
SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME |
SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR |
SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT |
- SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES)
+ SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES | SD_HAS_EMISSION |
+ SD_HAS_RAYTRACE)
};
/* Object flags. */
@@ -955,19 +794,19 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
#endif
#ifdef __OBJECT_MOTION__
- /* object <-> world space transformations, cached to avoid
- * re-interpolating them constantly for shading */
- Transform ob_tfm;
- Transform ob_itfm;
+ /* Object <-> world space transformations for motion blur, cached to avoid
+ * re-interpolating them constantly for shading. */
+ Transform ob_tfm_motion;
+ Transform ob_itfm_motion;
#endif
/* ray start position, only set for backgrounds */
float3 ray_P;
- differential3 ray_dP;
+ float ray_dP;
#ifdef __OSL__
- struct KernelGlobals *osl_globals;
- struct PathState *osl_path_state;
+ const struct KernelGlobals *osl_globals;
+ const struct IntegratorStateCPU *osl_path_state;
#endif
/* LCG state for closures that require additional random numbers. */
@@ -976,7 +815,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
/* Closure data, we store a fixed array of closures */
int num_closure;
int num_closure_left;
- float randb_closure;
float3 svm_closure_weight;
/* Closure weights summed directly, so we can evaluate
@@ -998,7 +836,22 @@ typedef ccl_addr_space struct ccl_align(16) ShaderDataTinyStorage
ShaderDataTinyStorage;
#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData *)shader_data_tiny_storage)
-/* Path State */
+/* Compact volume closures storage.
+ *
+ * Used for decoupled direct/indirect light closure storage. */
+
+ccl_addr_space struct ShaderVolumeClosure {
+ float3 weight;
+ float sample_weight;
+ float g;
+};
+
+ccl_addr_space struct ShaderVolumePhases {
+ ShaderVolumeClosure closure[MAX_VOLUME_CLOSURE];
+ int num_closure;
+};
+
+/* Volume Stack */
#ifdef __VOLUME__
typedef struct VolumeStack {
@@ -1007,53 +860,6 @@ typedef struct VolumeStack {
} VolumeStack;
#endif
-typedef struct PathState {
- /* see enum PathRayFlag */
- int flag;
-
- /* random number generator state */
- uint rng_hash; /* per pixel hash */
- int rng_offset; /* dimension offset */
- int sample; /* path sample number */
- int num_samples; /* total number of times this path will be sampled */
- float branch_factor; /* number of branches in indirect paths */
-
- /* bounce counting */
- int bounce;
- int diffuse_bounce;
- int glossy_bounce;
- int transmission_bounce;
- int transparent_bounce;
-
-#ifdef __DENOISING_FEATURES__
- float denoising_feature_weight;
- float3 denoising_feature_throughput;
-#endif /* __DENOISING_FEATURES__ */
-
- /* multiple importance sampling */
- float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
- float ray_pdf; /* last bounce pdf */
-#ifdef __LAMP_MIS__
- float ray_t; /* accumulated distance through transparent surfaces */
-#endif
-
- /* volume rendering */
-#ifdef __VOLUME__
- int volume_bounce;
- int volume_bounds_bounce;
- VolumeStack volume_stack[VOLUME_STACK_SIZE];
-#endif
-} PathState;
-
-#ifdef __VOLUME__
-typedef struct VolumeState {
-# ifdef __SPLIT_KERNEL__
-# else
- PathState ps;
-# endif
-} VolumeState;
-#endif
-
/* Struct to gather multiple nearby intersections. */
typedef struct LocalIntersection {
Ray ray;
@@ -1064,20 +870,6 @@ typedef struct LocalIntersection {
float3 Ng[LOCAL_MAX_HITS];
} LocalIntersection;
-/* Subsurface */
-
-/* Struct to gather SSS indirect rays and delay tracing them. */
-typedef struct SubsurfaceIndirectRays {
- PathState state[BSSRDF_MAX_HITS];
-
- int num_rays;
-
- struct Ray rays[BSSRDF_MAX_HITS];
- float3 throughputs[BSSRDF_MAX_HITS];
- struct PathRadianceState L_state[BSSRDF_MAX_HITS];
-} SubsurfaceIndirectRays;
-static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high.");
-
/* Constant Kernel Data
*
* These structs are passed from CPU to various devices, and the struct layout
@@ -1128,7 +920,7 @@ typedef struct KernelCamera {
/* render size */
float width, height;
- int resolution;
+ int pad1;
/* anamorphic lens bokeh */
float inv_aperture_ratio;
@@ -1169,11 +961,12 @@ typedef struct KernelFilm {
int light_pass_flag;
int pass_stride;
- int use_light_pass;
int pass_combined;
int pass_depth;
+ int pass_position;
int pass_normal;
+ int pass_roughness;
int pass_motion;
int pass_motion_weight;
@@ -1202,7 +995,13 @@ typedef struct KernelFilm {
int pass_shadow;
float pass_shadow_scale;
+
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+
int filter_table_offset;
+
int cryptomatte_passes;
int cryptomatte_depth;
int pass_cryptomatte;
@@ -1215,15 +1014,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
- int pass_denoising_data;
- int pass_denoising_clean;
- int denoising_flags;
+ int pass_denoising_normal;
+ int pass_denoising_albedo;
int pass_aov_color;
int pass_aov_value;
- int pass_aov_color_num;
- int pass_aov_value_num;
- int pad1, pad2, pad3;
/* XYZ to rendering color space transform. float4 instead of float3 to
* ensure consistent padding/alignment across devices. */
@@ -1234,19 +1029,54 @@ typedef struct KernelFilm {
int pass_bake_primitive;
int pass_bake_differential;
- int pad;
- /* viewport rendering options */
- int display_pass_stride;
- int display_pass_components;
- int display_divide_pass_stride;
- int use_display_exposure;
- int use_display_pass_alpha;
+ int use_approximate_shadow_catcher;
- int pad4, pad5, pad6;
+ int pad1, pad2, pad3;
} KernelFilm;
static_assert_align(KernelFilm, 16);
+typedef struct KernelFilmConvert {
+ int pass_offset;
+ int pass_stride;
+
+ int pass_use_exposure;
+ int pass_use_filter;
+
+ int pass_divide;
+ int pass_indirect;
+
+ int pass_combined;
+ int pass_sample_count;
+ int pass_adaptive_aux_buffer;
+ int pass_motion_weight;
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+ int pass_background;
+
+ float scale;
+ float exposure;
+ float scale_exposure;
+
+ int use_approximate_shadow_catcher;
+ int use_approximate_shadow_catcher_background;
+ int show_active_pixels;
+
+ /* Number of components to write to. */
+ int num_components;
+
+ /* Number of floats per pixel. When zero is the same as `num_components`.
+ * NOTE: Is ignored for half4 destination. */
+ int pixel_stride;
+
+ int is_denoised;
+
+ /* Padding. */
+ int pad1;
+} KernelFilmConvert;
+static_assert_align(KernelFilmConvert, 16);
+
typedef struct KernelBackground {
/* only shader index */
int surface_shader;
@@ -1255,11 +1085,6 @@ typedef struct KernelBackground {
int transparent;
float transparent_roughness_squared_threshold;
- /* ambient occlusion */
- float ao_factor;
- float ao_distance;
- float ao_bounces_factor;
-
/* portal sampling */
float portal_weight;
int num_portals;
@@ -1277,13 +1102,15 @@ typedef struct KernelBackground {
int map_res_y;
int use_mis;
+
+ /* Padding */
+ int pad1, pad2, pad3;
} KernelBackground;
static_assert_align(KernelBackground, 16);
typedef struct KernelIntegrator {
/* emission */
int use_direct_light;
- int use_ambient_occlusion;
int num_distribution;
int num_all_lights;
float pdf_triangles;
@@ -1299,7 +1126,10 @@ typedef struct KernelIntegrator {
int max_transmission_bounce;
int max_volume_bounce;
+ /* AO bounces */
int ao_bounces;
+ float ao_bounces_distance;
+ float ao_bounces_factor;
/* transparent */
int transparent_min_bounce;
@@ -1318,39 +1148,20 @@ typedef struct KernelIntegrator {
float sample_clamp_direct;
float sample_clamp_indirect;
- /* branched path */
- int branched;
- int volume_decoupled;
- int diffuse_samples;
- int glossy_samples;
- int transmission_samples;
- int ao_samples;
- int mesh_light_samples;
- int subsurface_samples;
- int sample_all_lights_direct;
- int sample_all_lights_indirect;
-
/* mis */
int use_lamp_mis;
/* sampler */
int sampling_pattern;
- int aa_samples;
- int adaptive_min_samples;
- int adaptive_step;
- int adaptive_stop_per_sample;
- float adaptive_threshold;
/* volume render */
int use_volumes;
int volume_max_steps;
float volume_step_rate;
- int volume_samples;
-
- int start_sample;
- int max_closures;
+ int has_shadow_catcher;
+ /* padding */
int pad1, pad2;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1401,14 +1212,19 @@ typedef struct KernelTables {
static_assert_align(KernelTables, 16);
typedef struct KernelBake {
+ int use;
int object_index;
int tri_offset;
- int type;
- int pass_filter;
+ int pad1;
} KernelBake;
static_assert_align(KernelBake, 16);
typedef struct KernelData {
+ uint kernel_features;
+ uint max_closures;
+ uint max_shaders;
+ uint pad;
+
KernelCamera cam;
KernelFilm film;
KernelBackground background;
@@ -1485,11 +1301,10 @@ typedef struct KernelLight {
int type;
float co[3];
int shader_id;
- int samples;
float max_bounces;
float random;
float strength[3];
- float pad1;
+ float pad1, pad2;
Transform tfm;
Transform itfm;
union {
@@ -1539,110 +1354,6 @@ typedef struct KernelShader {
} KernelShader;
static_assert_align(KernelShader, 16);
-/* Declarations required for split kernel */
-
-/* Macro for queues */
-/* Value marking queue's empty slot */
-#define QUEUE_EMPTY_SLOT -1
-
-/*
- * Queue 1 - Active rays
- * Queue 2 - Background queue
- * Queue 3 - Shadow ray cast kernel - AO
- * Queue 4 - Shadow ray cast kernel - direct lighting
- */
-
-/* Queue names */
-enum QueueNumber {
- /* All active rays and regenerated rays are enqueued here. */
- QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0,
-
- /* All
- * 1. Background-hit rays,
- * 2. Rays that has exited path-iteration but needs to update output buffer
- * 3. Rays to be regenerated
- * are enqueued here.
- */
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contribution for AO are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contributing for direct lighting are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-
- /* Rays sorted according to shader->id */
- QUEUE_SHADER_SORTED_RAYS,
-
-#ifdef __BRANCHED_PATH__
- /* All rays moving to next iteration of the indirect loop for light */
- QUEUE_LIGHT_INDIRECT_ITER,
- /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
- QUEUE_INACTIVE_RAYS,
-# ifdef __VOLUME__
- /* All rays moving to next iteration of the indirect loop for volumes */
- QUEUE_VOLUME_INDIRECT_ITER,
-# endif
-# ifdef __SUBSURFACE__
- /* All rays moving to next iteration of the indirect loop for subsurface */
- QUEUE_SUBSURFACE_INDIRECT_ITER,
-# endif
-#endif /* __BRANCHED_PATH__ */
-
- NUM_QUEUES
-};
-
-/* We use RAY_STATE_MASK to get ray_state */
-#define RAY_STATE_MASK 0x0F
-#define RAY_FLAG_MASK 0xF0
-enum RayState {
- RAY_INVALID = 0,
- /* Denotes ray is actively involved in path-iteration. */
- RAY_ACTIVE,
- /* Denotes ray has completed processing all samples and is inactive. */
- RAY_INACTIVE,
- /* Denotes ray has exited path-iteration and needs to update output buffer. */
- RAY_UPDATE_BUFFER,
- /* Denotes ray needs to skip most surface shader work. */
- RAY_HAS_ONLY_VOLUME,
- /* Denotes ray has hit background */
- RAY_HIT_BACKGROUND,
- /* Denotes ray has to be regenerated */
- RAY_TO_REGENERATE,
- /* Denotes ray has been regenerated */
- RAY_REGENERATED,
- /* Denotes ray is moving to next iteration of the branched indirect loop */
- RAY_LIGHT_INDIRECT_NEXT_ITER,
- RAY_VOLUME_INDIRECT_NEXT_ITER,
- RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
-
- /* Ray flags */
-
- /* Flags to denote that the ray is currently evaluating the branched indirect loop */
- RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
- RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
- RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
- RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT |
- RAY_BRANCHED_SUBSURFACE_INDIRECT),
-
- /* Ray is evaluating an iteration of an indirect loop for another thread */
- RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
-};
-
-#define ASSIGN_RAY_STATE(ray_state, ray_index, state) \
- (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) \
- ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
-#define ADD_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] | flag))
-#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
-#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
-
/* Patches */
#define PATCH_MAX_CONTROL_VERTS 16
@@ -1655,7 +1366,7 @@ enum RayState {
/* Work Tiles */
-typedef struct WorkTile {
+typedef struct KernelWorkTile {
uint x, y, w, h;
uint start_sample;
@@ -1664,13 +1375,172 @@ typedef struct WorkTile {
int offset;
uint stride;
- ccl_global float *buffer;
-} WorkTile;
+ /* Precalculated parameters used by init_from_camera kernel on GPU. */
+ int path_index_offset;
+ int work_size;
+} KernelWorkTile;
+
+/* Shader Evaluation.
+ *
+ * Position on a primitive on an object at which we want to evaluate the
+ * shader for e.g. mesh displacement or light importance map. */
+
+typedef struct KernelShaderEvalInput {
+ int object;
+ int prim;
+ float u, v;
+} KernelShaderEvalInput;
+static_assert_align(KernelShaderEvalInput, 16);
/* Pre-computed sample table sizes for PMJ02 sampler. */
-#define NUM_PMJ_SAMPLES (64 * 64)
-#define NUM_PMJ_PATTERNS 48
+#define NUM_PMJ_DIVISIONS 32
+#define NUM_PMJ_SAMPLES ((NUM_PMJ_DIVISIONS) * (NUM_PMJ_DIVISIONS))
+#define NUM_PMJ_PATTERNS 1
-CCL_NAMESPACE_END
+/* Device kernels.
+ *
+ * Identifier for kernels that can be executed in device queues.
+ *
+ * Some implementation details.
+ *
+ * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified.
+ * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. */
+
+typedef enum DeviceKernel {
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0,
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL,
+
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+ DEVICE_KERNEL_INTEGRATOR_RESET,
+ DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,
+
+ DEVICE_KERNEL_SHADER_EVAL_DISPLACE,
+ DEVICE_KERNEL_SHADER_EVAL_BACKGROUND,
+
+#define DECLARE_FILM_CONVERT_KERNEL(variant) \
+ DEVICE_KERNEL_FILM_CONVERT_##variant, DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA
+
+ DECLARE_FILM_CONVERT_KERNEL(DEPTH),
+ DECLARE_FILM_CONVERT_KERNEL(MIST),
+ DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT),
+ DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT3),
+ DECLARE_FILM_CONVERT_KERNEL(MOTION),
+ DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER_MATTE_WITH_SHADOW),
+ DECLARE_FILM_CONVERT_KERNEL(COMBINED),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT4),
+
+#undef DECLARE_FILM_CONVERT_KERNEL
+
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y,
+
+ DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS,
+ DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO,
+ DEVICE_KERNEL_FILTER_COLOR_PREPROCESS,
+ DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS,
+
+ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS,
+
+ DEVICE_KERNEL_PREFIX_SUM,
+
+ DEVICE_KERNEL_NUM,
+} DeviceKernel;
+
+enum {
+ DEVICE_KERNEL_INTEGRATOR_NUM = DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL + 1,
+};
+
+/* Kernel Features */
+
+enum KernelFeatureFlag : unsigned int {
+ /* Shader nodes. */
+ KERNEL_FEATURE_NODE_BSDF = (1U << 0U),
+ KERNEL_FEATURE_NODE_EMISSION = (1U << 1U),
+ KERNEL_FEATURE_NODE_VOLUME = (1U << 2U),
+ KERNEL_FEATURE_NODE_HAIR = (1U << 3U),
+ KERNEL_FEATURE_NODE_BUMP = (1U << 4U),
+ KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 5U),
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 6U),
+ KERNEL_FEATURE_NODE_RAYTRACE = (1U << 7U),
+
+ /* Use denoising kernels and output denoising passes. */
+ KERNEL_FEATURE_DENOISING = (1U << 8U),
+
+ /* Use path tracing kernels. */
+ KERNEL_FEATURE_PATH_TRACING = (1U << 9U),
-#endif /* __KERNEL_TYPES_H__ */
+ /* BVH/sampling kernel features. */
+ KERNEL_FEATURE_HAIR = (1U << 10U),
+ KERNEL_FEATURE_HAIR_THICK = (1U << 11U),
+ KERNEL_FEATURE_OBJECT_MOTION = (1U << 12U),
+ KERNEL_FEATURE_CAMERA_MOTION = (1U << 13U),
+
+ /* Denotes whether baking functionality is needed. */
+ KERNEL_FEATURE_BAKING = (1U << 14U),
+
+ /* Use subsurface scattering materials. */
+ KERNEL_FEATURE_SUBSURFACE = (1U << 15U),
+
+ /* Use volume materials. */
+ KERNEL_FEATURE_VOLUME = (1U << 16U),
+
+ /* Use OpenSubdiv patch evaluation */
+ KERNEL_FEATURE_PATCH_EVALUATION = (1U << 17U),
+
+ /* Use Transparent shadows */
+ KERNEL_FEATURE_TRANSPARENT = (1U << 18U),
+
+ /* Use shadow catcher. */
+ KERNEL_FEATURE_SHADOW_CATCHER = (1U << 19U),
+
+ /* Per-uber shader usage flags. */
+ KERNEL_FEATURE_PRINCIPLED = (1U << 20U),
+
+ /* Light render passes. */
+ KERNEL_FEATURE_LIGHT_PASSES = (1U << 21U),
+
+ /* Shadow render pass. */
+ KERNEL_FEATURE_SHADOW_PASS = (1U << 22U),
+};
+
+/* Shader node feature mask, to specialize shader evaluation for kernels. */
+
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \
+ (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \
+ KERNEL_FEATURE_NODE_HAIR | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE \
+ (KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW | KERNEL_FEATURE_NODE_RAYTRACE)
+#define KERNEL_FEATURE_NODE_MASK_VOLUME \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_DISPLACEMENT \
+ (KERNEL_FEATURE_NODE_VORONOI_EXTRA | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE)
+#define KERNEL_FEATURE_NODE_MASK_BUMP KERNEL_FEATURE_NODE_MASK_DISPLACEMENT
+
+#define KERNEL_NODES_FEATURE(feature) ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
deleted file mode 100644
index f6b34be040e..00000000000
--- a/intern/cycles/kernel/kernel_volume.h
+++ /dev/null
@@ -1,1440 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
- * and precision issues.
- * todo: this value could be tweaked or turned into a probability to avoid unnecessary
- * work in volumes and subsurface scattering. */
-#define VOLUME_THROUGHPUT_EPSILON 1e-6f
-
-/* Events for probalistic scattering */
-
-typedef enum VolumeIntegrateResult {
- VOLUME_PATH_SCATTERED = 0,
- VOLUME_PATH_ATTENUATED = 1,
- VOLUME_PATH_MISSED = 2
-} VolumeIntegrateResult;
-
-/* Volume shader properties
- *
- * extinction coefficient = absorption coefficient + scattering coefficient
- * sigma_t = sigma_a + sigma_s */
-
-typedef struct VolumeShaderCoefficients {
- float3 sigma_t;
- float3 sigma_s;
- float3 emission;
-} VolumeShaderCoefficients;
-
-#ifdef __VOLUME__
-
-/* evaluate shader to get extinction coefficient at P */
-ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- float3 *extinction)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
-
- if (sd->flag & SD_EXTINCTION) {
- const float density = object_volume_density(kg, sd->object);
- *extinction = sd->closure_transparent_extinction * density;
- return true;
- }
- else {
- return false;
- }
-}
-
-/* evaluate shader to get absorption, scattering and emission at P */
-ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- VolumeShaderCoefficients *coeff)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
-
- if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION)))
- return false;
-
- coeff->sigma_s = zero_float3();
- coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
- coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
-
- if (sd->flag & SD_SCATTER) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_VOLUME(sc->type))
- coeff->sigma_s += sc->weight;
- }
- }
-
- const float density = object_volume_density(kg, sd->object);
- coeff->sigma_s *= density;
- coeff->sigma_t *= density;
- coeff->emission *= density;
-
- return true;
-}
-
-#endif /* __VOLUME__ */
-
-ccl_device float3 volume_color_transmittance(float3 sigma, float t)
-{
- return exp3(-sigma * t);
-}
-
-ccl_device float kernel_volume_channel_get(float3 value, int channel)
-{
- return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
-}
-
-#ifdef __VOLUME__
-
-ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
-{
- float step_size = FLT_MAX;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- bool heterogeneous = false;
-
- if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
- heterogeneous = true;
- }
- else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
- /* We want to render world or objects without any volume grids
- * as homogeneous, but can only verify this at run-time since other
- * heterogeneous volume objects may be using the same shader. */
- int object = stack[i].object;
- if (object != OBJECT_NONE) {
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
- heterogeneous = true;
- }
- }
- }
-
- if (heterogeneous) {
- float object_step_size = object_volume_step_size(kg, stack[i].object);
- object_step_size *= kernel_data.integrator.volume_step_rate;
- step_size = fminf(object_step_size, step_size);
- }
- }
-
- return step_size;
-}
-
-ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
-{
- if (kernel_data.integrator.num_all_lights == 0)
- return 0;
-
- int method = -1;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- if (shader_flag & SD_VOLUME_MIS) {
- return SD_VOLUME_MIS;
- }
- else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
- if (method == 0)
- return SD_VOLUME_MIS;
-
- method = SD_VOLUME_EQUIANGULAR;
- }
- else {
- if (method == SD_VOLUME_EQUIANGULAR)
- return SD_VOLUME_MIS;
-
- method = 0;
- }
- }
-
- return method;
-}
-
-ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float object_step_size,
- float t,
- float *step_size,
- float *step_shade_offset,
- float *steps_offset)
-{
- const int max_steps = kernel_data.integrator.volume_max_steps;
- float step = min(object_step_size, t);
-
- /* compute exact steps in advance for malloc */
- if (t > max_steps * step) {
- step = t / (float)max_steps;
- }
-
- *step_size = step;
-
- /* Perform shading at this offset within a step, to integrate over
- * over the entire step segment. */
- *step_shade_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4);
-
- /* Shift starting point of all segment by this random amount to avoid
- * banding artifacts from the volume bounding shape. */
- *steps_offset = path_state_rng_1D_hash(kg, state, 0x3d22c7b3);
-}
-
-/* Volume Shadows
- *
- * These functions are used to attenuate shadow rays to lights. Both absorption
- * and scattering will block light, represented by the extinction coefficient. */
-
-/* homogeneous volume: assume shader evaluation at the starts gives
- * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput)
-{
- float3 sigma_t = zero_float3();
-
- if (volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t))
- *throughput *= volume_color_transmittance(sigma_t, ray->t);
-}
-
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * For shadows we do not offset all segments, since the starting point is
- * already a random distance inside the volume. It also appears to create
- * banding artifacts for unknown reasons. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, unused;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &unused);
- const float steps_offset = 1.0f;
-
- /* compute extinction at the start */
- float t = 0.0f;
-
- float3 sum = zero_float3();
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- float3 sigma_t = zero_float3();
-
- /* compute attenuation over segment */
- if (volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
- /* Compute expf() only for every Nth step, to save some calculations
- * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON
- * check then. */
- sum += (-sigma_t * dt);
- if ((i & 0x07) == 0) { /* ToDo: Other interval? */
- tp = *throughput * exp3(sum);
-
- /* stop if nearly all light is blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t) {
- /* Update throughput in case we haven't done it above */
- tp = *throughput * exp3(sum);
- break;
- }
- }
-
- *throughput = tp;
-}
-
-/* get the volume attenuation over line segment defined by ray, with the
- * assumption that there are no surfaces blocking light between the endpoints */
-# if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput);
-}
-extern "C" __device__ void __direct_callable__kernel_volume_shadow(
-# else
-ccl_device_noinline void kernel_volume_shadow(
-# endif
- KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- shader_setup_from_volume(kg, shadow_sd, ray);
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- if (step_size != FLT_MAX)
- kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size);
- else
- kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
-}
-
-#endif /* __VOLUME__ */
-
-/* Equi-angular sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, float xi, float *pdf)
-{
- float t = ray->t;
-
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
- if (UNLIKELY(theta_b == theta_a)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return min(t, delta + t_); /* min is only for float precision errors */
-}
-
-ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t)
-{
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- return 0.0f;
- }
-
- float t = ray->t;
- float t_ = sample_t - delta;
-
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- if (UNLIKELY(theta_b == theta_a)) {
- return 0.0f;
- }
-
- float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return pdf;
-}
-
-/* Distance sampling */
-
-ccl_device float kernel_volume_distance_sample(
- float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
-{
- /* xi is [0, 1[ so log(0) should never happen, division by zero is
- * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float sample_transmittance = kernel_volume_channel_get(full_transmittance, channel);
-
- float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
-
- *transmittance = volume_color_transmittance(sigma_t, sample_t);
- *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
-
- /* todo: optimization: when taken together with hit/miss decision,
- * the full_transmittance cancels out drops out and xi does not
- * need to be remapped */
-
- return sample_t;
-}
-
-ccl_device float3 kernel_volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
-{
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
-
- return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
-}
-
-/* Emission */
-
-ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coeff,
- int closure_flag,
- float3 transmittance,
- float t)
-{
- /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
- * this goes to E * t as sigma_t goes to zero
- *
- * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
- float3 emission = coeff->emission;
-
- if (closure_flag & SD_EXTINCTION) {
- float3 sigma_t = coeff->sigma_t;
-
- emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
- emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
- emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
- }
- else
- emission *= t;
-
- return emission;
-}
-
-/* Volume Path */
-
-ccl_device int kernel_volume_sample_channel(float3 albedo,
- float3 throughput,
- float rand,
- float3 *pdf)
-{
- /* Sample color channel proportional to throughput and single scattering
- * albedo, to significantly reduce noise with many bounce, following:
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
- float3 weights = fabs(throughput * albedo);
- float sum_weights = weights.x + weights.y + weights.z;
- float3 weights_pdf;
-
- if (sum_weights > 0.0f) {
- weights_pdf = weights / sum_weights;
- }
- else {
- weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
- }
-
- *pdf = weights_pdf;
-
- /* OpenCL does not support -> on float3, so don't use pdf->x. */
- if (rand < weights_pdf.x) {
- return 0;
- }
- else if (rand < weights_pdf.x + weights_pdf.y) {
- return 1;
- }
- else {
- return 2;
- }
-}
-
-#ifdef __VOLUME__
-
-/* homogeneous volume: assume shader evaluation at the start gives
- * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- bool probalistic_scatter)
-{
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- if (!volume_shader_sample(kg, sd, state, ray->P, &coeff))
- return VOLUME_PATH_MISSED;
-
- int closure_flag = sd->flag;
- float t = ray->t;
- float3 new_tp;
-
-# ifdef __VOLUME_SCATTER__
- /* randomly scatter, and if we do t is shortened */
- if (closure_flag & SD_SCATTER) {
- /* Sample channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf);
-
- /* decide if we will hit or miss */
- bool scatter = true;
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- if (probalistic_scatter) {
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float sample_transmittance = expf(-sample_sigma_t * t);
-
- if (1.0f - xi >= sample_transmittance) {
- scatter = true;
-
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else
- scatter = false;
- }
-
- if (scatter) {
- /* scattering */
- float3 pdf;
- float3 transmittance;
- float sample_t;
-
- /* distance sampling */
- sample_t = kernel_volume_distance_sample(
- ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- pdf *= one_float3() - volume_color_transmittance(coeff.sigma_t, t);
-
- new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf);
- t = sample_t;
- }
- else {
- /* no scattering */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- float pdf = dot(channel_pdf, transmittance);
- new_tp = *throughput * transmittance / pdf;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- new_tp = *throughput * transmittance;
- }
- else {
- new_tp = *throughput;
- }
-
- /* integrate emission attenuated by extinction */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t);
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, ray->t);
- path_radiance_accum_emission(kg, L, state, *throughput, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- *throughput = new_tp;
-
- /* prepare to scatter to new direction */
- if (t < ray->t) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
- }
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
-
-/* heterogeneous volume distance sampling: integrate stepping through the
- * volume until we reach the end, get absorbed entirely, or run out of
- * iterations. this does probabilistically scatter or get transmitted through
- * for path tracing where we don't want to branch. */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * Using a different step offset for the first step avoids banding artifacts. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, steps_offset;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
- /* compute coefficients at the start */
- float t = 0.0f;
- float3 accum_transmittance = one_float3();
-
- /* pick random color channel, we use the Veach one-sample
- * model with balance heuristic for the channels */
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- bool has_scatter = false;
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 new_tp;
- float3 transmittance;
- bool scatter = false;
-
- /* distance sampling */
-# ifdef __VOLUME_SCATTER__
- if ((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) {
- has_scatter = true;
-
- /* Sample channel, use MIS with balance heuristic. */
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf);
-
- /* compute transmittance over full step */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
-
- /* decide if we will scatter or continue */
- float sample_transmittance = kernel_volume_channel_get(transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* compute sampling distance */
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float new_dt = -logf(1.0f - xi) / sample_sigma_t;
- new_t = t + new_dt;
-
- /* transmittance and pdf */
- float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
- float3 pdf = coeff.sigma_t * new_transmittance;
-
- /* throughput */
- new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf);
- scatter = true;
- }
- else {
- /* throughput */
- float pdf = dot(channel_pdf, transmittance);
- new_tp = tp * transmittance / pdf;
-
- /* remap xi so we can reuse it and keep thing stratified */
- xi = 1.0f - (1.0f - xi) / sample_transmittance;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
- new_tp = tp * transmittance;
- }
- else {
- transmittance = zero_float3();
- new_tp = tp;
- }
-
- /* integrate emission attenuated by absorption */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- path_radiance_accum_emission(kg, L, state, tp, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- tp = new_tp;
-
- /* stop if nearly all light blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON) {
- tp = zero_float3();
- break;
- }
- }
-
- /* prepare to scatter to new direction */
- if (scatter) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + new_t * ray->D;
- *throughput = tp;
-
- return VOLUME_PATH_SCATTERED;
- }
- else {
- /* accumulate transmittance */
- accum_transmittance *= transmittance;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
- }
-
- *throughput = tp;
-
- return VOLUME_PATH_ATTENUATED;
-}
-
-/* get the volume attenuation and emission over line segment defined by
- * ray, with the assumption that there are no surfaces blocking light
- * between the endpoints. distance sampling is used to decide if we will
- * scatter or not. */
-ccl_device_noinline_cpu VolumeIntegrateResult
-kernel_volume_integrate(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- float step_size)
-{
- shader_setup_from_volume(kg, sd, ray);
-
- if (step_size != FLT_MAX)
- return kernel_volume_integrate_heterogeneous_distance(
- kg, state, ray, sd, L, throughput, step_size);
- else
- return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
-}
-
-# ifndef __SPLIT_KERNEL__
-/* Decoupled Volume Sampling
- *
- * VolumeSegment is list of coefficients and transmittance stored at all steps
- * through a volume. This can then later be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media"
- *
- * On the GPU this is only supported (but currently not enabled)
- * for homogeneous volumes (1 step), due to
- * no support for malloc/free and too much stack usage with a fix size array. */
-
-typedef struct VolumeStep {
- float3 sigma_s; /* scatter coefficient */
- float3 sigma_t; /* extinction coefficient */
- float3 accum_transmittance; /* accumulated transmittance including this step */
- float3 cdf_distance; /* cumulative density function for distance sampling */
- float t; /* distance at end of this step */
- float shade_t; /* jittered distance where shading was done in step */
- int closure_flag; /* shader evaluation closure flags */
-} VolumeStep;
-
-typedef struct VolumeSegment {
- VolumeStep stack_step; /* stack storage for homogeneous step, to avoid malloc */
- VolumeStep *steps; /* recorded steps */
- int numsteps; /* number of steps */
- int closure_flag; /* accumulated closure flags from all steps */
-
- float3 accum_emission; /* accumulated emission at end of segment */
- float3 accum_transmittance; /* accumulated transmittance at end of segment */
- float3 accum_albedo; /* accumulated average albedo over segment */
-
- int sampling_method; /* volume sampling method */
-} VolumeSegment;
-
-/* record volume steps to the end of the volume.
- *
- * it would be nice if we could only record up to the point that we need to scatter,
- * but the entire segment is needed to do always scattering, rather than probabilistically
- * hitting or missing the volume. if we don't know the transmittance at the end of the
- * volume we can't generate stratified distance samples up to that transmittance */
-# ifdef __VOLUME_DECOUPLED__
-ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- VolumeSegment *segment,
- const float object_step_size)
-{
- /* prepare for volume stepping */
- int max_steps;
- float step_size, step_shade_offset, steps_offset;
-
- if (object_step_size != FLT_MAX) {
- max_steps = kernel_data.integrator.volume_max_steps;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
-# ifdef __KERNEL_CPU__
- /* NOTE: For the branched path tracing it's possible to have direct
- * and indirect light integration both having volume segments allocated.
- * We detect this using index in the pre-allocated memory. Currently we
- * only support two segments allocated at a time, if more needed some
- * modifications to the KernelGlobals will be needed.
- *
- * This gives us restrictions that decoupled record should only happen
- * in the stack manner, meaning if there's subsequent call of decoupled
- * record it'll need to free memory before its caller frees memory.
- */
- const int index = kg->decoupled_volume_steps_index;
- assert(index < sizeof(kg->decoupled_volume_steps) / sizeof(*kg->decoupled_volume_steps));
- if (kg->decoupled_volume_steps[index] == NULL) {
- kg->decoupled_volume_steps[index] = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
- }
- segment->steps = kg->decoupled_volume_steps[index];
- ++kg->decoupled_volume_steps_index;
-# else
- segment->steps = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
-# endif
- }
- else {
- max_steps = 1;
- step_size = ray->t;
- step_shade_offset = 0.0f;
- steps_offset = 1.0f;
- segment->steps = &segment->stack_step;
- }
-
- /* init accumulation variables */
- float3 accum_emission = zero_float3();
- float3 accum_transmittance = one_float3();
- float3 accum_albedo = zero_float3();
- float3 cdf_distance = zero_float3();
- float t = 0.0f;
-
- segment->numsteps = 0;
- segment->closure_flag = 0;
- bool is_last_step_empty = false;
-
- VolumeStep *step = segment->steps;
-
- for (int i = 0; i < max_steps; i++, step++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 sigma_t = coeff.sigma_t;
-
- /* compute average albedo for channel sampling */
- if (closure_flag & SD_SCATTER) {
- accum_albedo += (dt / ray->t) * safe_divide_color(coeff.sigma_s, sigma_t);
- }
-
- /* compute accumulated transmittance */
- float3 transmittance = volume_color_transmittance(sigma_t, dt);
-
- /* compute emission attenuated by absorption */
- if (closure_flag & SD_EMISSION) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- accum_emission += accum_transmittance * emission;
- }
-
- accum_transmittance *= transmittance;
-
- /* compute pdf for distance sampling */
- float3 pdf_distance = dt * accum_transmittance * coeff.sigma_s;
- cdf_distance = cdf_distance + pdf_distance;
-
- /* write step data */
- step->sigma_t = sigma_t;
- step->sigma_s = coeff.sigma_s;
- step->closure_flag = closure_flag;
-
- segment->closure_flag |= closure_flag;
-
- is_last_step_empty = false;
- segment->numsteps++;
- }
- else {
- if (is_last_step_empty) {
- /* consecutive empty step, merge */
- step--;
- }
- else {
- /* store empty step */
- step->sigma_t = zero_float3();
- step->sigma_s = zero_float3();
- step->closure_flag = 0;
-
- segment->numsteps++;
- is_last_step_empty = true;
- }
- }
-
- step->accum_transmittance = accum_transmittance;
- step->cdf_distance = cdf_distance;
- step->t = new_t;
- step->shade_t = t + dt * step_shade_offset;
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
-
- /* stop if nearly all light blocked */
- if (accum_transmittance.x < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.y < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
-
- /* store total emission and transmittance */
- segment->accum_emission = accum_emission;
- segment->accum_transmittance = accum_transmittance;
- segment->accum_albedo = accum_albedo;
-
- /* normalize cumulative density function for distance sampling */
- VolumeStep *last_step = segment->steps + segment->numsteps - 1;
-
- if (!is_zero(last_step->cdf_distance)) {
- VolumeStep *step = &segment->steps[0];
- int numsteps = segment->numsteps;
- float3 inv_cdf_distance_sum = safe_invert_color(last_step->cdf_distance);
-
- for (int i = 0; i < numsteps; i++, step++)
- step->cdf_distance *= inv_cdf_distance_sum;
- }
-}
-
-ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
-{
- if (segment->steps != &segment->stack_step) {
-# ifdef __KERNEL_CPU__
- /* NOTE: We only allow free last allocated segment.
- * No random order of alloc/free is supported.
- */
- assert(kg->decoupled_volume_steps_index > 0);
- assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]);
- --kg->decoupled_volume_steps_index;
-# else
- free(segment->steps);
-# endif
- }
-}
-# endif /* __VOLUME_DECOUPLED__ */
-
-/* scattering for homogeneous and heterogeneous volumes, using decoupled ray
- * marching.
- *
- * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
-ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- float rphase,
- float rscatter,
- const VolumeSegment *segment,
- const float3 *light_P,
- bool probalistic_scatter)
-{
- kernel_assert(segment->closure_flag & SD_SCATTER);
-
- /* Sample color channel, use MIS with balance heuristic. */
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(
- segment->accum_albedo, *throughput, rphase, &channel_pdf);
-
- float xi = rscatter;
-
- /* probabilistic scattering decision based on transmittance */
- if (probalistic_scatter) {
- float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else {
- *throughput /= sample_transmittance;
- return VOLUME_PATH_MISSED;
- }
- }
-
- VolumeStep *step;
- float3 transmittance;
- float pdf, sample_t;
- float mis_weight = 1.0f;
- bool distance_sample = true;
- bool use_mis = false;
-
- if (segment->sampling_method && light_P) {
- if (segment->sampling_method == SD_VOLUME_MIS) {
- /* multiple importance sample: randomly pick between
- * equiangular and distance sampling strategy */
- if (xi < 0.5f) {
- xi *= 2.0f;
- }
- else {
- xi = (xi - 0.5f) * 2.0f;
- distance_sample = false;
- }
-
- use_mis = true;
- }
- else {
- /* only equiangular sampling */
- distance_sample = false;
- }
- }
-
- /* distance sampling */
- if (distance_sample) {
- /* find step in cdf */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float prev_cdf = 0.0f;
- float step_cdf = 1.0f;
- float3 prev_cdf_distance = zero_float3();
-
- for (int i = 0;; i++, step++) {
- /* todo: optimize using binary search */
- step_cdf = kernel_volume_channel_get(step->cdf_distance, channel);
-
- if (xi < step_cdf || i == segment->numsteps - 1)
- break;
-
- prev_cdf = step_cdf;
- prev_t = step->t;
- prev_cdf_distance = step->cdf_distance;
- }
-
- /* remap xi so we can reuse it */
- xi = (xi - prev_cdf) / (step_cdf - prev_cdf);
-
- /* pdf for picking step */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
-
- /* sample distance and compute transmittance */
- float3 distance_pdf;
- sample_t = prev_t + kernel_volume_distance_sample(
- step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- distance_pdf *= one_float3() - segment->accum_transmittance;
-
- pdf = dot(channel_pdf, distance_pdf * step_pdf_distance);
-
- /* multiple importance sampling */
- if (use_mis) {
- float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
- mis_weight = 2.0f * power_heuristic(pdf, equi_pdf);
- }
- }
- /* equi-angular sampling */
- else {
- /* sample distance */
- sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
-
- /* find step in which sampled distance is located */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float3 prev_cdf_distance = zero_float3();
-
- int numsteps = segment->numsteps;
- int high = numsteps - 1;
- int low = 0;
- int mid;
-
- while (low < high) {
- mid = (low + high) >> 1;
-
- if (sample_t < step[mid].t)
- high = mid;
- else if (sample_t >= step[mid + 1].t)
- low = mid + 1;
- else {
- /* found our interval in step[mid] .. step[mid+1] */
- prev_t = step[mid].t;
- prev_cdf_distance = step[mid].cdf_distance;
- step += mid + 1;
- break;
- }
- }
-
- if (low >= numsteps - 1) {
- prev_t = step[numsteps - 1].t;
- prev_cdf_distance = step[numsteps - 1].cdf_distance;
- step += numsteps - 1;
- }
-
- /* pdf for picking step with distance sampling */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
- float step_sample_t = sample_t - prev_t;
-
- /* compute transmittance */
- transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
-
- /* multiple importance sampling */
- if (use_mis) {
- float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
- float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance);
- mis_weight = 2.0f * power_heuristic(pdf, distance_pdf);
- }
- }
- if (sample_t < 0.0f || pdf == 0.0f) {
- return VOLUME_PATH_MISSED;
- }
-
- /* compute transmittance up to this step */
- if (step != segment->steps)
- transmittance *= (step - 1)->accum_transmittance;
-
- /* modify throughput */
- *throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
-
- /* evaluate shader to create closures at shading point */
- if (segment->numsteps > 1) {
- sd->P = ray->P + step->shade_t * ray->D;
-
- VolumeShaderCoefficients coeff;
- volume_shader_sample(kg, sd, state, sd->P, &coeff);
- }
-
- /* move to new position */
- sd->P = ray->P + sample_t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
-}
-# endif /* __SPLIT_KERNEL */
-
-/* decide if we need to use decoupled or not */
-ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg,
- bool heterogeneous,
- bool direct,
- int sampling_method)
-{
- /* decoupled ray marching for heterogeneous volumes not supported on the GPU,
- * which also means equiangular and multiple importance sampling is not
- * support for that case */
- if (!kernel_data.integrator.volume_decoupled)
- return false;
-
-# ifdef __KERNEL_GPU__
- if (heterogeneous)
- return false;
-# endif
-
- /* equiangular and multiple importance sampling only implemented for decoupled */
- if (sampling_method != 0)
- return true;
-
- /* for all light sampling use decoupled, reusing shader evaluations is
- * typically faster in that case */
- if (direct)
- return kernel_data.integrator.sample_all_lights_direct;
- else
- return kernel_data.integrator.sample_all_lights_indirect;
-}
-
-/* Volume Stack
- *
- * This is an array of object/shared ID's that the current segment of the path
- * is inside of. */
-
-ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space const PathState *state,
- ccl_addr_space const Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
- /* NULL ray happens in the baker, does it need proper initialization of
- * camera in volume?
- */
- if (!kernel_data.cam.is_inside_volume || ray == NULL) {
- /* Camera is guaranteed to be in the air, only take background volume
- * into account in this case.
- */
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = PRIM_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[0].shader = SHADER_NONE;
- }
- return;
- }
-
- kernel_assert(state->flag & PATH_RAY_CAMERA);
-
- Ray volume_ray = *ray;
- volume_ray.t = FLT_MAX;
-
- const uint visibility = (state->flag & PATH_RAY_ALL_VISIBILITY);
- int stack_index = 0, enclosed_index = 0;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
- if (num_hits > 0) {
- int enclosed_volumes[VOLUME_STACK_SIZE];
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
- /* If ray exited the volume and never entered to that volume
- * it means that camera is inside such a volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If ray from camera enters the volume, this volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
- }
- }
-# else
- int enclosed_volumes[VOLUME_STACK_SIZE];
- int step = 0;
-
- while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
- step < 2 * VOLUME_STACK_SIZE) {
- Intersection isect;
- if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
- break;
- }
-
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
- /* If ray exited the volume and never entered to that volume
- * it means that camera is inside such a volume.
- */
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
- /* If ray exited the volume and never entered to that volume
- * it means that camera is inside such a volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If ray from camera enters the volume, this volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- ++step;
- }
-# endif
- /* stack_index of 0 means quick checks outside of the kernel gave false
- * positive, nothing to worry about, just we've wasted quite a few of
- * ticks just to come into conclusion that camera is in the air.
- *
- * In this case we're doing the same above -- check whether background has
- * volume.
- */
- if (stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = OBJECT_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[stack_index].shader = SHADER_NONE;
- }
-}
-
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space VolumeStack *stack)
-{
- /* todo: we should have some way for objects to indicate if they want the
- * world shader to work inside them. excluding it by default is problematic
- * because non-volume objects can't be assumed to be closed manifolds */
-
- if (!(sd->flag & SD_HAS_VOLUME))
- return;
-
- if (sd->flag & SD_BACKFACING) {
- /* exit volume object: remove from stack */
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- if (stack[i].object == sd->object) {
- /* shift back next stack entries */
- do {
- stack[i] = stack[i + 1];
- i++;
- } while (stack[i].shader != SHADER_NONE);
-
- return;
- }
- }
- }
- else {
- /* enter volume object: add to stack */
- int i;
-
- for (i = 0; stack[i].shader != SHADER_NONE; i++) {
- /* already in the stack? then we have nothing to do */
- if (stack[i].object == sd->object)
- return;
- }
-
- /* if we exceed the stack limit, ignore */
- if (i >= VOLUME_STACK_SIZE - 1)
- return;
-
- /* add to the end of the stack */
- stack[i].shader = sd->shader;
- stack[i].object = sd->object;
- stack[i + 1].shader = SHADER_NONE;
- }
-}
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
- ShaderData *stack_sd,
- Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
- kernel_assert(kernel_data.integrator.use_volumes);
-
- Ray volume_ray = *ray;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
- if (num_hits > 0) {
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
- }
- }
-# else
- Intersection isect;
- int step = 0;
- float3 Pend = ray->P + ray->D * ray->t;
- while (step < 2 * VOLUME_STACK_SIZE &&
- scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- if (volume_ray.t != FLT_MAX) {
- volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t);
- }
- ++step;
- }
-# endif
-}
-# endif
-
-/* Clean stack after the last bounce.
- *
- * It is expected that all volumes are closed manifolds, so at the time when ray
- * hits nothing (for example, it is a last bounce which goes to environment) the
- * only expected volume in the stack is the world's one. All the rest volume
- * entries should have been exited already.
- *
- * This isn't always true because of ray intersection precision issues, which
- * could lead us to an infinite non-world volume in the stack, causing render
- * artifacts.
- *
- * Use this function after the last bounce to get rid of all volumes apart from
- * the world's one after the last bounce to avoid render artifacts.
- */
-ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
- ccl_addr_space VolumeStack *volume_stack)
-{
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- /* Keep the world's volume in stack. */
- volume_stack[1].shader = SHADER_NONE;
- }
- else {
- volume_stack[0].shader = SHADER_NONE;
- }
-}
-
-#endif /* __VOLUME__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index d1602744f1d..fab0915c38e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_WORK_STEALING_H__
-#define __KERNEL_WORK_STEALING_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN
*/
/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
uint global_work_index,
ccl_private uint *x,
ccl_private uint *y,
ccl_private uint *sample)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
+#if 0
+ /* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
+#else
+ /* Keeping threads for the same pixel together.
+ * Appears to improve performance by a few % on CUDA and OptiX. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#endif
+
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;
@@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
*sample = tile->start_sample + sample_offset;
}
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#endif
-
-#ifdef __SPLIT_KERNEL__
-/* Returns true if there is work */
-ccl_device bool get_next_work_item(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- /* With a small amount of work there may be more threads than work due to
- * rounding up of global size, stop such threads immediately. */
- if (ray_index >= total_work_size) {
- return false;
- }
-
- /* Increase atomic work index counter in pool. */
- uint pool = ray_index / WORK_POOL_SIZE;
- uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
-
- /* Map per-pool work index to a global work index. */
- uint global_size = ccl_global_size(0) * ccl_global_size(1);
- kernel_assert(global_size % WORK_POOL_SIZE == 0);
- kernel_assert(ray_index < global_size);
-
- *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
- (work_index % WORK_POOL_SIZE);
-
- /* Test if all work for this pool is done. */
- return (*global_work_index < total_work_size);
-}
-
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- bool got_work = false;
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- do {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- if (got_work) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, *global_work_index, &x, &y, &sample);
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- break;
- }
- }
- } while (got_work);
- }
- else {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- }
- return got_work;
-}
-#endif
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernel_write_passes.h b/intern/cycles/kernel/kernel_write_passes.h
index 410218d91d4..9d379495629 100644
--- a/intern/cycles/kernel/kernel_write_passes.h
+++ b/intern/cycles/kernel/kernel_write_passes.h
@@ -14,23 +14,25 @@
* limitations under the License.
*/
-#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#pragma once
+
+#ifdef __KERNEL_GPU__
# define __ATOMIC_PASS_WRITE__
#endif
CCL_NAMESPACE_BEGIN
-ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
+ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value)
{
- ccl_global float *buf = buffer;
#ifdef __ATOMIC_PASS_WRITE__
- atomic_add_and_fetch_float(buf, value);
+ atomic_add_and_fetch_float(buffer, value);
#else
- *buf += value;
+ *buffer += value;
#endif
}
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer,
+ float3 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -41,12 +43,14 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
atomic_add_and_fetch_float(buf_y, value.y);
atomic_add_and_fetch_float(buf_z, value.z);
#else
- ccl_global float3 *buf = (ccl_global float3 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
#endif
}
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer,
+ float4 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -59,37 +63,26 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
atomic_add_and_fetch_float(buf_z, value.z);
atomic_add_and_fetch_float(buf_w, value.w);
#else
- ccl_global float4 *buf = (ccl_global float4 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
+ buffer[3] += value.w;
#endif
}
-#ifdef __DENOISING_FEATURES__
-ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
+ccl_device_inline float kernel_read_pass_float(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float(buffer, value);
-
- /* The online one-pass variance update that's used for the megakernel can't easily be implemented
- * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
- kernel_write_pass_float(buffer + 1, value * value);
+ return *buffer;
}
-# ifdef __ATOMIC_PASS_WRITE__
-# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
-# else
-ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
+ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer)
{
- buffer[0] += value.x;
- buffer[1] += value.y;
- buffer[2] += value.z;
+ return make_float3(buffer[0], buffer[1], buffer[2]);
}
-# endif
-ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
+ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float3_unaligned(buffer, value);
- kernel_write_pass_float3_unaligned(buffer + 3, value * value);
+ return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
}
-#endif /* __DENOISING_FEATURES__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
deleted file mode 100644
index 145a6b6ac40..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-/* When building kernel for native machine detect kernel features from the flags
- * set by compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
deleted file mode 100644
index 012daba62d8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
deleted file mode 100644
index 16351a7f949..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
deleted file mode 100644
index 1423b182ab8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleV,
- float *sampleVV,
- float *bufferV,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tiles,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample);
-
-#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
deleted file mode 100644
index 3d4cb87e104..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that particular .cpp files sets needed optimization flags and
- * simply includes this file without worry of copying actual implementation over.
- */
-
-#include "kernel/kernel_compat_cpu.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-#ifdef KERNEL_STUB
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* Denoise filter */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
-#else
- kernel_filter_divide_shadow(sample,
- tile_info,
- x,
- y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
-#else
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset,
- v_offset,
- x,
- y,
- mean,
- variance,
- scale,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_write_feature);
-#else
- kernel_filter_write_feature(
- sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect));
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
-#else
- kernel_filter_detect_outliers(
- x, y, image, variance, depth, output, load_int4(rect), pass_stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
-#else
- kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tile_info,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *prefilter_rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
-#else
- rank += storage_ofs;
- transform += storage_ofs * TRANSFORM_SIZE;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x,
- y,
- load_int4(prefilter_rect),
- pass_stride,
- frame_stride,
- use_time,
- transform,
- rank,
- radius,
- pca_threshold);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
-#else
- kernel_filter_nlm_calc_difference(dx,
- dy,
- weight_image,
- variance_image,
- scale_image,
- difference_image,
- load_int4(rect),
- stride,
- channel_offset,
- frame_offset,
- a,
- k_2);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
-#else
- kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
-#else
- kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
-#else
- kernel_filter_nlm_update_output(dx,
- dy,
- difference_image,
- image,
- temp_image,
- out_image,
- accum_image,
- load_int4(rect),
- channel_offset,
- stride,
- f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
-#else
- kernel_filter_nlm_construct_gramian(dx,
- dy,
- t,
- difference_image,
- buffer,
- transform,
- rank,
- XtWX,
- XtWY,
- load_int4(rect),
- load_int4(filter_window),
- stride,
- f,
- pass_stride,
- frame_offset,
- use_time);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
-#else
- kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_finalize);
-#else
- XtWX += storage_ofs * XTWX_SIZE;
- XtWY += storage_ofs * XTWY_SIZE;
- rank += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
-#endif
-}
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
deleted file mode 100644
index 75833d83648..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
deleted file mode 100644
index c998cd54d3a..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
deleted file mode 100644
index fc4ef1fca5b..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
deleted file mode 100644
index ea3103f12c3..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample);
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-/* Split kernels */
-
-void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
-#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * data);
-
-DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
-DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
-DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
-DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
deleted file mode 100644
index 51d6c23f72f..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that particular .cpp files sets needed optimization flags and
- * simply includes this file without worry of copying actual implementation over.
- */
-
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-
-#ifndef KERNEL_STUB
-# ifndef __SPLIT_KERNEL__
-# include "kernel/kernel_math.h"
-# include "kernel/kernel_types.h"
-
-# include "kernel/split/kernel_split_data.h"
-# include "kernel/kernel_globals.h"
-
-# include "kernel/kernel_color.h"
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-# include "kernel/kernel_film.h"
-# include "kernel/kernel_path.h"
-# include "kernel/kernel_path_branched.h"
-# include "kernel/kernel_bake.h"
-# else
-# include "kernel/split/kernel_split_common.h"
-
-# include "kernel/split/kernel_data_init.h"
-# include "kernel/split/kernel_path_init.h"
-# include "kernel/split/kernel_scene_intersect.h"
-# include "kernel/split/kernel_lamp_emission.h"
-# include "kernel/split/kernel_do_volume.h"
-# include "kernel/split/kernel_queue_enqueue.h"
-# include "kernel/split/kernel_indirect_background.h"
-# include "kernel/split/kernel_shader_setup.h"
-# include "kernel/split/kernel_shader_sort.h"
-# include "kernel/split/kernel_shader_eval.h"
-# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-# include "kernel/split/kernel_subsurface_scatter.h"
-# include "kernel/split/kernel_direct_lighting.h"
-# include "kernel/split/kernel_shadow_blocked_ao.h"
-# include "kernel/split/kernel_shadow_blocked_dl.h"
-# include "kernel/split/kernel_enqueue_inactive.h"
-# include "kernel/split/kernel_next_iteration_setup.h"
-# include "kernel/split/kernel_indirect_subsurface.h"
-# include "kernel/split/kernel_buffer_update.h"
-# include "kernel/split/kernel_adaptive_stopping.h"
-# include "kernel/split/kernel_adaptive_filter_x.h"
-# include "kernel/split/kernel_adaptive_filter_y.h"
-# include "kernel/split/kernel_adaptive_adjust_samples.h"
-# endif /* __SPLIT_KERNEL__ */
-#else
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-
-# ifdef __SPLIT_KERNEL__
-# include "kernel/split/kernel_data_init.h"
-# endif /* __SPLIT_KERNEL__ */
-#endif /* KERNEL_STUB */
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __SPLIT_KERNEL__
-
-/* Path Tracing */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, path_trace);
-# else
-# ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- kernel_branched_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
- else
-# endif
- {
- kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
-# endif /* KERNEL_STUB */
-}
-
-/* Film */
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
-# else
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
-# else
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-/* Bake */
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, bake);
-# else
-# ifdef __BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-# endif
-# endif /* KERNEL_STUB */
-}
-
-/* Shader Evaluate */
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, shader);
-# else
- if (type == SHADER_EVAL_DISPLACE) {
- kernel_displace_evaluate(kg, input, output, i);
- }
- else {
- kernel_background_evaluate(kg, input, output, i);
- }
-# endif /* KERNEL_STUB */
-}
-
-#else /* __SPLIT_KERNEL__ */
-
-/* Split Kernel Path Tracing */
-
-# ifdef KERNEL_STUB
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-# else
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- kernel_##name(kg); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- ccl_local type locals; \
- kernel_##name(kg, &locals); \
- }
-# endif /* KERNEL_STUB */
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao,
- BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-#endif /* __SPLIT_KERNEL__ */
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
deleted file mode 100644
index 989f5e5aaa8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-#define __SPLIT_KERNEL__
-
-/* When building kernel for native machine detect kernel features from the flags
- * set by compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
deleted file mode 100644
index 40e485d27c0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
deleted file mode 100644
index 8c44238470e..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
deleted file mode 100644
index 7a3f218d5fc..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
deleted file mode 100644
index 1cab59e0ea0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
deleted file mode 100644
index 637126d9d4c..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
deleted file mode 100644
index 6c9642d1f03..00000000000
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel_config.h"
-
-#include "kernel/kernel_compat_cuda.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_copy_input(float *buffer,
- CCL_FILTER_TILE_INFO,
- int4 prefilter_rect,
- int buffer_pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int itile = ytile * 3 + xtile;
- float *const in = ((float *)ccl_get_tile_buffer(itile)) +
- (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
- buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
- for (int i = 0; i < buffer_pass_stride; ++i)
- buffer[i] = in[i];
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- if (num_inputs > 0) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3;
- out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f);
- out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f);
- out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f);
- }
- if (num_inputs > 1) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- if (num_inputs > 2) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
- float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
- out[0] = in[0] * num_samples;
- out[1] = in[1] * num_samples;
- out[2] = in[2] * num_samples;
- }
-}
-
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- tile_info,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- float *mean,
- float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- float *from,
- float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_detect_outliers(float *image,
- float *variance,
- float *depth,
- float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
- CCL_FILTER_TILE_INFO,
- float *transform, int *rank,
- int4 filter_area, int4 rect,
- int radius, float pca_threshold,
- int pass_stride, int frame_stride,
- bool use_time)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int *l_rank = rank + y*filter_area.z + x;
- float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *out_image,
- float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int t,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict buffer,
- float const* __restrict__ transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- bool use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
-
-#endif
-
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
deleted file mode 100644
index cf62b6e781e..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/cuda/kernel_cuda_image.h"
-#include "kernel/kernel_film.h"
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-#include "kernel/kernel_bake.h"
-#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-
-#ifdef __BRANCHED_PATH__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-#endif
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(&kg, buffer, sample);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->h) {
- int y = tile->y + ccl_global_id(0);
- kernel_do_adaptive_filter_x(&kg, y, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->w) {
- int x = tile->x + ccl_global_id(0);
- kernel_do_adaptive_filter_y(&kg, x, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
-{
- if(kernel_data.film.pass_adaptive_aux_buffer) {
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if(sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
- }
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_displace(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_displace_evaluate(&kg, input, output, x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_background(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_background_evaluate(&kg, input, output, x);
- }
-}
-
-#ifdef __BAKING__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
-
- if(work_index < total_work_size) {
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- KernelGlobals kg;
- kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-}
-#endif
-
-#endif
-
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
deleted file mode 100644
index 2e47ce2de6c..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* device data taken from CUDA occupancy calculator */
-
-/* 3.0 and 3.5 */
-#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.x, 6.x */
-#elif __CUDA_ARCH__ <= 699
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
- * registers */
-# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# else
-# define CUDA_KERNEL_MAX_REGISTERS 48
-# endif
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 7.x, 8.x */
-#elif __CUDA_ARCH__ <= 899
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
-
-/* unknown architecture */
-#else
-# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* For split kernel using all registers seems fastest for now, but this
- * is unlikely to be optimal once we resolve other bottlenecks. */
-
-#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
-
-/* Compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread. */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
- __launch_bounds__(threads_block_width *threads_block_width, \
- CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (threads_block_width * threads_block_width * thread_num_registers))
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-# error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
- CUDA_MULTIPROCESSOR_MAX_BLOCKS
-# error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
deleted file mode 100644
index 95ad7599cf1..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA split kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#define __SPLIT_KERNEL__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-#include "kernel/split/kernel_path_init.h"
-#include "kernel/split/kernel_scene_intersect.h"
-#include "kernel/split/kernel_lamp_emission.h"
-#include "kernel/split/kernel_do_volume.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-#include "kernel/split/kernel_indirect_background.h"
-#include "kernel/split/kernel_shader_setup.h"
-#include "kernel/split/kernel_shader_sort.h"
-#include "kernel/split/kernel_shader_eval.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-#include "kernel/split/kernel_direct_lighting.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
-#include "kernel/split/kernel_buffer_update.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#include "kernel/kernel_film.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
-{
- *size = split_data_buffer_size(NULL, num_threads);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace_data_init(
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer)
-{
- kernel_data_init(NULL,
- NULL,
- split_data_buffer,
- num_elements,
- ray_state,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- kernel_##name(NULL); \
- }
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- ccl_local type locals; \
- kernel_##name(NULL, &locals); \
- }
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-#endif
-
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
deleted file mode 100644
index 996bc27f71b..00000000000
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL kernel entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-__kernel void kernel_ocl_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- CCL_FILTER_TILE_INFO_ARG,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- CCL_FILTER_TILE_INFO_ARG,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 prefilter_rect,
- int r)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- ccl_global float *transform,
- ccl_global int *rank,
- int4 filter_area,
- int4 rect,
- int pass_stride,
- int frame_stride,
- char use_time,
- int radius,
- float pca_threshold)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- ccl_global int *l_rank = rank + y*filter_area.z + x;
- ccl_global float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- CCL_FILTER_TILE_INFO_ARG,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
- const ccl_global float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- char use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
deleted file mode 100644
index ebdb99d4730..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#define KERNEL_NAME adaptive_adjust_samples
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
deleted file mode 100644
index 76d82d4184e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-
-#define KERNEL_NAME adaptive_filter_x
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
deleted file mode 100644
index 1e6d15ba0f2..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-
-#define KERNEL_NAME adaptive_filter_y
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
deleted file mode 100644
index 51de0059667..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-
-#define KERNEL_NAME adaptive_stopping
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_background.cl
deleted file mode 100644
index 0e600676e82..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background.cl
+++ /dev/null
@@ -1,35 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_background(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_background_evaluate(kg, input, output, x);
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl b/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
deleted file mode 100644
index 7b81e387467..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_bake(
- ccl_constant KernelData *data,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- int sx, int sy, int sw, int sh, int offset, int stride, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh) {
-#ifndef __NO_BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-#endif
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_base.cl b/intern/cycles/kernel/kernels/opencl/kernel_base.cl
deleted file mode 100644
index 1c2d89e8a92..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_base.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL base kernels entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-
-#include "kernel/kernel_film.h"
-
-
-__kernel void kernel_ocl_convert_to_byte(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_convert_to_half_float(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)
-{
- size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
- if(i < size / sizeof(float4)) {
- buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- else if(i == size / sizeof(float4)) {
- ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
-
- for(i = 0; i < size % sizeof(float4); i++) {
- *(b++) = 0;
- }
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
deleted file mode 100644
index dcea2630aef..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_buffer_update.h"
-
-#define KERNEL_NAME buffer_update
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
deleted file mode 100644
index 7125348a49f..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-
-__kernel void kernel_ocl_path_trace_data_init(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- KERNEL_BUFFER_PARAMS,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
- ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
- unsigned int num_samples, /* Total number of samples per pixel */
- ccl_global float *buffer)
-{
- kernel_data_init((KernelGlobals*)kg,
- data,
- split_data_buffer,
- num_elements,
- ray_state,
- KERNEL_BUFFER_ARGS,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
deleted file mode 100644
index ed64ae01aae..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_direct_lighting.h"
-
-#define KERNEL_NAME direct_lighting
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl b/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
deleted file mode 100644
index 76cc36971f5..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_displace(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_displace_evaluate(kg, input, output, x);
- }
-}
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
deleted file mode 100644
index e68d4104a91..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
-
-#define KERNEL_NAME enqueue_inactive
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
deleted file mode 100644
index 9e1e57beba6..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-
-#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
-#define LOCALS_TYPE BackgroundAOLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
deleted file mode 100644
index 192d01444ba..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_background.h"
-
-#define KERNEL_NAME indirect_background
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
deleted file mode 100644
index 84938b889e5..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
-
-#define KERNEL_NAME indirect_subsurface
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
deleted file mode 100644
index c314dc96c33..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_lamp_emission.h"
-
-#define KERNEL_NAME lamp_emission
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
deleted file mode 100644
index 8b1332bf013..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-
-#define KERNEL_NAME next_iteration_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
deleted file mode 100644
index bb6b8a40e8e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright 2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_NANOVDB
-/* Data type to replace `double` used in the NanoVDB headers. Cycles don't need doubles, and is
- * safer and more portable to never use double datatype on GPU.
- * Use a special structure, so that the following is true:
- * - No unnoticed implicit cast or mathematical operations used on scalar 64bit type
- * (which rules out trick like using `uint64_t` as a drop-in replacement for double).
- * - Padding rules are matching exactly `double`
- * (which rules out array of `uint8_t`). */
-typedef struct ccl_vdb_double_t {
- uint64_t i;
-} ccl_vdb_double_t;
-
-# define double ccl_vdb_double_t
-# include "nanovdb/CNanoVDB.h"
-# undef double
-#endif
-
-/* For OpenCL we do manual lookup and interpolation. */
-
-ccl_device_inline ccl_global TextureInfo *kernel_tex_info(KernelGlobals *kg, uint id)
-{
- const uint tex_offset = id
-#define KERNEL_TEX(type, name) +1
-#include "kernel/kernel_textures.h"
- ;
-
- return &((ccl_global TextureInfo *)kg->buffers[0])[tex_offset];
-}
-
-#define tex_fetch(type, info, index) \
- ((ccl_global type *)(kg->buffers[info->cl_buffer] + info->data))[(index)]
-
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
-{
- x %= width;
- if (x < 0)
- x += width;
- return x;
-}
-
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
-{
- return clamp(x, 0, width - 1);
-}
-
-ccl_device_inline float4 svm_image_texture_read(
- KernelGlobals *kg, const ccl_global TextureInfo *info, void *acc, int x, int y, int z)
-{
- const int data_offset = x + info->width * y + info->width * info->height * z;
- const int texture_type = info->data_type;
-
- /* Float4 */
- if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
- return tex_fetch(float4, info, data_offset);
- }
- /* Byte4 */
- else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
- uchar4 r = tex_fetch(uchar4, info, data_offset);
- float f = 1.0f / 255.0f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Ushort4 */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
- ushort4 r = tex_fetch(ushort4, info, data_offset);
- float f = 1.0f / 65535.f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Float */
- else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
- float f = tex_fetch(float, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- /* UShort */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
- ushort r = tex_fetch(ushort, info, data_offset);
- float f = r * (1.0f / 65535.0f);
- return make_float4(f, f, f, 1.0f);
- }
-#ifdef WITH_NANOVDB
- /* NanoVDB Float */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- float f = cnanovdb_readaccessor_getValueF((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f, f, f, 1.0f);
- }
- /* NanoVDB Float3 */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- cnanovdb_Vec3F f = cnanovdb_readaccessor_getValueF3((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f.mVec[0], f.mVec[1], f.mVec[2], 1.0f);
- }
-#endif
-#ifdef __KERNEL_CL_KHR_FP16__
- /* Half and Half4 are optional in OpenCL */
- else if (texture_type == IMAGE_DATA_TYPE_HALF) {
- float f = tex_fetch(half, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- else if (texture_type == IMAGE_DATA_TYPE_HALF4) {
- half4 r = tex_fetch(half4, info, data_offset);
- return make_float4(r.x, r.y, r.z, r.w);
- }
-#endif
- /* Byte */
- else {
- uchar r = tex_fetch(uchar, info, data_offset);
- float f = r * (1.0f / 255.0f);
- return make_float4(f, f, f, 1.0f);
- }
-}
-
-ccl_device_inline float4
-svm_image_texture_read_2d(KernelGlobals *kg, int id, void *acc, int x, int y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, 0);
-}
-
-ccl_device_inline float4
-svm_image_texture_read_3d(KernelGlobals *kg, int id, void *acc, int x, int y, int z)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- z = svm_image_texture_wrap_periodic(z, info->depth);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- z = svm_image_texture_wrap_clamp(z, info->depth);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, z);
-}
-
-ccl_device_inline float svm_image_texture_frac(float x, int *ix)
-{
- int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
- *ix = i;
- return x - (float)i;
-}
-
-#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
- { \
- u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
- u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
- u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
- u[3] = (1.0f / 6.0f) * t * t * t; \
- } \
- (void)0
-
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- if (info->interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy;
- svm_image_texture_frac(x * info->width, &ix);
- svm_image_texture_frac(y * info->height, &iy);
-
- return svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- }
- else if (info->interpolation == INTERPOLATION_LINEAR) {
- /* Bilinear interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float4 r;
- r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- r += (1.0f - ty) * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy);
- r += ty * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy + 1);
- r += ty * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy + 1);
- return r;
- }
- else {
- /* Bicubic interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float u[4], v[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y];
- r += weight * svm_image_texture_read_2d(kg, id, NULL, ix + x - 1, iy + y - 1);
- }
- }
- return r;
- }
-}
-
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->use_transform_3d) {
- Transform tfm = info->transform_3d;
- P = transform_point(&tfm, P);
- }
-
- float x = P.x;
- float y = P.y;
- float z = P.z;
-
- uint interpolation = (interp == INTERPOLATION_NONE) ? info->interpolation : interp;
-
-#ifdef WITH_NANOVDB
- cnanovdb_readaccessor acc;
- if (info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
- info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- ccl_global cnanovdb_griddata *grid =
- (ccl_global cnanovdb_griddata *)(kg->buffers[info->cl_buffer] + info->data);
- cnanovdb_readaccessor_init(&acc, cnanovdb_treedata_rootF(cnanovdb_griddata_tree(grid)));
- }
- else {
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- x *= info->width;
- y *= info->height;
- z *= info->depth;
- }
-# define NANOVDB_ACCESS_POINTER &acc
-#else
-# define NANOVDB_ACCESS_POINTER NULL
-#endif
-
- if (interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy, iz;
- svm_image_texture_frac(x, &ix);
- svm_image_texture_frac(y, &iy);
- svm_image_texture_frac(z, &iz);
-
- return svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- }
- else if (interpolation == INTERPOLATION_LINEAR) {
- /* Trilinear interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float4 r;
- r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- r += (1.0f - tz) * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz);
- r += (1.0f - tz) * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz);
- r += (1.0f - tz) * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz);
-
- r += tz * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz + 1);
- r += tz * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz + 1);
- r += tz * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz + 1);
- r += tz * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz + 1);
- return r;
- }
- else {
- /* Tricubic interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float u[4], v[4], w[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
- SET_CUBIC_SPLINE_WEIGHTS(w, tz);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int z = 0; z < 4; z++) {
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y] * w[z];
- r += weight * svm_image_texture_read_3d(
- kg, id, NANOVDB_ACCESS_POINTER, ix + x - 1, iy + y - 1, iz + z - 1);
- }
- }
- }
- return r;
- }
-#undef NANOVDB_ACCESS_POINTER
-}
-
-#undef SET_CUBIC_SPLINE_WEIGHTS
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
deleted file mode 100644
index fa210e747c0..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_path_init.h"
-
-#define KERNEL_NAME path_init
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
deleted file mode 100644
index 68ee6f1d536..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-
-#define KERNEL_NAME queue_enqueue
-#define LOCALS_TYPE QueueEnqueueLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
deleted file mode 100644
index 10d09377ba9..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_scene_intersect.h"
-
-#define KERNEL_NAME scene_intersect
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
deleted file mode 100644
index 40eaa561863..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_eval.h"
-
-#define KERNEL_NAME shader_eval
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
deleted file mode 100644
index 8c36100f762..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_setup.h"
-
-#define KERNEL_NAME shader_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
deleted file mode 100644
index bcacaa4a054..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_sort.h"
-
-__attribute__((reqd_work_group_size(64, 1, 1)))
-#define KERNEL_NAME shader_sort
-#define LOCALS_TYPE ShaderSortLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
deleted file mode 100644
index 8de250a375c..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-
-#define KERNEL_NAME shadow_blocked_ao
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
deleted file mode 100644
index 29da77022ed..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-
-#define KERNEL_NAME shadow_blocked_dl
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
deleted file mode 100644
index c3b7b09460a..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h" // PRECOMPILED
-#include "kernel/split/kernel_split_common.h" // PRECOMPILED
-
-#include "kernel/kernels/opencl/kernel_data_init.cl"
-#include "kernel/kernels/opencl/kernel_path_init.cl"
-#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
-#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
-#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
-#include "kernel/kernels/opencl/kernel_shader_setup.cl"
-#include "kernel/kernels/opencl/kernel_shader_sort.cl"
-#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
-#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
-#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
-#include "kernel/kernels/opencl/kernel_buffer_update.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
deleted file mode 100644
index e123b4cd6ec..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define KERNEL_NAME_JOIN(a, b) a##_##b
-#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
-
-__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace,
- KERNEL_NAME)(ccl_global char *kg_global,
- ccl_constant KernelData *data,
-
- ccl_global void *split_data_buffer,
- ccl_global char *ray_state,
-
- KERNEL_BUFFER_PARAMS,
-
- ccl_global int *queue_index,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pools,
- ccl_global float *buffer)
-{
-#ifdef LOCALS_TYPE
- ccl_local LOCALS_TYPE locals;
-#endif
-
- KernelGlobals *kg = (KernelGlobals *)kg_global;
-
- if (ccl_local_id(0) + ccl_local_id(1) == 0) {
- kg->data = data;
-
- kernel_split_params.queue_index = queue_index;
- kernel_split_params.use_queues_flag = use_queues_flag;
- kernel_split_params.work_pools = work_pools;
- kernel_split_params.tile.buffer = buffer;
-
- split_data_init(kg,
- &kernel_split_state,
- ccl_global_size(0) * ccl_global_size(1),
- split_data_buffer,
- ray_state);
- }
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
-
- KERNEL_NAME_EVAL(kernel, KERNEL_NAME)
- (kg
-#ifdef LOCALS_TYPE
- ,
- &locals
-#endif
- );
-}
-
-#undef KERNEL_NAME_JOIN
-#undef KERNEL_NAME_EVAL
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
deleted file mode 100644
index 2b3be38df84..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-
-#define KERNEL_NAME subsurface_scatter
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index 3f9de5ab33d..8e497986dcc 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
// clang-format on
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index 76a2e41abfa..a2f9d3f759a 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index b78dc8a3a67..812c3b6e71b 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index d656723bac2..80dfbee879e 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/kernel_types.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index c5ca8616fbd..5d968ed85e0 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,7 +32,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
@@ -50,45 +50,30 @@ CCL_NAMESPACE_BEGIN
using namespace OSL;
-static ustring u_cubic("cubic");
-static ustring u_gaussian("gaussian");
-static ustring u_burley("burley");
-static ustring u_principled("principled");
+static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
static ustring u_random_walk("random_walk");
-static ustring u_principled_random_walk("principled_random_walk");
class CBSSRDFClosure : public CClosurePrimitive {
public:
Bssrdf params;
+ float ior;
ustring method;
CBSSRDFClosure()
{
- params.texture_blur = 0.0f;
- params.sharpness = 0.0f;
- params.roughness = 0.0f;
+ params.roughness = FLT_MAX;
+ params.anisotropy = 1.0f;
+ ior = 1.4f;
}
void setup(ShaderData *sd, int path_flag, float3 weight)
{
- if (method == u_cubic) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID);
- }
- else if (method == u_gaussian) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID);
- }
- else if (method == u_burley) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
- }
- else if (method == u_principled) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+ if (method == u_random_walk_fixed_radius) {
+ alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
}
else if (method == u_random_walk) {
alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID);
}
- else if (method == u_principled_random_walk) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
- }
}
void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type)
@@ -106,11 +91,10 @@ class CBSSRDFClosure : public CClosurePrimitive {
/* create one closure per color channel */
bssrdf->radius = params.radius;
bssrdf->albedo = params.albedo;
- bssrdf->texture_blur = params.texture_blur;
- bssrdf->sharpness = params.sharpness;
bssrdf->N = params.N;
bssrdf->roughness = params.roughness;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->anisotropy = clamp(params.anisotropy, 0.0f, 0.9f);
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, clamp(ior, 1.01f, 3.8f));
}
}
};
@@ -122,9 +106,9 @@ ClosureParam *closure_bssrdf_params()
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"),
CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, ior, "ior"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.anisotropy, "anisotropy"),
CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"),
CLOSURE_FINISH_PARAM(CBSSRDFClosure)};
return params;
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 7ee467a46dd..e814fcca246 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -40,10 +40,10 @@
#include "util/util_param.h"
// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_types.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_random.h"
@@ -500,7 +500,7 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
{
/* caustic options */
if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
(!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 2b7c21d0bc4..396f42080e4 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -40,22 +40,22 @@
#include "util/util_string.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_projection.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/image.h"
+
#include "kernel/kernel_differential.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_camera.h"
-#include "kernel/kernels/cpu/kernel_cpu_image.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+
#include "kernel/geom/geom.h"
#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_color.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
#include "kernel/kernel_projection.h"
-#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
// clang-format on
@@ -147,7 +147,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -155,18 +155,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
Transform tfm;
if (time == sd->time)
- tfm = sd->ob_tfm;
+ tfm = object_get_transform(kg, sd);
else
tfm = object_fetch_transform_motion_test(kg, object, time, NULL);
#else
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+ const Transform tfm = object_get_transform(kg, sd);
#endif
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -184,7 +185,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -192,18 +193,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
Transform itfm;
if (time == sd->time)
- itfm = sd->ob_itfm;
+ itfm = object_get_inverse_transform(kg, sd);
else
object_fetch_transform_motion_test(kg, object, time, &itfm);
#else
- Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ const Transform itfm = object_get_inverse_transform(kg, sd);
#endif
copy_matrix(result, itfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -218,7 +220,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -250,7 +252,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -284,21 +286,18 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-#endif
+ const Transform tfm = object_get_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -315,21 +314,18 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-#endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -341,7 +337,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -368,7 +364,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
ustring to)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -747,7 +743,7 @@ static bool set_attribute_matrix(const Transform &tfm, TypeDesc type, void *val)
return false;
}
-static bool get_primitive_attribute(KernelGlobals *kg,
+static bool get_primitive_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -808,7 +804,7 @@ static bool get_primitive_attribute(KernelGlobals *kg,
}
}
-static bool get_mesh_attribute(KernelGlobals *kg,
+static bool get_mesh_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -857,8 +853,12 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr,
}
}
-bool OSLRenderServices::get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
/* todo: turn this into hash table? */
@@ -988,8 +988,12 @@ bool OSLRenderServices::get_object_standard_attribute(
return false;
}
-bool OSLRenderServices::get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
if (name == u_path_ray_length) {
/* Ray Length */
@@ -998,38 +1002,32 @@ bool OSLRenderServices::get_background_attribute(
}
else if (name == u_path_ray_depth) {
/* Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_diffuse_depth) {
/* Diffuse Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->diffuse_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.diffuse_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_glossy_depth) {
/* Glossy Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->glossy_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.glossy_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transmission_depth) {
/* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transmission_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transparent_depth) {
/* Transparent Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transparent_bounce;
- return set_attribute_int(f, type, derivatives, val);
- }
- else if (name == u_path_transmission_depth) {
- /* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transparent_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_ndc) {
@@ -1043,8 +1041,10 @@ bool OSLRenderServices::get_background_attribute(
ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P);
if (derivatives) {
- ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx) - ndc[0];
- ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy) - ndc[0];
+ ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) -
+ ndc[0];
+ ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) -
+ ndc[0];
}
}
else {
@@ -1079,7 +1079,7 @@ bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_attribute(
ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val)
{
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int prim_type = 0;
int object;
@@ -1208,17 +1208,17 @@ bool OSLRenderServices::texture(ustring filename,
OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
bool status = false;
switch (texture_type) {
case OSLTextureHandle::BEVEL: {
/* Bevel shader hack. */
if (nchannels >= 3) {
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
- float3 N = svm_bevel(kernel_globals, sd, state, radius, num_samples);
+ float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
result[0] = N.x;
result[1] = N.y;
result[2] = N.z;
@@ -1228,7 +1228,7 @@ bool OSLRenderServices::texture(ustring filename,
}
case OSLTextureHandle::AO: {
/* AO shader hack. */
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
float3 N = make_float3(dsdx, dtdx, dsdy);
@@ -1242,7 +1242,7 @@ bool OSLRenderServices::texture(ustring filename,
if ((int)options.tblur) {
flags |= NODE_AO_GLOBAL_RADIUS;
}
- result[0] = svm_ao(kernel_globals, sd, N, state, radius, num_samples, flags);
+ result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
status = true;
break;
}
@@ -1355,7 +1355,7 @@ bool OSLRenderServices::texture3d(ustring filename,
case OSLTextureHandle::SVM: {
/* Packed texture. */
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
int slot = handle->svm_slot;
float3 P_float3 = make_float3(P.x, P.y, P.z);
float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE);
@@ -1377,7 +1377,7 @@ bool OSLRenderServices::texture3d(ustring filename,
if (handle && handle->oiio_handle) {
if (texture_thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
texture_thread_info = tdata->oiio_thread_info;
}
@@ -1462,7 +1462,7 @@ bool OSLRenderServices::environment(ustring filename,
if (handle && handle->oiio_handle) {
if (thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
thread_info = tdata->oiio_thread_info;
}
@@ -1600,10 +1600,14 @@ bool OSLRenderServices::trace(TraceOpt &options,
}
/* ray differentials */
- ray.dP.dx = TO_FLOAT3(dPdx);
- ray.dP.dy = TO_FLOAT3(dPdy);
- ray.dD.dx = TO_FLOAT3(dRdx);
- ray.dD.dy = TO_FLOAT3(dRdy);
+ differential3 dP;
+ dP.dx = TO_FLOAT3(dPdx);
+ dP.dy = TO_FLOAT3(dPdy);
+ ray.dP = differential_make_compact(dP);
+ differential3 dD;
+ dD.dx = TO_FLOAT3(dRdx);
+ dD.dy = TO_FLOAT3(dRdy);
+ ray.dD = differential_make_compact(dD);
/* allocate trace data */
OSLTraceData *tracedata = (OSLTraceData *)sg->tracedata;
@@ -1613,7 +1617,7 @@ bool OSLRenderServices::trace(TraceOpt &options,
tracedata->hit = false;
tracedata->sd.osl_globals = sd->osl_globals;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
/* Can't raytrace from shaders like displacement, before BVH exists. */
if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
@@ -1646,11 +1650,11 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
}
else {
ShaderData *sd = &tracedata->sd;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (!tracedata->setup) {
/* lazy shader data setup */
- shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray);
+ shader_setup_from_ray(kg, sd, &tracedata->ray, &tracedata->isect);
tracedata->setup = true;
}
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 891b9172dd4..58accb46e7d 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -250,10 +250,18 @@ class OSLRenderServices : public OSL::RendererServices {
void *data) override;
#endif
- static bool get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
- static bool get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
+ static bool get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
+ static bool get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
static ustring u_distance;
static ustring u_index;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 389c854c495..880ef635c76 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -17,14 +17,16 @@
#include <OSL/oslexec.h>
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/geom/geom_object.h"
+#include "kernel/integrator/integrator_state.h"
+
#include "kernel/osl/osl_closures.h"
#include "kernel/osl/osl_globals.h"
#include "kernel/osl/osl_services.h"
@@ -39,9 +41,7 @@ CCL_NAMESPACE_BEGIN
/* Threads */
-void OSLShader::thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals)
+void OSLShader::thread_init(KernelGlobals *kg, OSLGlobals *osl_globals)
{
/* no osl used? */
if (!osl_globals->use) {
@@ -87,8 +87,11 @@ void OSLShader::thread_free(KernelGlobals *kg)
/* Globals */
-static void shaderdata_to_shaderglobals(
- KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, OSLThreadData *tdata)
+static void shaderdata_to_shaderglobals(const KernelGlobals *kg,
+ ShaderData *sd,
+ const IntegratorStateCPU *state,
+ int path_flag,
+ OSLThreadData *tdata)
{
OSL::ShaderGlobals *globals = &tdata->globals;
@@ -171,7 +174,10 @@ static void flatten_surface_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -276,7 +282,10 @@ static void flatten_background_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -331,7 +340,10 @@ static void flatten_volume_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -354,7 +366,9 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
/* Displacement */
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state)
+void OSLShader::eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -377,7 +391,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *
/* Attributes */
-int OSLShader::find_attribute(KernelGlobals *kg,
+int OSLShader::find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc)
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index a4fa24d0a90..f1f17b141eb 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -37,6 +37,7 @@ class Scene;
struct ShaderClosure;
struct ShaderData;
+struct IntegratorStateCPU;
struct differential3;
struct KernelGlobals;
@@ -49,19 +50,28 @@ class OSLShader {
static void register_closures(OSLShadingSystem *ss);
/* per thread data */
- static void thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals);
+ static void thread_init(KernelGlobals *kg, OSLGlobals *osl_globals);
static void thread_free(KernelGlobals *kg);
/* eval */
- static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state);
+ static void eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd);
/* attributes */
- static int find_attribute(KernelGlobals *kg,
+ static int find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 23949f406c7..55afb892d36 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -18,11 +18,13 @@
#include "stdcycles.h"
shader node_principled_bsdf(string distribution = "Multiscatter GGX",
- string subsurface_method = "burley",
+ string subsurface_method = "random_walk",
color BaseColor = color(0.8, 0.8, 0.8),
float Subsurface = 0.0,
vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
color SubsurfaceColor = color(0.7, 0.1, 0.1),
+ float SubsurfaceIOR = 1.4,
+ float SubsurfaceAnisotropy = 0.0,
float Metallic = 0.0,
float Specular = 0.5,
float SpecularTint = 0.0,
@@ -59,22 +61,17 @@ shader node_principled_bsdf(string distribution = "Multiscatter GGX",
if (diffuse_weight > 1e-5) {
if (Subsurface > 1e-5) {
color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
- if (subsurface_method == "burley") {
- BSDF = mixed_ss_base_color * bssrdf("principled",
- Normal,
- Subsurface * SubsurfaceRadius,
- SubsurfaceColor,
- "roughness",
- Roughness);
- }
- else {
- BSDF = mixed_ss_base_color * bssrdf("principled_random_walk",
- Normal,
- Subsurface * SubsurfaceRadius,
- mixed_ss_base_color,
- "roughness",
- Roughness);
- }
+
+ BSDF = mixed_ss_base_color * bssrdf(subsurface_method,
+ Normal,
+ Subsurface * SubsurfaceRadius,
+ mixed_ss_base_color,
+ "roughness",
+ Roughness,
+ "ior",
+ SubsurfaceIOR,
+ "anisotropy",
+ SubsurfaceAnisotropy);
}
else {
BSDF = BaseColor * principled_diffuse(Normal, Roughness);
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index b1e854150ab..f55e38c54ff 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -19,27 +19,12 @@
shader node_subsurface_scattering(color Color = 0.8,
float Scale = 1.0,
vector Radius = vector(0.1, 0.1, 0.1),
- float TextureBlur = 0.0,
- float Sharpness = 0.0,
- string falloff = "cubic",
+ float IOR = 1.4,
+ float Anisotropy = 0.0,
+ string method = "random_walk",
normal Normal = N,
output closure color BSSRDF = 0)
{
- if (falloff == "gaussian")
- BSSRDF = Color *
- bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else if (falloff == "cubic")
- BSSRDF = Color * bssrdf("cubic",
- Normal,
- Scale * Radius,
- Color,
- "texture_blur",
- TextureBlur,
- "sharpness",
- Sharpness);
- else if (falloff == "burley")
- BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else
- BSSRDF = Color *
- bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
+ BSSRDF = Color *
+ bssrdf(method, Normal, Scale * Radius, Color, "ior", IOR, "anisotropy", Anisotropy);
}
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
deleted file mode 100644
index 437a5c9581b..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
deleted file mode 100644
index 93f41f7ced4..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int y = kernel_split_params.tile.y + pixel_index;
- kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
deleted file mode 100644
index eca53d079ec..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index;
- kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
- }
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
deleted file mode 100644
index c8eb1ebd705..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_stopping.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- kernel_do_adaptive_stopping(kg,
- buffer,
- kernel_split_params.tile.start_sample +
- kernel_split_params.tile.num_samples - 1);
- }
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
deleted file mode 100644
index 45f5037d321..00000000000
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-/* sets up the various state needed to do an indirect loop */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* save a copy of the state to restore later */
-# define BRANCHED_STORE(name) branched_state->name = kernel_split_state.name[ray_index];
-
- BRANCHED_STORE(path_state);
- BRANCHED_STORE(throughput);
- BRANCHED_STORE(ray);
- BRANCHED_STORE(isect);
- BRANCHED_STORE(ray_state);
-
- *kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(branched_state_sd, ray_index)->closure[i] =
- kernel_split_sd(sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_STORE
-
- /* Set loop counters to initial position. */
- branched_state->next_closure = 0;
- branched_state->next_sample = 0;
-}
-
-/* ends an indirect loop and restores the previous state */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* restore state */
-# define BRANCHED_RESTORE(name) kernel_split_state.name[ray_index] = branched_state->name;
-
- BRANCHED_RESTORE(path_state);
- BRANCHED_RESTORE(throughput);
- BRANCHED_RESTORE(ray);
- BRANCHED_RESTORE(isect);
- BRANCHED_RESTORE(ray_state);
-
- *kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, ray_index)->closure[i] =
- kernel_split_sd(branched_state_sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_RESTORE
-
- /* leave indirect loop */
- REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
-}
-
-ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg,
- int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- kernel_split_params.queue_index);
-
- if (!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
- return false;
- }
-
-# define SPLIT_DATA_ENTRY(type, name, num) \
- if (num) { \
- kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \
- }
- SPLIT_DATA_ENTRIES_BRANCHED_SHARED
-# undef SPLIT_DATA_ENTRY
-
- *kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i];
- }
-
- kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
- kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
- kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
-
- path_radiance_init(kg, inactive_L);
- path_radiance_copy_indirect(inactive_L, L);
-
- ray_state[inactive_ray] = RAY_REGENERATED;
- ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
- ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
-
- atomic_fetch_and_inc_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[ray_index].shared_sample_count);
-
- return true;
-}
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
- KernelGlobals *kg,
- int ray_index,
- float num_samples_adjust,
- ShaderData *saved_sd,
- bool reset_path_state,
- bool wait_for_shared)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = saved_sd;
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- float3 throughput = branched_state->throughput;
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
-
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (ps->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = branched_state->next_closure; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSDF(sc->type))
- continue;
- /* transparency is not handled here, but in outer loop */
- if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
- continue;
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- if (reset_path_state) {
- *ps = branched_state->path_state;
- }
-
- ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = throughput;
-
- ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, tp, ps, &L->state, bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps->rng_hash = branched_state->path_state.rng_hash;
-
- /* update state for next iteration */
- branched_state->next_closure = i;
- branched_state->next_sample = j + 1;
-
- /* start the indirect path */
- *tp *= num_samples_inv;
-
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
-
- return true;
- }
-
- branched_state->next_sample = 0;
- }
-
- branched_state->next_closure = sd->num_closure;
-
- if (wait_for_shared) {
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
- }
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
deleted file mode 100644
index b96feca582f..00000000000
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of rays that hit the background (sceneintersect
- * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
- * accumulated radiance in the output buffer. This kernel also takes care of
- * rays that have been determined to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel.
- * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
- * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- */
-ccl_device void kernel_buffer_update(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (ray_index == 0) {
- /* We will empty this queue in this kernel. */
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
- char enqueue_flag = 0;
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- bool ray_was_updated = false;
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ray_was_updated = true;
- uint sample = state->sample;
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- /* accumulate result in output buffer */
- kernel_write_result(kg, buffer, sample, L);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
-
- if (kernel_data.film.cryptomatte_passes) {
- /* Make sure no thread is writing to the buffers. */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- /* We have completed current work; So get next work */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* If work is invalid, this means no more work is available and the thread may exit */
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
-
- if (ray->t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- *throughput = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, L);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- state,
- rng_hash,
- sample,
- ray);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- enqueue_flag = 1;
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
- }
- }
- }
-
- /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- * These rays will be made active during next SceneIntersectkernel.
- */
- enqueue_ray_index_local(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
deleted file mode 100644
index 2f83a10316d..00000000000
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel Initializes structures needed in path-iteration kernels.
- *
- * Note on Queues:
- * All slots in queues are initialized to queue empty slot;
- * The number of elements in the queues is initialized to 0;
- */
-
-#ifndef __KERNEL_CPU__
-ccl_device void kernel_data_init(
-#else
-void KERNEL_FUNCTION_FULL_NAME(data_init)(
-#endif
- KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
-
-#ifdef __KERNEL_OPENCL__
- KERNEL_BUFFER_PARAMS,
-#endif
-
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues
- to fetch ray index */
- ccl_global unsigned int *work_pools, /* Work pool for each work group */
- unsigned int num_samples,
- ccl_global float *buffer)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, data_init);
-#else
-
-# ifdef __KERNEL_OPENCL__
- kg->data = data;
-# endif
-
- kernel_split_params.tile.x = sx;
- kernel_split_params.tile.y = sy;
- kernel_split_params.tile.w = sw;
- kernel_split_params.tile.h = sh;
-
- kernel_split_params.tile.start_sample = start_sample;
- kernel_split_params.tile.num_samples = num_samples;
-
- kernel_split_params.tile.offset = offset;
- kernel_split_params.tile.stride = stride;
-
- kernel_split_params.tile.buffer = buffer;
-
- kernel_split_params.total_work_size = sw * sh * num_samples;
-
- kernel_split_params.work_pools = work_pools;
-
- kernel_split_params.queue_index = Queue_index;
- kernel_split_params.queue_size = queuesize;
- kernel_split_params.use_queues_flag = use_queues_flag;
-
- split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
-
-# ifdef __KERNEL_OPENCL__
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-# endif
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- /* Initialize queue data and queue index. */
- if (thread_index < queuesize) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- }
-
- if (thread_index == 0) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- Queue_index[i] = 0;
- }
-
- /* The scene-intersect kernel should not use the queues very first time.
- * since the queue would be empty.
- */
- *use_queues_flag = 0;
- }
-#endif /* KERENL_STUB */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
deleted file mode 100644
index 3be2b35812f..00000000000
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of direct lighting logic.
- * However, the "shadow ray cast" part of direct lighting is handled
- * in the next kernel.
- *
- * This kernels determines the rays for which a shadow_blocked() function
- * associated with direct lighting should be executed. Those rays for which
- * a shadow_blocked() function for direct-lighting must be executed, are
- * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS
- *
- * Note on Queues:
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
- * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute
- * the corresponding shadow_blocked part, after direct lighting, the ray is
- * marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called:
- * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this
- * kernel call.
- * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a
- * shadow_blocked function must be executed, after this kernel call
- * Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
- */
-ccl_device void kernel_direct_lighting(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- char enqueue_flag = 0;
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- /* direct lighting */
-#ifdef __EMISSION__
- bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL));
-
-# ifdef __BRANCHED_PATH__
- if (flag && kernel_data.integrator.branched) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __BRANCHED_PATH__ */
-
-# ifdef __SHADOW_TRICKS__
- if (flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __SHADOW_TRICKS__ */
-
- if (flag) {
- /* Sample illumination from lights to find path contribution. */
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_state_rng_light_termination(kg, state);
-
- LightSample ls;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- Ray light_ray;
- light_ray.time = sd->time;
-
- BsdfEval L_light;
- bool is_lamp;
- if (direct_emission(kg,
- sd,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &ls,
- state,
- &light_ray,
- &L_light,
- &is_lamp,
- terminate)) {
- /* Write intermediate data to global memory to access from
- * the next kernel.
- */
- kernel_split_state.light_ray[ray_index] = light_ray;
- kernel_split_state.bsdf_eval[ray_index] = L_light;
- kernel_split_state.is_lamp[ray_index] = is_lamp;
- /* Mark ray state for next shadow kernel. */
- enqueue_flag = 1;
- }
- }
- }
-#endif /* __EMISSION__ */
- }
-
-#ifdef __EMISSION__
- /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-
-#ifdef __BRANCHED_PATH__
- /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays
- * this is the last kernel before next_iteration_setup that uses local atomics so we do this here
- */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_LIGHT_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
deleted file mode 100644
index 1775e870f07..00000000000
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
-
-ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- Ray volume_ray = branched_state->ray;
- volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ?
- branched_state->isect.t :
- FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack);
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
- *ps = branched_state->path_state;
-
- ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
- *pray = branched_state->ray;
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = branched_state->throughput * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(ps, j, num_samples);
-
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, ps, sd, &volume_ray, L, tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
-
- /* indirect light bounce */
- if (!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
- continue;
- }
-
- /* start the indirect path */
- branched_state->next_closure = 0;
- branched_state->next_sample = j + 1;
-
- /* Attempting to share too many samples is slow for volumes as it causes us to
- * loop here more and have many calls to kernel_volume_integrate which evaluates
- * shaders. The many expensive shader evaluations cause the work load to become
- * unbalanced and many threads to become idle in this kernel. Limiting the
- * number of shared samples here helps quite a lot.
- */
- if (branched_state->shared_sample_count < 2) {
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
- }
-
- return true;
- }
-# endif
- }
-
- branched_state->next_sample = num_samples;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- /* todo: avoid this calculation using decoupled ray marching */
- float3 throughput = kernel_split_state.throughput[ray_index];
- kernel_volume_shadow(
- kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
- kernel_split_state.throughput[ray_index] = throughput;
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __VOLUME__ */
-
-ccl_device void kernel_do_volume(KernelGlobals *kg)
-{
-#ifdef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-# ifdef __BRANCHED_PATH__
- kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
-# endif /* __BRANCHED_PATH__ */
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (*kernel_split_params.use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- bool hit = !IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
- /* volume attenuation, emission, scatter */
- if (state->volume_stack[0].shader != SHADER_NONE) {
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif /* __BRANCHED_PATH__ */
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_path_end(kg, ray_index);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
- }
- }
-
-# ifdef __BRANCHED_PATH__
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_VOLUME_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __VOLUME__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
deleted file mode 100644
index 745313f89f1..00000000000
--- a/intern/cycles/kernel/split/kernel_enqueue_inactive.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
-#ifdef __BRANCHED_PATH__
- /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- char enqueue_flag = 0;
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
- enqueue_flag = 1;
- }
-
- enqueue_ray_index_local(ray_index,
- QUEUE_INACTIVE_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
deleted file mode 100644
index 61722840b0b..00000000000
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the logic to process "material of type holdout",
- * indirect primitive emission, bsdf blurring, probabilistic path termination
- * and AO.
- *
- * This kernels determines the rays for which a shadow_blocked() function
- * associated with AO should be executed. Those rays for which a
- * shadow_blocked() function for AO must be executed are marked with flag
- * RAY_SHADOW_RAY_CAST_ao and enqueued into the queue
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS
- *
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
- *
- * Note on Queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
- * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
- * been changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
- * flag RAY_SHADOW_RAY_CAST_AO
- */
-
-ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
- KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- locals->queue_atomics_bg = 0;
- locals->queue_atomics_ao = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
-#ifdef __AO__
- char enqueue_flag = 0;
-#endif
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global PathState *state = 0x0;
- float3 throughput;
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- throughput = kernel_split_state.throughput[ray_index];
- state = &kernel_split_state.path_state[ray_index];
-
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, buffer)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* Path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate.
- */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- kernel_split_path_end(kg, ray_index);
- }
- else if (probability < 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability) {
- kernel_split_path_end(kg, ray_index);
- }
- else {
- kernel_split_state.throughput[ray_index] = throughput / probability;
- }
- }
-
-#ifdef __DENOISING_FEATURES__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- kernel_update_denoising_features(kg, sd, state, L);
- }
-#endif
- }
-
-#ifdef __AO__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- enqueue_flag = 1;
- }
- }
-#endif /* __AO__ */
- }
-
-#ifdef __AO__
- /* Enqueue to-shadow-ray-cast rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- &locals->queue_atomics_ao,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
deleted file mode 100644
index 6d500650cc0..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_background(KernelGlobals *kg)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int ray_index;
-
- if (kernel_data.integrator.ao_bounces != INT_MAX) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- if (path_state_ao_bounce(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
- }
- }
-
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- kernel_path_background(kg, state, ray, throughput, sd, buffer, L);
- kernel_split_path_end(kg, ray_index);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
deleted file mode 100644
index 3f48f8d6f56..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index;
- get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect->num_rays) {
- kernel_path_subsurface_setup_indirect(kg, ss_indirect, state, ray, L, throughput);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
deleted file mode 100644
index 7ecb099208d..00000000000
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
- * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- */
-ccl_device void kernel_lamp_emission(KernelGlobals *kg)
-{
-#ifndef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- }
-#endif
- /* Fetch use_queues_flag. */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
-#ifndef __VOLUME__
- 1
-#else
- 0
-#endif
- );
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- float3 throughput = kernel_split_state.throughput[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
deleted file mode 100644
index 320f6a414bf..00000000000
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/*This kernel takes care of setting up ray for the next iteration of
- * path-iteration and accumulating radiance corresponding to AO and
- * direct-lighting
- *
- * Ray state of rays that are terminated in this kernel are changed
- * to RAY_UPDATE_BUFFER.
- *
- * Note on queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach RAY_UPDATE_BUFF state. These rays are enqueued into
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
- * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
- * been changed to RAY_UPDATE_BUFF, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
- */
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
-}
-
-ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index)
-{
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
- /* continue in case of transparency */
- *throughput *= shader_bsdf_transparency(kg, sd);
-
- if (is_zero(*throughput)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
-
- /* Update Path State */
- path_state_next(kg, state, LABEL_TRANSPARENT);
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
- }
-# endif
-
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->t -= sd->ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD.dx = -sd->dI.dx;
- ray->dD.dy = -sd->dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif /* __VOLUME__ */
-}
-#endif /* __BRANCHED_PATH__ */
-
-ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- /* If we are here, then it means that scene-intersect kernel
- * has already been executed at least once. From the next time,
- * scene-intersect kernel may operate on queues to fetch ray index
- */
- *kernel_split_params.use_queues_flag = 1;
-
- /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
- * previous kernel.
- */
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __VOLUME__
- /* Reactivate only volume rays here, most surface work was skipped. */
- if (IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
- }
-#endif
-
- bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
- if (active) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- /* Compute direct lighting and next bounce. */
- if (!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
- kernel_split_path_end(kg, ray_index);
- }
-#ifdef __BRANCHED_PATH__
- }
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- else {
- kernel_split_branched_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-#endif /* __BRANCHED_PATH__ */
- }
-
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#ifdef __BRANCHED_PATH__
- /* iter loop */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
- }
-
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_LIGHT_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-
-# ifdef __VOLUME__
- /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_VOLUME_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-# endif /* __VOLUME__ */
-
-# ifdef __SUBSURFACE__
- /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-# endif /* __SUBSURFACE__ */
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
deleted file mode 100644
index c686f46a0cd..00000000000
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
- *
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- */
-ccl_device void kernel_path_init(KernelGlobals *kg)
-{
- int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
- /* This is the first assignment to ray_state;
- * So we don't use ASSIGN_RAY_STATE macro.
- */
- kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
-
- /* Get work. */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* No more work, mark ray as inactive */
- kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
-
- return;
- }
-
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &kernel_split_state.ray[ray_index]);
-
- if (kernel_split_state.ray[ray_index].t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, &kernel_split_state.path_radiance[ray_index]);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &kernel_split_state.path_state[ray_index],
- rng_hash,
- sample,
- &kernel_split_state.ray[ray_index]);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- }
- else {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
deleted file mode 100644
index 2db87f7a671..00000000000
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel enqueues rays of different ray state into their
- * appropriate queues:
- *
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel are enqueued in
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in pat
- * -iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of queue during other times this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
- * and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
-ccl_device void kernel_queue_enqueue(KernelGlobals *kg, ccl_local_param QueueEnqueueLocals *locals)
-{
- /* We have only 2 cases (Hit/Not-Hit) */
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (lidx == 0) {
- locals->queue_atomics[0] = 0;
- locals->queue_atomics[1] = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int queue_number = -1;
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
- queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- }
- else if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
- queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- }
-
- unsigned int my_lqidx;
- if (queue_number != -1) {
- my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (lidx == 0) {
- locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(
- QUEUE_ACTIVE_AND_REGENERATED_RAYS, locals->queue_atomics, kernel_split_params.queue_index);
- locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- locals->queue_atomics,
- kernel_split_params.queue_index);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- unsigned int my_gqidx;
- if (queue_number != -1) {
- my_gqidx = get_global_queue_index(
- queue_number, kernel_split_params.queue_size, my_lqidx, locals->queue_atomics);
- kernel_split_state.queue_data[my_gqidx] = ray_index;
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
deleted file mode 100644
index 9ac95aafd2f..00000000000
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of scene_intersect function.
- *
- * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
- * This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes
- * their ray state to RAY_HIT_BACKGROUND.
- */
-ccl_device void kernel_scene_intersect(KernelGlobals *kg)
-{
- /* Fetch use_queues_flag */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- /* All regenerated rays become active here */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
-#ifdef __BRANCHED_PATH__
- if (kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
- kernel_split_path_end(kg, ray_index);
- }
- else
-#endif /* __BRANCHED_PATH__ */
- {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- }
- }
-
- if (!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- return;
- }
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- Intersection isect;
- const int last_object = state->bounce > 0 ?
- intersection_get_object(kg, &kernel_split_state.isect[ray_index]) :
- OBJECT_NONE;
- bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L, last_object);
- kernel_split_state.isect[ray_index] = isect;
-
- if (!hit) {
- /* Change the state of rays that hit the background;
- * These rays undergo special processing in the
- * background_bufferUpdate kernel.
- */
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
deleted file mode 100644
index c760a2b2049..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel evaluates ShaderData structure from the values computed
- * by the previous kernels.
- */
-ccl_device void kernel_shader_eval(KernelGlobals *kg)
-{
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- /* Sorting on cuda split is not implemented */
-#ifdef __KERNEL_CUDA__
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
-#else
- int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
-#endif
- if (ray_index >= queue_index) {
- return;
- }
- ray_index = get_ray_index(kg,
- ray_index,
-#ifdef __KERNEL_CUDA__
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-#else
- QUEUE_SHADER_SORTED_RAYS,
-#endif
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, buffer, state->flag);
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- shader_merge_closures(kernel_split_sd(sd, ray_index));
- }
- else
-#endif
- {
- shader_prepare_closures(kernel_split_sd(sd, ray_index), state);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
deleted file mode 100644
index 551836d1653..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_setup.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel sets up the ShaderData structure from the values computed
- * by the previous kernels.
- *
- * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
- * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- */
-ccl_device void kernel_shader_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (ray_index < queue_index) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
- }
- else {
- ray_index = QUEUE_EMPTY_SLOT;
- }
-
- char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 :
- 0;
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
- /* Continue on with shader evaluation. */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- Intersection isect = kernel_split_state.isect[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- shader_setup_from_ray(kg, sd, &isect, &ray);
-
-#ifdef __VOLUME__
- if (sd->flag & SD_HAS_ONLY_VOLUME) {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME);
- }
-#endif
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
deleted file mode 100644
index 95d33a42014..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local_param ShaderSortLocals *locals)
-{
-#ifndef __KERNEL_CUDA__
- int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (tid == 0) {
- kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
- }
-
- uint offset = (tid / SHADER_SORT_LOCAL_SIZE) * SHADER_SORT_BLOCK_SIZE;
- if (offset >= qsize) {
- return;
- }
-
- int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
- uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
- ccl_local uint *local_value = &locals->local_value[0];
- ccl_local ushort *local_index = &locals->local_index[0];
-
- /* copy to local memory */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint add = input + idx;
- uint value = (~0);
- if (idx < qsize) {
- int ray_index = kernel_split_state.queue_data[add];
- bool valid = (ray_index != QUEUE_EMPTY_SLOT) &&
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- if (valid) {
- value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK;
- }
- }
- local_value[i + lid] = value;
- local_index[i + lid] = i + lid;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* skip sorting for cpu split kernel */
-# ifdef __KERNEL_OPENCL__
-
- /* bitonic sort */
- for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
- for (uint inc = length; inc > 0; inc >>= 1) {
- for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
- uint i = lid + ii;
- bool direction = ((i & (length << 1)) != 0);
- uint j = i ^ inc;
- ushort ioff = local_index[i];
- ushort joff = local_index[j];
- uint iKey = local_value[ioff];
- uint jKey = local_value[joff];
- bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
- bool swap = smaller ^ (j < i) ^ direction;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- local_index[i] = (swap) ? joff : ioff;
- local_index[j] = (swap) ? ioff : joff;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- }
- }
- }
-# endif /* __KERNEL_OPENCL__ */
-
- /* copy to destination */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint lidx = local_index[i + lid];
- uint outi = output + idx;
- uint ini = input + offset + lidx;
- uint value = local_value[lidx];
- if (idx < qsize) {
- kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT :
- kernel_split_state.queue_data[ini];
- }
- }
-#endif /* __KERNEL_CUDA__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
deleted file mode 100644
index 5d772fc597b..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for AO. */
-ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
-{
- unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < ao_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
-#ifdef __BRANCHED_PATH__
- }
- else {
- kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
- }
-#endif
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
deleted file mode 100644
index 5e46d300bca..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for direct visible light. */
-ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
-{
- unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < dl_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
-#ifdef __BRANCHED_PATH__
- /* TODO(mai): move this somewhere else? */
- if (thread_index == 0) {
- /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
- kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
- }
-#endif /* __BRANCHED_PATH__ */
-
- if (ray_index == QUEUE_EMPTY_SLOT)
- return;
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.light_ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- float3 throughput = kernel_split_state.throughput[ray_index];
-
- BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- bool is_lamp = kernel_split_state.is_lamp[ray_index];
-
-#if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
- bool use_branched = false;
- int all = 0;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- use_branched = true;
- all = 1;
- }
-# if defined(__BRANCHED_PATH__)
- else if (kernel_data.integrator.branched) {
- use_branched = true;
-
- if (IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- all = (kernel_data.integrator.sample_all_lights_indirect);
- }
- else {
- all = (kernel_data.integrator.sample_all_lights_direct);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
- if (use_branched) {
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
- }
- else
-#endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
- {
- /* trace shadow ray */
- float3 shadow;
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
deleted file mode 100644
index 5114f2b03e5..00000000000
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_H__
-#define __KERNEL_SPLIT_H__
-
-// clang-format off
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-
-#include "kernel/split/kernel_split_data.h"
-
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "kernel/kernels/opencl/kernel_opencl_image.h"
-#endif
-#ifdef __KERNEL_CUDA__
-# include "kernel/kernels/cuda/kernel_cuda_image.h"
-#endif
-#ifdef __KERNEL_CPU__
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-#endif
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_path.h"
-#ifdef __BRANCHED_PATH__
-# include "kernel/kernel_path_branched.h"
-#endif
-
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
-
-#ifdef __BRANCHED_PATH__
-# include "kernel/split/kernel_branched.h"
-#endif
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __BRANCHED_PATH__
-# ifdef __SUBSURFACE__
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
- if (ss_indirect->num_rays) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
- else
-# endif /* __SUBSURFACE__ */
- if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
- int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
-
- path_radiance_sum_indirect(L);
- path_radiance_accum_sample(orig_ray_L, L);
-
- atomic_fetch_and_dec_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
-#else
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
deleted file mode 100644
index decc537b39b..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_H__
-#define __KERNEL_SPLIT_DATA_H__
-
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "kernel/kernel_globals.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
-{
- (void)kg; /* Unused on CPU. */
-
- uint64_t size = 0;
-#define SPLIT_DATA_ENTRY(type, name, num) +align_up(num_elements *num * sizeof(type), 16)
- size = size SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- return size;
-}
-
-ccl_device_inline void split_data_init(KernelGlobals *kg,
- ccl_global SplitData *split_data,
- size_t num_elements,
- ccl_global void *data,
- ccl_global char *ray_state)
-{
- (void)kg; /* Unused on CPU. */
-
- ccl_global char *p = (ccl_global char *)data;
-
-#define SPLIT_DATA_ENTRY(type, name, num) \
- split_data->name = (type *)p; \
- p += align_up(num_elements * num * sizeof(type), 16);
- SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- split_data->_branched_state_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- split_data->_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- split_data->ray_state = ray_state;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
deleted file mode 100644
index 06bdce9947d..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
-#define __KERNEL_SPLIT_DATA_TYPES_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* parameters used by the split kernels, we use a single struct to avoid passing these to each
- * kernel */
-
-typedef struct SplitParams {
- WorkTile tile;
- uint total_work_size;
-
- ccl_global unsigned int *work_pools;
-
- ccl_global int *queue_index;
- int queue_size;
- ccl_global char *use_queues_flag;
-
- /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
- int dummy_sd_flag;
-} SplitParams;
-
-/* Global memory variables [porting]; These memory is used for
- * co-operation between different kernels; Data written by one
- * kernel will be available to another kernel via this global
- * memory.
- */
-
-/* SPLIT_DATA_ENTRY(type, name, num) */
-
-#ifdef __BRANCHED_PATH__
-
-typedef ccl_global struct SplitBranchedState {
- /* various state that must be kept and restored after an indirect loop */
- PathState path_state;
- float3 throughput;
- Ray ray;
-
- Intersection isect;
-
- char ray_state;
-
- /* indirect loop state */
- int next_closure;
- int next_sample;
-
-# ifdef __SUBSURFACE__
- int ss_next_closure;
- int ss_next_sample;
- int next_hit;
- int num_hits;
-
- uint lcg_state;
- LocalIntersection ss_isect;
-# endif /* __SUBSURFACE__ */
-
- int shared_sample_count; /* number of branched samples shared with other threads */
- int original_ray; /* index of original ray when sharing branched samples */
- bool waiting_on_shared_samples;
-} SplitBranchedState;
-
-# define SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1) \
- SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0)
-#else
-# define SPLIT_DATA_BRANCHED_ENTRIES
-#endif /* __BRANCHED_PATH__ */
-
-#ifdef __SUBSURFACE__
-# define SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
-#else
-# define SPLIT_DATA_SUBSURFACE_ENTRIES
-#endif /* __SUBSURFACE__ */
-
-#ifdef __VOLUME__
-# define SPLIT_DATA_VOLUME_ENTRIES SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
-#else
-# define SPLIT_DATA_VOLUME_ENTRIES
-#endif /* __VOLUME__ */
-
-#define SPLIT_DATA_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY( \
- ccl_global int, queue_data, (NUM_QUEUES * 2)) /* TODO(mai): this is too large? */ \
- SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* Entries to be copied to inactive rays when sharing branched samples
- * (TODO: which are actually needed?) */
-#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* struct that holds pointers to data in the shared state buffer */
-typedef struct SplitData {
-#define SPLIT_DATA_ENTRY(type, name, num) type *name;
- SPLIT_DATA_ENTRIES
-#undef SPLIT_DATA_ENTRY
-
- /* this is actually in a separate buffer from the rest of the split state data (so it can be read
- * back from the host easily) but is still used the same as the other data so we have it here in
- * this struct as well
- */
- ccl_global char *ray_state;
-} SplitData;
-
-#ifndef __KERNEL_CUDA__
-# define kernel_split_state (kg->split_data)
-# define kernel_split_params (kg->split_param_data)
-#else
-__device__ SplitData __split_data;
-# define kernel_split_state (__split_data)
-__device__ SplitParams __split_param_data;
-# define kernel_split_params (__split_param_data)
-#endif /* __KERNEL_CUDA__ */
-
-#define kernel_split_sd(sd, ray_index) \
- ((ShaderData *)(((ccl_global char *)kernel_split_state._##sd) + \
- (sizeof(ShaderData) + \
- sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1)) * \
- (ray_index)))
-
-/* Local storage for queue_enqueue kernel. */
-typedef struct QueueEnqueueLocals {
- uint queue_atomics[2];
-} QueueEnqueueLocals;
-
-/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
-typedef struct BackgroundAOLocals {
- uint queue_atomics_bg;
- uint queue_atomics_ao;
-} BackgroundAOLocals;
-
-typedef struct ShaderSortLocals {
- uint local_value[SHADER_SORT_BLOCK_SIZE];
- ushort local_index[SHADER_SORT_BLOCK_SIZE];
-} ShaderSortLocals;
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
deleted file mode 100644
index ba06ae3bc53..00000000000
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
-
-ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- branched_state->ss_next_closure = 0;
- branched_state->ss_next_sample = 0;
-
- branched_state->num_hits = 0;
- branched_state->next_hit = 0;
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(
- KernelGlobals *kg, int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- for (int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* set up random number generator */
- if (branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
- branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
- 0x68bc21eb);
- }
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for (int j = branched_state->ss_next_sample; j < num_samples; j++) {
- ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
- *hit_state = branched_state->path_state;
- hit_state->rng_hash = bssrdf_rng_hash;
- path_state_branch(hit_state, j, num_samples);
-
- ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect;
- float bssrdf_u, bssrdf_v;
- path_branched_rng_2D(
- kg, bssrdf_rng_hash, hit_state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- /* intersection is expensive so avoid doing multiple times for the same input */
- if (branched_state->next_hit == 0 && branched_state->next_closure == 0 &&
- branched_state->next_sample == 0) {
- uint lcg_state = branched_state->lcg_state;
- LocalIntersection ss_isect_private;
-
- branched_state->num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect_private, sd, hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- branched_state->lcg_state = lcg_state;
- *ss_isect = ss_isect_private;
- }
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = branched_state->ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
- ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index);
- *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
- * important as the indirect path will write into bssrdf_sd */
-
- LocalIntersection ss_isect_private = *ss_isect;
- subsurface_scatter_multi_setup(
- kg, &ss_isect_private, hit, bssrdf_sd, hit_state, bssrdf_type, bssrdf_roughness);
- *ss_isect = ss_isect_private;
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- if (branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg,
- bssrdf_sd,
- emission_sd,
- hit_state,
- branched_state->throughput,
- num_samples_inv,
- L,
- all);
- }
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, num_samples_inv, bssrdf_sd, false, false)) {
- branched_state->ss_next_closure = i;
- branched_state->ss_next_sample = j;
- branched_state->next_hit = hit;
-
- return true;
- }
-
- branched_state->next_closure = 0;
- }
-
- branched_state->next_hit = 0;
- }
-
- branched_state->ss_next_sample = 0;
- }
-
- branched_state->ss_next_closure = sd->num_closure;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
-
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- if (sd->flag & SD_BSSRDF) {
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, throughput, ss_indirect)) {
- kernel_split_path_end(kg, ray_index);
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif
- }
- }
-
-# ifdef __BRANCHED_PATH__
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
- }
-
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 000da1fa615..4aee1ef11b3 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -48,16 +48,18 @@ ccl_device_inline float3 stack_load_float3(float *stack, uint a)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- return make_float3(stack[a + 0], stack[a + 1], stack[a + 2]);
+ float *stack_a = stack + a;
+ return make_float3(stack_a[0], stack_a[1], stack_a[2]);
}
ccl_device_inline void stack_store_float3(float *stack, uint a, float3 f)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- stack[a + 0] = f.x;
- stack[a + 1] = f.y;
- stack[a + 2] = f.z;
+ float *stack_a = stack + a;
+ stack_a[0] = f.x;
+ stack_a[1] = f.y;
+ stack_a[2] = f.z;
}
ccl_device_inline float stack_load_float(float *stack, uint a)
@@ -105,14 +107,14 @@ ccl_device_inline bool stack_valid(uint a)
/* Reading Nodes */
-ccl_device_inline uint4 read_node(KernelGlobals *kg, int *offset)
+ccl_device_inline uint4 read_node(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
(*offset)++;
return node;
}
-ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
+ccl_device_inline float4 read_node_float(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
float4 f = make_float4(__uint_as_float(node.x),
@@ -123,7 +125,7 @@ ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
return f;
}
-ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset)
+ccl_device_inline float4 fetch_node_float(const KernelGlobals *kg, int offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, offset);
return make_float4(__uint_as_float(node.x),
@@ -217,26 +219,11 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
/* Main Interpreter Loop */
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void svm_eval_nodes(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
-{
- optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag);
-}
-extern "C" __device__ void __direct_callable__svm_eval_nodes(
-#else
-ccl_device_noinline void svm_eval_nodes(
-#endif
- KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
+template<uint node_feature_mask, ShaderType type>
+ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ ccl_global float *render_buffer,
+ int path_flag)
{
float stack[SVM_STACK_SIZE];
int offset = sd->shader & SHADER_MASK;
@@ -247,7 +234,6 @@ ccl_device_noinline void svm_eval_nodes(
switch (node.x) {
case NODE_END:
return;
-#if NODES_GROUP(NODE_GROUP_LEVEL_0)
case NODE_SHADER_JUMP: {
if (type == SHADER_TYPE_SURFACE)
offset = node.y;
@@ -260,13 +246,18 @@ ccl_device_noinline void svm_eval_nodes(
break;
}
case NODE_CLOSURE_BSDF:
- svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset);
+ offset = svm_node_closure_bsdf<node_feature_mask, type>(
+ kg, sd, stack, node, path_flag, offset);
break;
case NODE_CLOSURE_EMISSION:
- svm_node_closure_emission(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_emission(sd, stack, node);
+ }
break;
case NODE_CLOSURE_BACKGROUND:
- svm_node_closure_background(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_background(sd, stack, node);
+ }
break;
case NODE_CLOSURE_SET_WEIGHT:
svm_node_closure_set_weight(sd, node.y, node.z, node.w);
@@ -275,7 +266,9 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_closure_weight(sd, stack, node.y);
break;
case NODE_EMISSION_WEIGHT:
- svm_node_emission_weight(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_emission_weight(kg, sd, stack, node);
+ }
break;
case NODE_MIX_CLOSURE:
svm_node_mix_closure(sd, stack, node);
@@ -295,86 +288,108 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_TEX_COORD:
- svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
+ offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
break;
case NODE_VALUE_F:
svm_node_value_f(kg, sd, stack, node.y, node.z);
break;
case NODE_VALUE_V:
- svm_node_value_v(kg, sd, stack, node.y, &offset);
+ offset = svm_node_value_v(kg, sd, stack, node.y, offset);
break;
case NODE_ATTR:
- svm_node_attr(kg, sd, stack, node);
+ svm_node_attr<node_feature_mask>(kg, sd, stack, node);
break;
case NODE_VERTEX_COLOR:
svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_GEOMETRY_BUMP_DX:
- svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_GEOMETRY_BUMP_DY:
- svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_SET_DISPLACEMENT:
- svm_node_set_displacement(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_displacement(kg, sd, stack, node.y);
+ }
break;
case NODE_DISPLACEMENT:
- svm_node_displacement(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_displacement(kg, sd, stack, node);
+ }
break;
case NODE_VECTOR_DISPLACEMENT:
- svm_node_vector_displacement(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_vector_displacement(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_TEX_IMAGE:
- svm_node_tex_image(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_image(kg, sd, stack, node, offset);
break;
case NODE_TEX_IMAGE_BOX:
svm_node_tex_image_box(kg, sd, stack, node);
break;
case NODE_TEX_NOISE:
- svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_SET_BUMP:
- svm_node_set_bump(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_bump(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DX:
- svm_node_attr_bump_dx(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dx(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DY:
- svm_node_attr_bump_dy(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dy(kg, sd, stack, node);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DX:
- svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DY:
- svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_TEX_COORD_BUMP_DX:
- svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_TEX_COORD_BUMP_DY:
- svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_CLOSURE_SET_NORMAL:
- svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ }
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
case NODE_ENTER_BUMP_EVAL:
- svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ }
break;
case NODE_LEAVE_BUMP_EVAL:
- svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_HSV:
- svm_node_hsv(kg, sd, stack, node, &offset);
+ svm_node_hsv(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_1)
case NODE_CLOSURE_HOLDOUT:
svm_node_closure_holdout(sd, stack, node);
break;
@@ -384,22 +399,24 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_LAYER_WEIGHT:
svm_node_layer_weight(sd, stack, node);
break;
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_CLOSURE_VOLUME:
- svm_node_closure_volume(kg, sd, stack, node, type);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ svm_node_closure_volume<type>(kg, sd, stack, node);
+ }
break;
case NODE_PRINCIPLED_VOLUME:
- svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_MATH:
- svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_math(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_VECTOR_MATH:
- svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_RGB_RAMP:
- svm_node_rgb_ramp(kg, sd, stack, node, &offset);
+ offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
break;
case NODE_GAMMA:
svm_node_gamma(sd, stack, node.y, node.z, node.w);
@@ -408,7 +425,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_brightness(sd, stack, node.y, node.z, node.w);
break;
case NODE_LIGHT_PATH:
- svm_node_light_path(sd, state, stack, node.y, node.z, path_flag);
+ svm_node_light_path(INTEGRATOR_STATE_PASS, sd, stack, node.y, node.z, path_flag);
break;
case NODE_OBJECT_INFO:
svm_node_object_info(kg, sd, stack, node.y, node.z);
@@ -416,22 +433,22 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_PARTICLE_INFO:
svm_node_particle_info(kg, sd, stack, node.y, node.z);
break;
-# if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR)
+#if defined(__HAIR__)
case NODE_HAIR_INFO:
- svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(HAIR)) {
+ svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_2)
case NODE_TEXTURE_MAPPING:
- svm_node_texture_mapping(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_MAPPING:
- svm_node_mapping(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_MIN_MAX:
- svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_CAMERA:
svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
@@ -440,47 +457,46 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_tex_environment(kg, sd, stack, node);
break;
case NODE_TEX_SKY:
- svm_node_tex_sky(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_sky(kg, sd, stack, node, offset);
break;
case NODE_TEX_GRADIENT:
svm_node_tex_gradient(sd, stack, node);
break;
case NODE_TEX_VORONOI:
- svm_node_tex_voronoi(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_voronoi<node_feature_mask>(
+ kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_MUSGRAVE:
- svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_WAVE:
- svm_node_tex_wave(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_wave(kg, sd, stack, node, offset);
break;
case NODE_TEX_MAGIC:
- svm_node_tex_magic(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_magic(kg, sd, stack, node, offset);
break;
case NODE_TEX_CHECKER:
svm_node_tex_checker(kg, sd, stack, node);
break;
case NODE_TEX_BRICK:
- svm_node_tex_brick(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_brick(kg, sd, stack, node, offset);
break;
case NODE_TEX_WHITE_NOISE:
- svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_NORMAL:
- svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_LIGHT_FALLOFF:
svm_node_light_falloff(sd, stack, node);
break;
case NODE_IES:
- svm_node_ies(kg, sd, stack, node, &offset);
+ svm_node_ies(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_3)
case NODE_RGB_CURVES:
case NODE_VECTOR_CURVES:
- svm_node_curves(kg, sd, stack, node, &offset);
+ offset = svm_node_curves(kg, sd, stack, node, offset);
break;
case NODE_TANGENT:
svm_node_tangent(kg, sd, stack, node);
@@ -492,7 +508,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_invert(sd, stack, node.y, node.z, node.w);
break;
case NODE_MIX:
- svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_SEPARATE_VECTOR:
svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
@@ -501,10 +517,10 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
break;
case NODE_SEPARATE_HSV:
- svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_COMBINE_HSV:
- svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_VECTOR_ROTATE:
svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
@@ -522,39 +538,36 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_blackbody(kg, sd, stack, node.y, node.z);
break;
case NODE_MAP_RANGE:
- svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_CLAMP:
- svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# ifdef __SHADER_RAYTRACE__
+#ifdef __SHADER_RAYTRACE__
case NODE_BEVEL:
- svm_node_bevel(kg, sd, state, stack, node);
+ svm_node_bevel<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
case NODE_AMBIENT_OCCLUSION:
- svm_node_ao(kg, sd, state, stack, node);
+ svm_node_ao<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
-# endif /* __SHADER_RAYTRACE__ */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_4)
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_TEX_VOXEL:
- svm_node_tex_voxel(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_AOV_START:
- if (!svm_node_aov_check(state, buffer)) {
+ if (!svm_node_aov_check(path_flag, render_buffer)) {
return;
}
break;
case NODE_AOV_COLOR:
- svm_node_aov_color(kg, sd, stack, node, buffer);
+ svm_node_aov_color(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
case NODE_AOV_VALUE:
- svm_node_aov_value(kg, sd, stack, node, buffer);
+ svm_node_aov_value(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */
default:
kernel_assert(!"Unknown node type was passed to the SVM machine");
return;
diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h
index 4cb986b897a..34ac2cb8fbf 100644
--- a/intern/cycles/kernel/svm/svm_ao.h
+++ b/intern/cycles/kernel/svm/svm_ao.h
@@ -14,20 +14,25 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
-ccl_device_noinline float svm_ao(KernelGlobals *kg,
- ShaderData *sd,
- float3 N,
- ccl_addr_space PathState *state,
- float max_dist,
- int num_samples,
- int flags)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float __direct_callable__svm_node_ao(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float svm_ao(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float3 N,
+ float max_dist,
+ int num_samples,
+ int flags)
{
if (flags & NODE_AO_GLOBAL_RADIUS) {
- max_dist = kernel_data.background.ao_distance;
+ max_dist = kernel_data.integrator.ao_bounces_distance;
}
/* Early out if no sampling needed. */
@@ -47,11 +52,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
float3 T, B;
make_orthonormals(N, &T, &B);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
int unoccluded = 0;
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
float2 d = concentric_sample_disk(disk_u, disk_v);
float3 D = make_float3(d.x, d.y, safe_sqrtf(1.0f - dot(d, d)));
@@ -62,8 +70,8 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
ray.D = D.x * T + D.y * B + D.z * N;
ray.t = max_dist;
ray.time = sd->time;
- ray.dP = sd->dP;
- ray.dD = differential3_zero();
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
if (flags & NODE_AO_ONLY_LOCAL) {
if (!scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0)) {
@@ -81,8 +89,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
return ((float)unoccluded) / num_samples;
}
-ccl_device void svm_node_ao(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint flags, dist_offset, normal_offset, out_ao_offset;
svm_unpack_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset);
@@ -92,7 +106,16 @@ ccl_device void svm_node_ao(
float dist = stack_load_float_default(stack, dist_offset, node.w);
float3 normal = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N;
- float ao = svm_ao(kg, sd, normal, state, dist, samples, flags);
+
+ float ao = 1.0f;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ ao = optixDirectCall<float>(0, INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# else
+ ao = svm_ao(INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# endif
+ }
if (stack_valid(out_ao_offset)) {
stack_store_float(stack, out_ao_offset, ao);
diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h
index 899e466d099..26dec9717b3 100644
--- a/intern/cycles/kernel/svm/svm_aov.h
+++ b/intern/cycles/kernel/svm/svm_aov.h
@@ -14,36 +14,50 @@
* limitations under the License.
*/
+#include "kernel/kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool svm_node_aov_check(ccl_addr_space PathState *state,
- ccl_global float *buffer)
+ccl_device_inline bool svm_node_aov_check(const int path_flag, ccl_global float *render_buffer)
{
- int path_flag = state->flag;
-
bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE));
- return ((buffer != NULL) && is_primary);
+ return ((render_buffer != NULL) && is_primary);
}
-ccl_device void svm_node_aov_color(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_color(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float3 val = stack_load_float3(stack, node.y);
- if (buffer) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_aov_color + 4 * node.z,
- make_float4(val.x, val.y, val.z, 1.0f));
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_color + node.z);
+ kernel_write_pass_float3(buffer, make_float3(val.x, val.y, val.z));
}
}
-ccl_device void svm_node_aov_value(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_value(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float val = stack_load_float(stack, node.y);
- if (buffer) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_aov_value + node.z, val);
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_value + node.z);
+ kernel_write_pass_float(buffer, val);
}
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 62740824ad1..5f94b20af73 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -18,8 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Attribute Node */
-ccl_device AttributeDescriptor svm_node_attr_init(
- KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeOutputType *type, uint *out_offset)
+ccl_device AttributeDescriptor svm_node_attr_init(const KernelGlobals *kg,
+ ShaderData *sd,
+ uint4 node,
+ NodeAttributeOutputType *type,
+ uint *out_offset)
{
*out_offset = node.z;
*type = (NodeAttributeOutputType)node.w;
@@ -44,31 +47,37 @@ ccl_device AttributeDescriptor svm_node_attr_init(
return desc;
}
-ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+template<uint node_feature_mask>
+ccl_device_noinline void svm_node_attr(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
#ifdef __VOLUME__
- /* Volumes
- * NOTE: moving this into its own node type might help improve performance. */
- if (primitive_is_volume_attribute(sd, desc)) {
- const float4 value = volume_attribute_float4(kg, sd, desc);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ /* Volumes
+ * NOTE: moving this into its own node type might help improve performance. */
+ if (primitive_is_volume_attribute(sd, desc)) {
+ const float4 value = volume_attribute_float4(kg, sd, desc);
- if (type == NODE_ATTR_OUTPUT_FLOAT) {
- const float f = volume_attribute_value_to_float(value);
- stack_store_float(stack, out_offset, f);
- }
- else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
- const float3 f = volume_attribute_value_to_float3(value);
- stack_store_float3(stack, out_offset, f);
+ if (type == NODE_ATTR_OUTPUT_FLOAT) {
+ const float f = volume_attribute_value_to_float(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
+ const float3 f = volume_attribute_value_to_float3(value);
+ stack_store_float3(stack, out_offset, f);
+ }
+ else {
+ const float f = volume_attribute_value_to_alpha(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ return;
}
- else {
- const float f = volume_attribute_value_to_alpha(value);
- stack_store_float(stack, out_offset, f);
- }
- return;
}
#endif
@@ -139,7 +148,10 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
}
}
-ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
@@ -232,7 +244,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *
}
}
-ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index bf5957ec9e4..aab089d19ea 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -14,21 +14,95 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
+/* Planar Cubic BSSRDF falloff, reused for bevel.
+ *
+ * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
+ * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
+ * far as I can tell has no closed form solution. So we get an iterative solution
+ * instead with Newton-Raphson. */
+
+ccl_device float svm_bevel_cubic_eval(const float radius, float r)
+{
+ const float Rm = radius;
+
+ if (r >= Rm)
+ return 0.0f;
+
+ /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */
+ const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
+ const float f = Rm - r;
+ const float num = f * f * f;
+
+ return (10.0f * num) / (Rm5 * M_PI_F);
+}
+
+ccl_device float svm_bevel_cubic_pdf(const float radius, float r)
+{
+ return svm_bevel_cubic_eval(radius, r);
+}
+
+/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
+ccl_device_forceinline float svm_bevel_cubic_quintic_root_find(float xi)
+{
+ /* Newton-Raphson iteration, usually succeeds in 2-4 iterations, except
+ * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
+ * should not be too bad */
+ const float tolerance = 1e-6f;
+ const int max_iteration_count = 10;
+ float x = 0.25f;
+ int i;
+
+ for (i = 0; i < max_iteration_count; i++) {
+ float x2 = x * x;
+ float x3 = x2 * x;
+ float nx = (1.0f - x);
+
+ float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
+ float f_ = 20.0f * (x * nx) * (nx * nx);
+
+ if (fabsf(f) < tolerance || f_ == 0.0f)
+ break;
+
+ x = saturate(x - f / f_);
+ }
+
+ return x;
+}
+
+ccl_device void svm_bevel_cubic_sample(const float radius, float xi, float *r, float *h)
+{
+ float Rm = radius;
+ float r_ = svm_bevel_cubic_quintic_root_find(xi);
+
+ r_ *= Rm;
+ *r = r_;
+
+ /* h^2 + r^2 = Rm^2 */
+ *h = safe_sqrtf(Rm * Rm - r_ * r_);
+}
+
/* Bevel shader averaging normals from nearby surfaces.
*
* Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013
* http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
*/
-ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float radius,
- int num_samples)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float3 __direct_callable__svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float3 svm_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float radius,
+ int num_samples)
{
/* Early out if no sampling needed. */
if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) {
@@ -41,21 +115,27 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
}
/* Don't bevel for blurry indirect rays. */
- if (state->min_ray_pdf < 8.0f) {
+ if (INTEGRATOR_STATE(path, min_ray_pdf) < 8.0f) {
return sd->N;
}
/* Setup for multi intersection. */
LocalIntersection isect;
- uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e);
+ uint lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0x64c6a40e);
/* Sample normals from surrounding points on surface. */
float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
/* Pick random axis in local frame and point on disk. */
float3 disk_N, disk_T, disk_B;
@@ -97,7 +177,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float disk_height;
/* Perhaps find something better than Cubic BSSRDF, but happens to work well. */
- bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height);
+ svm_bevel_cubic_sample(radius, disk_r, &disk_r, &disk_height);
float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
@@ -106,8 +186,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
ray->P = sd->P + disk_N * disk_height + disk_P;
ray->D = -disk_N;
ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
+ ray->dP = differential_zero_compact();
+ ray->dD = differential_zero_compact();
ray->time = sd->time;
/* Intersect with the same object. if multiple intersections are found it
@@ -120,14 +200,16 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
/* Quickly retrieve P and Ng without setting up ShaderData. */
float3 hit_P;
if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &isect.hits[hit], ray);
+ hit_P = triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim);
}
# ifdef __OBJECT_MOTION__
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
float3 verts[3];
motion_triangle_vertices(
kg, sd->object, kernel_tex_fetch(__prim_index, isect.hits[hit].prim), sd->time, verts);
- hit_P = motion_triangle_refine_local(kg, sd, &isect.hits[hit], ray, verts);
+ hit_P = motion_triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim, verts);
}
# endif /* __OBJECT_MOTION__ */
@@ -183,8 +265,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float r = len(hit_P - sd->P);
/* Compute weight. */
- float pdf = bssrdf_cubic_pdf(radius, 0.0f, r);
- float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r);
+ float pdf = svm_bevel_cubic_pdf(radius, r);
+ float disk_pdf = svm_bevel_cubic_pdf(radius, disk_r);
w *= pdf / disk_pdf;
@@ -198,19 +280,34 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N;
}
-ccl_device void svm_node_bevel(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint num_samples, radius_offset, normal_offset, out_offset;
svm_unpack_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset);
float radius = stack_load_float(stack, radius_offset);
- float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples);
- if (stack_valid(normal_offset)) {
- /* Preserve input normal. */
- float3 ref_N = stack_load_float3(stack, normal_offset);
- bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ float3 bevel_N = sd->N;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ bevel_N = optixDirectCall<float3>(1, INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# else
+ bevel_N = svm_bevel(INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# endif
+
+ if (stack_valid(normal_offset)) {
+ /* Preserve input normal. */
+ float3 ref_N = stack_load_float3(stack, normal_offset);
+ bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ }
}
stack_store_float3(stack, out_offset, bevel_N);
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index adfc50d961e..96b3703b954 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -34,8 +34,11 @@ CCL_NAMESPACE_BEGIN
/* Blackbody Node */
-ccl_device void svm_node_blackbody(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
+ccl_device_noinline void svm_node_blackbody(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint temperature_offset,
+ uint col_offset)
{
/* Input */
float temperature = stack_load_float(stack, temperature_offset);
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 6984afa30a5..dca1b220dd5 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -72,12 +72,12 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p,
return make_float2(tint, mortar);
}
-ccl_device void svm_node_tex_brick(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_brick(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
- uint4 node4 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
+ uint4 node4 = read_node(kg, &offset);
/* Input and Output Sockets */
uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset;
@@ -133,6 +133,7 @@ ccl_device void svm_node_tex_brick(
stack_store_float3(stack, color_offset, color1 * (1.0f - f) + mortar * f);
if (stack_valid(fac_offset))
stack_store_float(stack, fac_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 9554b5946fb..2ed812acd71 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_brightness(
+ccl_device_noinline void svm_node_brightness(
ShaderData *sd, float *stack, uint in_color, uint out_color, uint node)
{
uint bright_offset, contrast_offset;
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index c9d430a2bba..8672839dbab 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Bump Eval Nodes */
-ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_enter_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* save state */
stack_store_float3(stack, offset + 0, sd->P);
@@ -45,10 +45,10 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
}
}
-ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_leave_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* restore state */
sd->P = stack_load_float3(stack, offset + 0);
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 21a17acf5f1..40c0edcdad0 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_camera(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint out_vector,
- uint out_zdepth,
- uint out_distance)
+ccl_device_noinline void svm_node_camera(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint out_vector,
+ uint out_zdepth,
+ uint out_distance)
{
float distance;
float zdepth;
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index d54cb73df91..a9919c9ddc9 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -32,7 +32,10 @@ ccl_device float svm_checker(float3 p)
return ((xi % 2 == yi % 2) == (zi % 2)) ? 1.0f : 0.0f;
}
-ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_checker(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint co_offset, color1_offset, color2_offset, scale_offset;
uint color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_clamp.h b/intern/cycles/kernel/svm/svm_clamp.h
index a85fd82754e..656bd31c085 100644
--- a/intern/cycles/kernel/svm/svm_clamp.h
+++ b/intern/cycles/kernel/svm/svm_clamp.h
@@ -18,18 +18,18 @@ CCL_NAMESPACE_BEGIN
/* Clamp Node */
-ccl_device void svm_node_clamp(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline int svm_node_clamp(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint result_stack_offset,
+ int offset)
{
uint min_stack_offset, max_stack_offset, type;
svm_unpack_node_uchar3(parameters_stack_offsets, &min_stack_offset, &max_stack_offset, &type);
- uint4 defaults = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float min = stack_load_float_default(stack, min_stack_offset, defaults.x);
@@ -41,6 +41,7 @@ ccl_device void svm_node_clamp(KernelGlobals *kg,
else {
stack_store_float(stack, result_stack_offset, clamp(value, min, max));
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index bbe8d72edf0..e2f6dde4ace 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -57,13 +57,9 @@ ccl_device void svm_node_glass_setup(
}
}
-ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<uint node_feature_mask, ShaderType shader_type>
+ccl_device_noinline int svm_node_closure_bsdf(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
uint type, param1_offset, param2_offset;
@@ -73,19 +69,19 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
1.0f);
/* note we read this extra node before weight check, so offset is added */
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
/* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */
- if (mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) {
+ if ((!KERNEL_NODES_FEATURE(BSDF) || shader_type != SHADER_TYPE_SURFACE) || mix_weight == 0.0f) {
if (type == CLOSURE_BSDF_PRINCIPLED_ID) {
/* Read all principled BSDF extra data to get the right offset. */
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
}
- return;
+ return offset;
}
float3 N = stack_valid(data_node.x) ? stack_load_float3(stack, data_node.x) : sd->N;
@@ -102,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset,
eta_offset, transmission_offset, anisotropic_rotation_offset,
transmission_roughness_offset;
- uint4 data_node2 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
float3 T = stack_load_float3(stack, data_node.y);
svm_unpack_node_uchar4(data_node.z,
@@ -158,7 +154,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
float specular_weight = (1.0f - final_transmission);
// get the base color
- uint4 data_base_color = read_node(kg, offset);
+ uint4 data_base_color = read_node(kg, &offset);
float3 base_color = stack_valid(data_base_color.x) ?
stack_load_float3(stack, data_base_color.x) :
make_float3(__uint_as_float(data_base_color.y),
@@ -166,16 +162,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
__uint_as_float(data_base_color.w));
// get the additional clearcoat normal and subsurface scattering radius
- uint4 data_cn_ssr = read_node(kg, offset);
+ uint4 data_cn_ssr = read_node(kg, &offset);
float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ?
stack_load_float3(stack, data_cn_ssr.x) :
sd->N;
float3 subsurface_radius = stack_valid(data_cn_ssr.y) ?
stack_load_float3(stack, data_cn_ssr.y) :
make_float3(1.0f, 1.0f, 1.0f);
+ float subsurface_ior = stack_valid(data_cn_ssr.z) ? stack_load_float(stack, data_cn_ssr.z) :
+ 1.4f;
+ float subsurface_anisotropy = stack_valid(data_cn_ssr.w) ?
+ stack_load_float(stack, data_cn_ssr.w) :
+ 0.0f;
// get the subsurface color
- uint4 data_subsurface_color = read_node(kg, offset);
+ uint4 data_subsurface_color = read_node(kg, &offset);
float3 subsurface_color = stack_valid(data_subsurface_color.x) ?
stack_load_float3(stack, data_subsurface_color.x) :
make_float3(__uint_as_float(data_subsurface_color.y),
@@ -222,16 +223,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
if (bssrdf) {
bssrdf->radius = subsurface_radius * subsurface;
- bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID) ?
- subsurface_color :
- mixed_ss_base_color;
- bssrdf->texture_blur = 0.0f;
- bssrdf->sharpness = 0.0f;
+ bssrdf->albedo = mixed_ss_base_color;
bssrdf->N = N;
bssrdf->roughness = roughness;
+ /* Clamps protecting against bad/extreme and non physical values. */
+ subsurface_ior = clamp(subsurface_ior, 1.01f, 3.8f);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
/* setup bsdf */
- sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method);
+ sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method, subsurface_ior);
}
}
}
@@ -733,9 +734,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
}
#ifdef __HAIR__
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID: {
- uint4 data_node2 = read_node(kg, offset);
- uint4 data_node3 = read_node(kg, offset);
- uint4 data_node4 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
+ uint4 data_node3 = read_node(kg, &offset);
+ uint4 data_node4 = read_node(kg, &offset);
float3 weight = sd->svm_closure_weight * mix_weight;
@@ -878,10 +879,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
#endif /* __HAIR__ */
#ifdef __SUBSURFACE__
- case CLOSURE_BSSRDF_CUBIC_ID:
- case CLOSURE_BSSRDF_GAUSSIAN_ID:
- case CLOSURE_BSSRDF_BURLEY_ID:
- case CLOSURE_BSSRDF_RANDOM_WALK_ID: {
+ case CLOSURE_BSSRDF_RANDOM_WALK_ID:
+ case CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
@@ -894,11 +893,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
bssrdf->radius = stack_load_float3(stack, data_node.z) * param1;
bssrdf->albedo = sd->svm_closure_weight;
- bssrdf->texture_blur = param2;
- bssrdf->sharpness = stack_load_float(stack, data_node.w);
bssrdf->N = N;
- bssrdf->roughness = 0.0f;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->roughness = FLT_MAX;
+
+ const float subsurface_ior = clamp(param2, 1.01f, 3.8f);
+ const float subsurface_anisotropy = stack_load_float(stack, data_node.w);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, subsurface_ior);
}
break;
@@ -907,10 +909,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
default:
break;
}
+
+ return offset;
}
-ccl_device void svm_node_closure_volume(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type)
+template<ShaderType shader_type>
+ccl_device_noinline void svm_node_closure_volume(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __VOLUME__
/* Only sum extinction for volumes, variable is shared with surface transparency. */
@@ -961,21 +968,17 @@ ccl_device void svm_node_closure_volume(
#endif
}
-ccl_device void svm_node_principled_volume(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<ShaderType shader_type>
+ccl_device_noinline int svm_node_principled_volume(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
#ifdef __VOLUME__
- uint4 value_node = read_node(kg, offset);
- uint4 attr_node = read_node(kg, offset);
+ uint4 value_node = read_node(kg, &offset);
+ uint4 attr_node = read_node(kg, &offset);
/* Only sum extinction for volumes, variable is shared with surface transparency. */
if (shader_type != SHADER_TYPE_VOLUME) {
- return;
+ return offset;
}
uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset;
@@ -985,7 +988,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
1.0f);
if (mix_weight == 0.0f) {
- return;
+ return offset;
}
/* Compute density. */
@@ -1034,7 +1037,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
/* Compute emission. */
if (path_flag & PATH_RAY_SHADOW) {
/* Don't need emission for shadows. */
- return;
+ return offset;
}
uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset;
@@ -1074,9 +1077,10 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
}
}
#endif
+ return offset;
}
-ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1093,7 +1097,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
emission_setup(sd, weight);
}
-ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1110,7 +1114,7 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
background_setup(sd, weight);
}
-ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
@@ -1145,14 +1149,13 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint
ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
{
float3 weight = stack_load_float3(stack, weight_offset);
-
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_emission_weight(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_emission_weight(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset = node.y;
uint strength_offset = node.z;
@@ -1163,7 +1166,7 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg,
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
{
/* fetch weight from blend input, previous mix closures,
* and write to stack to be used by closure nodes later */
@@ -1186,7 +1189,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
/* (Bump) normal */
ccl_device void svm_node_set_normal(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
{
float3 normal = stack_load_float3(stack, in_direction);
sd->N = normal;
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 5df6c9fb755..37d40167ccc 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Conversion Nodes */
-ccl_device void svm_node_convert(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
+ccl_device_noinline void svm_node_convert(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
{
switch (type) {
case NODE_CONVERT_FI: {
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 250fac6bcb8..a1d952173d8 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -14,11 +14,16 @@
* limitations under the License.
*/
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Bump Node */
-ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_set_bump(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __RAY_DIFFERENTIALS__
/* get normal input */
@@ -83,7 +88,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
/* Displacement Node */
-ccl_device void svm_node_set_displacement(KernelGlobals *kg,
+ccl_device void svm_node_set_displacement(const KernelGlobals *kg,
ShaderData *sd,
float *stack,
uint fac_offset)
@@ -92,7 +97,10 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg,
sd->P += dP;
}
-ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_displacement(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint height_offset, midlevel_offset, scale_offset, normal_offset;
svm_unpack_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset);
@@ -119,10 +127,10 @@ ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *
stack_store_float3(stack, node.z, dP);
}
-ccl_device void svm_node_vector_displacement(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_vector_displacement(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
uint space = data_node.x;
uint vector_offset, midlevel_offset, scale_offset, displacement_offset;
@@ -164,6 +172,7 @@ ccl_device void svm_node_vector_displacement(
}
stack_store_float3(stack, displacement_offset, dP);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 96d602e35bf..b5ecdbe2abf 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Fresnel Node */
-ccl_device void svm_node_fresnel(
+ccl_device_noinline void svm_node_fresnel(
ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node)
{
uint normal_offset, out_offset;
@@ -37,7 +37,7 @@ ccl_device void svm_node_fresnel(
/* Layer Weight Node */
-ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
{
uint blend_offset = node.y;
uint blend_value = node.z;
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index 65eb08eb0eb..f6fafdee941 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_gamma(
+ccl_device_noinline void svm_node_gamma(
ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color)
{
float3 color = stack_load_float3(stack, in_color);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index e48e96dcfa4..10e9f291d0e 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Geometry Node */
-ccl_device_inline void svm_node_geometry(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float3 data;
@@ -51,8 +51,8 @@ ccl_device_inline void svm_node_geometry(
stack_store_float3(stack, out_offset, data);
}
-ccl_device void svm_node_geometry_bump_dx(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -75,8 +75,8 @@ ccl_device void svm_node_geometry_bump_dx(
#endif
}
-ccl_device void svm_node_geometry_bump_dy(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -101,8 +101,8 @@ ccl_device void svm_node_geometry_bump_dy(
/* Object Info */
-ccl_device void svm_node_object_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_object_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
@@ -140,8 +140,8 @@ ccl_device void svm_node_object_info(
/* Particle Info */
-ccl_device void svm_node_particle_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_particle_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
switch (type) {
case NODE_INFO_PAR_INDEX: {
@@ -199,8 +199,8 @@ ccl_device void svm_node_particle_info(
/* Hair Info */
-ccl_device void svm_node_hair_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_hair_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
float3 data3;
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index 08304bc47e8..cd15f7097e7 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -60,7 +60,7 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type)
return 0.0f;
}
-ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
{
uint type, co_offset, color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index c299cf58c7f..6f49a8385aa 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -19,8 +19,10 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_hsv(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_color_offset, fac_offset, out_color_offset;
uint hue_offset, sat_offset, val_offset;
diff --git a/intern/cycles/kernel/svm/svm_ies.h b/intern/cycles/kernel/svm/svm_ies.h
index 56c804b44d0..9c13734ecf0 100644
--- a/intern/cycles/kernel/svm/svm_ies.h
+++ b/intern/cycles/kernel/svm/svm_ies.h
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
/* IES Light */
ccl_device_inline float interpolate_ies_vertical(
- KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
+ const KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
{
/* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end
* of v (corresponding to the north pole) would result in artifacts. The proper way of dealing
@@ -39,7 +39,7 @@ ccl_device_inline float interpolate_ies_vertical(
return cubic_interp(a, b, c, d, v_frac);
}
-ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
+ccl_device_inline float kernel_ies_interp(const KernelGlobals *kg,
int slot,
float h_angle,
float v_angle)
@@ -98,8 +98,10 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
return max(cubic_interp(a, b, c, d, h_frac), 0.0f);
}
-ccl_device void svm_node_ies(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_ies(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint vector_offset, strength_offset, fac_offset, slot = node.z;
svm_unpack_node_uchar3(node.y, &strength_offset, &vector_offset, &fac_offset);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 9348ddabde5..a344f36977a 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags)
+ccl_device float4 svm_image_texture(const KernelGlobals *kg, int id, float x, float y, uint flags)
{
if (id == -1) {
return make_float4(
@@ -44,8 +44,8 @@ ccl_device_inline float3 texco_remap_square(float3 co)
return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
}
-ccl_device void svm_node_tex_image(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_image(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, out_offset, alpha_offset, flags;
@@ -71,7 +71,7 @@ ccl_device void svm_node_tex_image(
int num_nodes = (int)node.y;
if (num_nodes > 0) {
/* Remember the offset of the node following the tile nodes. */
- int next_offset = (*offset) + num_nodes;
+ int next_offset = offset + num_nodes;
/* Find the tile that the UV lies in. */
int tx = (int)tex_co.x;
@@ -83,7 +83,7 @@ ccl_device void svm_node_tex_image(
/* Find the index of the tile. */
for (int i = 0; i < num_nodes; i++) {
- uint4 tile_node = read_node(kg, offset);
+ uint4 tile_node = read_node(kg, &offset);
if (tile_node.x == tile) {
id = tile_node.y;
break;
@@ -102,7 +102,7 @@ ccl_device void svm_node_tex_image(
}
/* Skip over the remaining nodes. */
- *offset = next_offset;
+ offset = next_offset;
}
else {
id = -num_nodes;
@@ -114,9 +114,13 @@ ccl_device void svm_node_tex_image(
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, f.w);
+ return offset;
}
-ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_image_box(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
/* get object space normal */
float3 N = sd->N;
@@ -215,10 +219,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
stack_store_float(stack, alpha_offset, f.w);
}
-ccl_device void svm_node_tex_environment(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_tex_environment(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint id = node.y;
uint co_offset, out_offset, alpha_offset, flags;
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index 02024742b13..27cdaaff473 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -21,7 +21,7 @@ ccl_device float invert(float color, float factor)
return factor * (1.0f - color) + (1.0f - factor) * color;
}
-ccl_device void svm_node_invert(
+ccl_device_noinline void svm_node_invert(
ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color)
{
float factor = stack_load_float(stack, in_fac);
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 768c65918cd..49fabad1cc5 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Light Path Node */
-ccl_device void svm_node_light_path(ShaderData *sd,
- ccl_addr_space PathState *state,
- float *stack,
- uint type,
- uint out_offset,
- int path_flag)
+ccl_device_noinline void svm_node_light_path(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *sd,
+ float *stack,
+ uint type,
+ uint out_offset,
+ int path_flag)
{
float info = 0.0f;
@@ -58,21 +58,47 @@ ccl_device void svm_node_light_path(ShaderData *sd,
case NODE_LP_ray_length:
info = sd->ray_length;
break;
- case NODE_LP_ray_depth:
- info = (float)state->bounce;
+ case NODE_LP_ray_depth: {
+ /* Read bounce from a different location depending on whether this is a
+ * shadow path. It's a bit dubious to have integrator state details leak
+ * into this function but hard to avoid currently. */
+ int bounce = (INTEGRATOR_STATE_IS_NULL) ? 0 :
+ (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(shadow_path, bounce) :
+ INTEGRATOR_STATE(path, bounce);
+
+ /* For background, light emission and shadow evaluation from a
+ * surface or volume we are effectively one bounce further. */
+ if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+ bounce++;
+ }
+
+ info = (float)bounce;
break;
+ }
+ /* TODO */
+ case NODE_LP_ray_transparent: {
+ const int bounce = (INTEGRATOR_STATE_IS_NULL) ?
+ 0 :
+ (path_flag & PATH_RAY_SHADOW) ?
+ INTEGRATOR_STATE(shadow_path, transparent_bounce) :
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ info = (float)bounce;
+ break;
+ }
+#if 0
case NODE_LP_ray_diffuse:
info = (float)state->diffuse_bounce;
break;
case NODE_LP_ray_glossy:
info = (float)state->glossy_bounce;
break;
- case NODE_LP_ray_transparent:
- info = (float)state->transparent_bounce;
- break;
+#endif
+#if 0
case NODE_LP_ray_transmission:
info = (float)state->transmission_bounce;
break;
+#endif
}
stack_store_float(stack, out_offset, info);
@@ -80,7 +106,7 @@ ccl_device void svm_node_light_path(ShaderData *sd,
/* Light Falloff Node */
-ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
{
uint strength_offset, out_offset, smooth_offset;
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index 9c160e6d8cc..8784c760860 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -87,8 +87,8 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
return make_float3(0.5f - x, 0.5f - y, 0.5f - z);
}
-ccl_device void svm_node_tex_magic(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_magic(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint depth;
uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset;
@@ -96,7 +96,7 @@ ccl_device void svm_node_tex_magic(
svm_unpack_node_uchar3(node.y, &depth, &color_offset, &fac_offset);
svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset);
- uint4 node2 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_offset);
float scale = stack_load_float_default(stack, scale_offset, node2.x);
float distortion = stack_load_float_default(stack, distortion_offset, node2.y);
@@ -107,6 +107,7 @@ ccl_device void svm_node_tex_magic(
stack_store_float(stack, fac_offset, average(color));
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, color);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_map_range.h b/intern/cycles/kernel/svm/svm_map_range.h
index 533a631c837..c8684981e31 100644
--- a/intern/cycles/kernel/svm/svm_map_range.h
+++ b/intern/cycles/kernel/svm/svm_map_range.h
@@ -24,13 +24,13 @@ ccl_device_inline float smootherstep(float edge0, float edge1, float x)
return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f);
}
-ccl_device void svm_node_map_range(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint results_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_map_range(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint results_stack_offsets,
+ int offset)
{
uint from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset;
uint type_stack_offset, steps_stack_offset, result_stack_offset;
@@ -42,8 +42,8 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
svm_unpack_node_uchar3(
results_stack_offsets, &type_stack_offset, &steps_stack_offset, &result_stack_offset);
- uint4 defaults = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float from_min = stack_load_float_default(stack, from_min_stack_offset, defaults.x);
@@ -83,6 +83,7 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
result = 0.0f;
}
stack_store_float(stack, result_stack_offset, result);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index 6e19c859e19..fcc724405f5 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -18,13 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Mapping Node */
-ccl_device void svm_node_mapping(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset;
svm_unpack_node_uchar4(inputs_stack_offsets,
@@ -44,30 +43,40 @@ ccl_device void svm_node_mapping(KernelGlobals *kg,
/* Texture Mapping */
-ccl_device void svm_node_texture_mapping(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_texture_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
float3 r = transform_point(&tfm, v);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
-ccl_device void svm_node_min_max(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_min_max(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
- float3 mn = float4_to_float3(read_node_float(kg, offset));
- float3 mx = float4_to_float3(read_node_float(kg, offset));
+ float3 mn = float4_to_float3(read_node_float(kg, &offset));
+ float3 mx = float4_to_float3(read_node_float(kg, &offset));
float3 r = min(max(mn, v), mx);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 733ea28f9e5..99e7a8f2bda 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint a_stack_offset, b_stack_offset, c_stack_offset;
svm_unpack_node_uchar3(inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &c_stack_offset);
@@ -35,13 +34,13 @@ ccl_device void svm_node_math(KernelGlobals *kg,
stack_store_float(stack, result_stack_offset, result);
}
-ccl_device void svm_node_vector_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint outputs_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_vector_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint outputs_stack_offsets,
+ int offset)
{
uint value_stack_offset, vector_stack_offset;
uint a_stack_offset, b_stack_offset, param1_stack_offset;
@@ -60,7 +59,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
/* 3 Vector Operators */
if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD ||
type == NODE_VECTOR_MATH_MULTIPLY_ADD) {
- uint4 extra_node = read_node(kg, offset);
+ uint4 extra_node = read_node(kg, &offset);
c = stack_load_float3(stack, extra_node.x);
}
@@ -70,6 +69,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
stack_store_float(stack, value_stack_offset, value);
if (stack_valid(vector_stack_offset))
stack_store_float3(stack, vector_stack_offset, vector);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index 15114bfd5e4..3e38080977f 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -18,16 +18,16 @@ CCL_NAMESPACE_BEGIN
/* Node */
-ccl_device void svm_node_mix(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint fac_offset,
- uint c1_offset,
- uint c2_offset,
- int *offset)
+ccl_device_noinline int svm_node_mix(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint fac_offset,
+ uint c1_offset,
+ uint c2_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float fac = stack_load_float(stack, fac_offset);
float3 c1 = stack_load_float3(stack, c1_offset);
@@ -35,6 +35,7 @@ ccl_device void svm_node_mix(KernelGlobals *kg,
float3 result = svm_mix((NodeMix)node1.y, fac, c1, c2);
stack_store_float3(stack, node1.z, result);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 571f62fe27f..03a8b68b3ef 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -700,13 +700,13 @@ ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_4d(
return value;
}
-ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offsets1,
- uint offsets2,
- uint offsets3,
- int *offset)
+ccl_device_noinline int svm_node_tex_musgrave(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offsets1,
+ uint offsets2,
+ uint offsets3,
+ int offset)
{
uint type, dimensions, co_stack_offset, w_stack_offset;
uint scale_stack_offset, detail_stack_offset, dimension_stack_offset, lacunarity_stack_offset;
@@ -720,8 +720,8 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
&lacunarity_stack_offset);
svm_unpack_node_uchar3(offsets3, &offset_stack_offset, &gain_stack_offset, &fac_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -844,6 +844,7 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
}
stack_store_float(stack, fac_stack_offset, fac);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 94d8bfde555..ecb4df6afdf 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -330,7 +330,7 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_2d(float x, float y)
+ccl_device_noinline_cpu float perlin_2d(float x, float y)
{
ssei XY;
ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
@@ -447,7 +447,7 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
* v7 (1, 1, 1)
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -501,7 +501,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
@@ -585,7 +585,7 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -637,7 +637,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 61fd9553802..29b262ac06e 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -140,13 +140,13 @@ ccl_device void noise_texture_4d(float4 co,
}
}
-ccl_device void svm_node_tex_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint offsets1,
- uint offsets2,
- int *offset)
+ccl_device_noinline int svm_node_tex_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint offsets1,
+ uint offsets2,
+ int offset)
{
uint vector_stack_offset, w_stack_offset, scale_stack_offset;
uint detail_stack_offset, roughness_stack_offset, distortion_stack_offset;
@@ -160,8 +160,8 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
&value_stack_offset,
&color_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 vector = stack_load_float3(stack, vector_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -212,6 +212,7 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
if (stack_valid(color_stack_offset)) {
stack_store_float3(stack, color_stack_offset, color);
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index 4cd3eab0ed2..724b5f281f9 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -16,16 +16,16 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_normal(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint in_normal_offset,
- uint out_normal_offset,
- uint out_dot_offset,
- int *offset)
+ccl_device_noinline int svm_node_normal(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint in_normal_offset,
+ uint out_normal_offset,
+ uint out_dot_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 normal = stack_load_float3(stack, in_normal_offset);
float3 direction;
@@ -39,6 +39,7 @@ ccl_device void svm_node_normal(KernelGlobals *kg,
if (stack_valid(out_dot_offset))
stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal)));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 85ccf39144b..e92df3c093c 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -21,8 +21,12 @@ CCL_NAMESPACE_BEGIN
/* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
-ccl_device_inline float4 rgb_ramp_lookup(
- KernelGlobals *kg, int offset, float f, bool interpolate, bool extrapolate, int table_size)
+ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg,
+ int offset,
+ float f,
+ bool interpolate,
+ bool extrapolate,
+ int table_size)
{
if ((f < 0.0f || f > 1.0f) && extrapolate) {
float4 t0, dy;
@@ -53,34 +57,35 @@ ccl_device_inline float4 rgb_ramp_lookup(
return a;
}
-ccl_device void svm_node_rgb_ramp(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_rgb_ramp(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, alpha_offset;
uint interpolate = node.z;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &alpha_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
- float4 color = rgb_ramp_lookup(kg, *offset, fac, interpolate, false, table_size);
+ float4 color = rgb_ramp_lookup(kg, offset, fac, interpolate, false, table_size);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, float4_to_float3(color));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, color.w);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
-ccl_device void svm_node_curves(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_curves(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, out_offset;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &out_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
float3 color = stack_load_float3(stack, color_offset);
@@ -89,14 +94,15 @@ ccl_device void svm_node_curves(
const float range_x = max_x - min_x;
const float3 relpos = (color - make_float3(min_x, min_x, min_x)) / range_x;
- float r = rgb_ramp_lookup(kg, *offset, relpos.x, true, true, table_size).x;
- float g = rgb_ramp_lookup(kg, *offset, relpos.y, true, true, table_size).y;
- float b = rgb_ramp_lookup(kg, *offset, relpos.z, true, true, table_size).z;
+ float r = rgb_ramp_lookup(kg, offset, relpos.x, true, true, table_size).x;
+ float g = rgb_ramp_lookup(kg, offset, relpos.y, true, true, table_size).y;
+ float b = rgb_ramp_lookup(kg, offset, relpos.z, true, true, table_size).z;
color = (1.0f - fac) * color + fac * make_float3(r, g, b);
stack_store_float3(stack, out_offset, color);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index f501252062e..8d52845ea3d 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -16,15 +16,15 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint hue_in,
- uint saturation_in,
- uint value_in,
- int *offset)
+ccl_device_noinline int svm_node_combine_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint hue_in,
+ uint saturation_in,
+ uint value_in,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint color_out = node1.y;
float hue = stack_load_float(stack, hue_in);
@@ -36,17 +36,18 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
if (stack_valid(color_out))
stack_store_float3(stack, color_out, color);
+ return offset;
}
-ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint color_in,
- uint hue_out,
- uint saturation_out,
- int *offset)
+ccl_device_noinline int svm_node_separate_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint color_in,
+ uint hue_out,
+ uint saturation_out,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint value_out = node1.y;
float3 color = stack_load_float3(stack, color_in);
@@ -60,6 +61,7 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
stack_store_float(stack, saturation_out, color.y);
if (stack_valid(value_out))
stack_store_float(stack, value_out, color.z);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index b908732f026..b77c4311e72 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -37,7 +37,7 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma)
(1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma);
}
-ccl_device float3 sky_radiance_preetham(KernelGlobals *kg,
+ccl_device float3 sky_radiance_preetham(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -90,7 +90,7 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float
configuration[6] * mieM + configuration[7] * zenith);
}
-ccl_device float3 sky_radiance_hosek(KernelGlobals *kg,
+ccl_device float3 sky_radiance_hosek(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -127,7 +127,7 @@ ccl_device float3 geographical_to_direction(float lat, float lon)
return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat));
}
-ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
+ccl_device float3 sky_radiance_nishita(const KernelGlobals *kg,
float3 dir,
float *nishita_data,
uint texture_id)
@@ -209,8 +209,8 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
return xyz_to_rgb(kg, xyz);
}
-ccl_device void svm_node_tex_sky(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_sky(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
/* Load data */
uint dir_offset = node.y;
@@ -226,49 +226,49 @@ ccl_device void svm_node_tex_sky(
float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
float config_x[9], config_y[9], config_z[9];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
sunphi = data.x;
suntheta = data.y;
radiance_x = data.z;
radiance_y = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
radiance_z = data.x;
config_x[0] = data.y;
config_x[1] = data.z;
config_x[2] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[3] = data.x;
config_x[4] = data.y;
config_x[5] = data.z;
config_x[6] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[7] = data.x;
config_x[8] = data.y;
config_y[0] = data.z;
config_y[1] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[2] = data.x;
config_y[3] = data.y;
config_y[4] = data.z;
config_y[5] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[6] = data.x;
config_y[7] = data.y;
config_y[8] = data.z;
config_z[0] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[1] = data.x;
config_z[2] = data.y;
config_z[3] = data.z;
config_z[4] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[5] = data.x;
config_z[6] = data.y;
config_z[7] = data.z;
@@ -305,19 +305,19 @@ ccl_device void svm_node_tex_sky(
/* Define variables */
float nishita_data[10];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
nishita_data[0] = data.x;
nishita_data[1] = data.y;
nishita_data[2] = data.z;
nishita_data[3] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[4] = data.x;
nishita_data[5] = data.y;
nishita_data[6] = data.z;
nishita_data[7] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[8] = data.x;
nishita_data[9] = data.y;
uint texture_id = __float_as_uint(data.z);
@@ -327,6 +327,7 @@ ccl_device void svm_node_tex_sky(
}
stack_store_float3(stack, out_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 46600551cc4..a35253080da 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -14,12 +14,16 @@
* limitations under the License.
*/
+#include "kernel/geom/geom.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Texture Coordinate Node */
-ccl_device void svm_node_tex_coord(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
float3 data;
uint type = node.y;
@@ -35,9 +39,9 @@ ccl_device void svm_node_tex_coord(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -92,10 +96,11 @@ ccl_device void svm_node_tex_coord(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
}
-ccl_device void svm_node_tex_coord_bump_dx(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -112,9 +117,9 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -136,7 +141,7 @@ ccl_device void svm_node_tex_coord_bump_dx(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
data.z = 0.0f;
@@ -169,13 +174,14 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_tex_coord_bump_dy(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -192,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -216,7 +222,7 @@ ccl_device void svm_node_tex_coord_bump_dy(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
data.z = 0.0f;
@@ -249,12 +255,16 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_normal_map(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset, strength_offset, normal_offset, space;
svm_unpack_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space);
@@ -346,7 +356,10 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
stack_store_float3(stack, normal_offset, N);
}
-ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tangent(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint tangent_offset, direction_type, axis;
svm_unpack_node_uchar3(node.y, &tangent_offset, &direction_type, &axis);
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 062afcfa5ac..c053be96c51 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -30,37 +30,6 @@ CCL_NAMESPACE_BEGIN
/* Nodes */
-/* Known frequencies of used nodes, used for selective nodes compilation
- * in the kernel. Currently only affects split OpenCL kernel.
- *
- * Keep as defines so it's easy to check which nodes are to be compiled
- * from preprocessor.
- *
- * Lower the number of group more often the node is used.
- */
-#define NODE_GROUP_LEVEL_0 0
-#define NODE_GROUP_LEVEL_1 1
-#define NODE_GROUP_LEVEL_2 2
-#define NODE_GROUP_LEVEL_3 3
-#define NODE_GROUP_LEVEL_4 4
-#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4
-
-#define NODE_FEATURE_VOLUME (1 << 0)
-#define NODE_FEATURE_HAIR (1 << 1)
-#define NODE_FEATURE_BUMP (1 << 2)
-#define NODE_FEATURE_BUMP_STATE (1 << 3)
-#define NODE_FEATURE_VORONOI_EXTRA (1 << 4)
-/* TODO(sergey): Consider using something like ((uint)(-1)).
- * Need to check carefully operand types around usage of this
- * define first.
- */
-#define NODE_FEATURE_ALL \
- (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE | \
- NODE_FEATURE_VORONOI_EXTRA)
-
-#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
-#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
-
typedef enum ShaderNodeType {
NODE_END = 0,
NODE_SHADER_JUMP,
@@ -572,12 +541,8 @@ typedef enum ClosureType {
CLOSURE_BSDF_TRANSPARENT_ID,
/* BSSRDF */
- CLOSURE_BSSRDF_CUBIC_ID,
- CLOSURE_BSSRDF_GAUSSIAN_ID,
- CLOSURE_BSSRDF_PRINCIPLED_ID,
- CLOSURE_BSSRDF_BURLEY_ID,
CLOSURE_BSSRDF_RANDOM_WALK_ID,
- CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID,
+ CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID,
/* Other */
CLOSURE_HOLDOUT_ID,
@@ -620,11 +585,9 @@ typedef enum ClosureType {
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
-#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
-#define CLOSURE_IS_DISK_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
+ (type >= CLOSURE_BSSRDF_RANDOM_WALK_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_VOLUME(type) \
(type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
#define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h
index 5b76f2c8832..d0478660094 100644
--- a/intern/cycles/kernel/svm/svm_value.h
+++ b/intern/cycles/kernel/svm/svm_value.h
@@ -19,20 +19,21 @@ CCL_NAMESPACE_BEGIN
/* Value Nodes */
ccl_device void svm_node_value_f(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
{
stack_store_float(stack, out_offset, __uint_as_float(ivalue));
}
-ccl_device void svm_node_value_v(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset)
+ccl_device int svm_node_value_v(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 p = make_float3(
__uint_as_float(node1.y), __uint_as_float(node1.z), __uint_as_float(node1.w));
stack_store_float3(stack, out_offset, p);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h
index 50045752484..55e1bce0158 100644
--- a/intern/cycles/kernel/svm/svm_vector_rotate.h
+++ b/intern/cycles/kernel/svm/svm_vector_rotate.h
@@ -18,11 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Vector Rotate */
-ccl_device void svm_node_vector_rotate(ShaderData *sd,
- float *stack,
- uint input_stack_offsets,
- uint axis_stack_offsets,
- uint result_stack_offset)
+ccl_device_noinline void svm_node_vector_rotate(ShaderData *sd,
+ float *stack,
+ uint input_stack_offsets,
+ uint axis_stack_offsets,
+ uint result_stack_offset)
{
uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset,
angle_stack_offset, invert;
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 1e95492cf1b..8aedb7e0f54 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Vector Transform */
-ccl_device void svm_node_vector_transform(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_vector_transform(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint itype, ifrom, ito;
uint vector_in, vector_out;
diff --git a/intern/cycles/kernel/svm/svm_vertex_color.h b/intern/cycles/kernel/svm/svm_vertex_color.h
index 0aa45835522..986ea244f3a 100644
--- a/intern/cycles/kernel/svm/svm_vertex_color.h
+++ b/intern/cycles/kernel/svm/svm_vertex_color.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_vertex_color(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -35,18 +35,12 @@ ccl_device void svm_node_vertex_color(KernelGlobals *kg,
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dx(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -62,18 +56,12 @@ ccl_device_noinline
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dy(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index d0e7db35fab..b1d2eff7f37 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -902,16 +902,17 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float
*outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f;
}
-ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint feature,
- uint metric,
- int *offset)
+template<uint node_feature_mask>
+ccl_device_noinline int svm_node_tex_voronoi(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint feature,
+ uint metric,
+ int offset)
{
- uint4 stack_offsets = read_node(kg, offset);
- uint4 defaults = read_node(kg, offset);
+ uint4 stack_offsets = read_node(kg, &offset);
+ uint4 defaults = read_node(kg, &offset);
uint coord_stack_offset, w_stack_offset, scale_stack_offset, smoothness_stack_offset;
uint exponent_stack_offset, randomness_stack_offset, distance_out_stack_offset,
@@ -997,18 +998,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out_2d);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_2d(coord_2d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_2d);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_2d(coord_2d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_2d);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_2d(coord_2d,
exponent,
@@ -1042,18 +1043,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_3d(coord,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_3d(coord,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_3d(coord,
exponent,
@@ -1076,54 +1077,54 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
break;
}
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case 4: {
- float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
- float4 position_out_4d;
- switch (voronoi_feature) {
- case NODE_VORONOI_F1:
- voronoi_f1_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_4d(coord_4d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_F2:
- voronoi_f2_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_DISTANCE_TO_EDGE:
- voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
- break;
- case NODE_VORONOI_N_SPHERE_RADIUS:
- voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
- break;
- default:
- kernel_assert(0);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
+ float4 position_out_4d;
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_4d(coord_4d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_F2:
+ voronoi_f2_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ position_out_4d = safe_divide_float4_float(position_out_4d, scale);
+ position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
+ w_out = position_out_4d.w;
}
- position_out_4d = safe_divide_float4_float(position_out_4d, scale);
- position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
- w_out = position_out_4d.w;
break;
}
-#endif
default:
kernel_assert(0);
}
@@ -1138,6 +1139,7 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
stack_store_float(stack, w_out_stack_offset, w_out);
if (stack_valid(radius_out_stack_offset))
stack_store_float(stack, radius_out_stack_offset, radius_out);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 4bc14f82382..78b75405356 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -19,8 +19,8 @@ CCL_NAMESPACE_BEGIN
/* TODO(sergey): Think of making it more generic volume-type attribute
* sampler.
*/
-ccl_device void svm_node_tex_voxel(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_voxel(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, density_out_offset, color_out_offset, space;
svm_unpack_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
@@ -33,9 +33,9 @@ ccl_device void svm_node_tex_voxel(
else {
kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
co = transform_point(&tfm, co);
}
@@ -47,6 +47,7 @@ ccl_device void svm_node_tex_voxel(
stack_store_float(stack, density_out_offset, r.w);
if (stack_valid(color_out_offset))
stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index c4763475b47..00f980c16df 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -82,11 +82,11 @@ ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
}
}
-ccl_device void svm_node_tex_wave(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_wave(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
/* RNA properties */
uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset;
@@ -125,6 +125,7 @@ ccl_device void svm_node_tex_wave(
stack_store_float(stack, fac_offset, f);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, make_float3(f, f, f));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index d6144802559..fba8aa63d31 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -69,8 +69,8 @@ ccl_static_constant float cie_colour_match[81][3] = {
{0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
{0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}};
-ccl_device void svm_node_wavelength(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
+ccl_device_noinline void svm_node_wavelength(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
{
float lambda_nm = stack_load_float(stack, wavelength);
float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80
diff --git a/intern/cycles/kernel/svm/svm_white_noise.h b/intern/cycles/kernel/svm/svm_white_noise.h
index b30d85acaec..0306d2e7b9c 100644
--- a/intern/cycles/kernel/svm/svm_white_noise.h
+++ b/intern/cycles/kernel/svm/svm_white_noise.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_tex_white_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint inputs_stack_offsets,
- uint ouptuts_stack_offsets,
- int *offset)
+ccl_device_noinline void svm_node_tex_white_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint inputs_stack_offsets,
+ uint ouptuts_stack_offsets)
{
uint vector_stack_offset, w_stack_offset, value_stack_offset, color_stack_offset;
svm_unpack_node_uchar2(inputs_stack_offsets, &vector_stack_offset, &w_stack_offset);
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 49158bd86d5..7ec913789d2 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -35,7 +35,7 @@ CCL_NAMESPACE_BEGIN
/* Wireframe Node */
ccl_device_inline float wireframe(
- KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
+ const KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
{
#ifdef __HAIR__
if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
@@ -88,7 +88,10 @@ ccl_device_inline float wireframe(
return 0.0f;
}
-ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_wireframe(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_size = node.y;
uint out_fac = node.z;
@@ -100,18 +103,7 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
int pixel_size = (int)use_pixel_size;
/* Calculate wireframe */
-#ifdef __SPLIT_KERNEL__
- /* TODO(sergey): This is because sd is actually a global space,
- * which makes it difficult to re-use same wireframe() function.
- *
- * With OpenCL 2.0 it's possible to avoid this change, but for until
- * then we'll be living with such an exception.
- */
- float3 P = sd->P;
- float f = wireframe(kg, sd, size, pixel_size, &P);
-#else
float f = wireframe(kg, sd, size, pixel_size, &sd->P);
-#endif
/* TODO(sergey): Think of faster way to calculate derivatives. */
if (bump_offset == NODE_BUMP_OFFSET_DX) {