
git.blender.org/blender.git
author     Brecht Van Lommel <brecht@blender.org>   2021-09-20 18:59:20 +0300
committer  Brecht Van Lommel <brecht@blender.org>   2021-09-21 15:55:54 +0300
commit     08031197250aeecbaca3803254e6f25b8c7b7b37
tree       6fe7ab045f0dc0a423d6557c4073f34309ef4740
parent     fa6b1007bad065440950cd67deb16a04f368856f
Cycles: merge of cycles-x branch, a major update to the renderer
This includes much improved GPU rendering performance, viewport interactivity, new shadow catcher, revamped sampling settings, subsurface scattering anisotropy, new GPU volume sampling, improved PMJ sampling pattern, and more.

Some features have also been removed or changed, breaking backwards compatibility, including the removal of the OpenCL backend, for which alternatives are under development.

Release notes and code docs:
https://wiki.blender.org/wiki/Reference/Release_Notes/3.0/Cycles
https://wiki.blender.org/wiki/Source/Render/Cycles

Credits:
* Sergey Sharybin
* Brecht Van Lommel
* Patrick Mours (OptiX backend)
* Christophe Hery (subsurface scattering anisotropy)
* William Leeson (PMJ sampling pattern)
* Alaska (various fixes and tweaks)
* Thomas Dinges (various fixes)

For the full commit history, see the cycles-x branch. This squashes all changes together, since intermediate changes would often fail to build or pass tests.

Ref T87839, T87837, T87836
Fixes T90734, T89353, T80267, T77185, T69800
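As a rough, non-authoritative illustration of the revamped sampling and denoising settings mentioned above, the Python sketch below configures a scene through the addon properties that appear later in this diff (presets.py and properties.py). The enum identifiers and defaults are assumptions based on that code and may vary with build options and available denoisers.

import bpy

scene = bpy.context.scene
scene.render.engine = 'CYCLES'
cycles = scene.cycles

# Final-render sampling: adaptive sampling with a noise threshold and optional time limit.
cycles.use_adaptive_sampling = True
cycles.samples = 1024
cycles.adaptive_threshold = 0.01
cycles.adaptive_min_samples = 0        # 0 lets Cycles choose a minimum automatically (assumed)
cycles.time_limit = 0.0                # seconds; 0 disables the limit (assumed)

# Final-render denoising; the NLM denoiser is removed, OptiX and OpenImageDenoise remain.
cycles.use_denoising = True
cycles.denoiser = 'OPENIMAGEDENOISE'   # identifier assumed; 'OPTIX' on supported GPUs
cycles.denoising_input_passes = 'RGB_ALBEDO_NORMAL'
cycles.denoising_prefilter = 'FAST'

# Viewport (preview) sampling and denoising are now configured separately.
cycles.use_preview_adaptive_sampling = True
cycles.preview_samples = 1024
cycles.use_preview_denoising = True
cycles.preview_denoising_start_sample = 1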
-rw-r--r--CMakeLists.txt8
-rw-r--r--build_files/cmake/Modules/FindOptiX.cmake17
-rw-r--r--build_files/config/pipeline_config.yaml2
-rw-r--r--extern/cuew/include/cuew.h3
-rw-r--r--intern/cycles/CMakeLists.txt24
-rw-r--r--intern/cycles/app/CMakeLists.txt18
-rw-r--r--intern/cycles/app/cycles_standalone.cpp8
-rw-r--r--intern/cycles/app/cycles_xml.cpp2
-rw-r--r--intern/cycles/blender/CMakeLists.txt10
-rw-r--r--intern/cycles/blender/addon/__init__.py9
-rw-r--r--intern/cycles/blender/addon/engine.py132
-rw-r--r--intern/cycles/blender/addon/presets.py50
-rw-r--r--intern/cycles/blender/addon/properties.py499
-rw-r--r--intern/cycles/blender/addon/ui.py480
-rw-r--r--intern/cycles/blender/addon/version_update.py44
-rw-r--r--intern/cycles/blender/blender_camera.cpp13
-rw-r--r--intern/cycles/blender/blender_device.cpp11
-rw-r--r--intern/cycles/blender/blender_gpu_display.cpp761
-rw-r--r--intern/cycles/blender/blender_gpu_display.h211
-rw-r--r--intern/cycles/blender/blender_light.cpp18
-rw-r--r--intern/cycles/blender/blender_object.cpp2
-rw-r--r--intern/cycles/blender/blender_python.cpp262
-rw-r--r--intern/cycles/blender/blender_session.cpp695
-rw-r--r--intern/cycles/blender/blender_session.h55
-rw-r--r--intern/cycles/blender/blender_shader.cpp33
-rw-r--r--intern/cycles/blender/blender_sync.cpp468
-rw-r--r--intern/cycles/blender/blender_sync.h35
-rw-r--r--intern/cycles/blender/blender_viewport.cpp43
-rw-r--r--intern/cycles/blender/blender_viewport.h5
-rw-r--r--intern/cycles/bvh/bvh_build.cpp18
-rw-r--r--intern/cycles/bvh/bvh_embree.cpp89
-rw-r--r--intern/cycles/bvh/bvh_params.h21
-rw-r--r--intern/cycles/cmake/external_libs.cmake3
-rw-r--r--intern/cycles/device/CMakeLists.txt110
-rw-r--r--intern/cycles/device/cpu/device.cpp64
-rw-r--r--intern/cycles/device/cpu/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl)27
-rw-r--r--intern/cycles/device/cpu/device_impl.cpp481
-rw-r--r--intern/cycles/device/cpu/device_impl.h99
-rw-r--r--intern/cycles/device/cpu/kernel.cpp61
-rw-r--r--intern/cycles/device/cpu/kernel.h111
-rw-r--r--intern/cycles/device/cpu/kernel_function.h124
-rw-r--r--intern/cycles/device/cpu/kernel_thread_globals.cpp85
-rw-r--r--intern/cycles/device/cpu/kernel_thread_globals.h57
-rw-r--r--intern/cycles/device/cuda/device.cpp (renamed from intern/cycles/device/device_cuda.cpp)51
-rw-r--r--intern/cycles/device/cuda/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl)29
-rw-r--r--intern/cycles/device/cuda/device_cuda.h270
-rw-r--r--intern/cycles/device/cuda/device_cuda_impl.cpp2714
-rw-r--r--intern/cycles/device/cuda/device_impl.cpp1370
-rw-r--r--intern/cycles/device/cuda/device_impl.h155
-rw-r--r--intern/cycles/device/cuda/graphics_interop.cpp102
-rw-r--r--intern/cycles/device/cuda/graphics_interop.h66
-rw-r--r--intern/cycles/device/cuda/kernel.cpp69
-rw-r--r--intern/cycles/device/cuda/kernel.h56
-rw-r--r--intern/cycles/device/cuda/queue.cpp220
-rw-r--r--intern/cycles/device/cuda/queue.h67
-rw-r--r--intern/cycles/device/cuda/util.cpp61
-rw-r--r--intern/cycles/device/cuda/util.h65
-rw-r--r--intern/cycles/device/device.cpp476
-rw-r--r--intern/cycles/device/device.h366
-rw-r--r--intern/cycles/device/device_cpu.cpp1680
-rw-r--r--intern/cycles/device/device_denoise.cpp88
-rw-r--r--intern/cycles/device/device_denoise.h110
-rw-r--r--intern/cycles/device/device_denoising.cpp353
-rw-r--r--intern/cycles/device/device_denoising.h197
-rw-r--r--intern/cycles/device/device_graphics_interop.cpp (renamed from intern/cycles/kernel/kernels/opencl/kernel_path_init.cl)11
-rw-r--r--intern/cycles/device/device_graphics_interop.h55
-rw-r--r--intern/cycles/device/device_intern.h58
-rw-r--r--intern/cycles/device/device_kernel.cpp157
-rw-r--r--intern/cycles/device/device_kernel.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl)25
-rw-r--r--intern/cycles/device/device_memory.cpp7
-rw-r--r--intern/cycles/device/device_memory.h136
-rw-r--r--intern/cycles/device/device_multi.cpp826
-rw-r--r--intern/cycles/device/device_network.cpp812
-rw-r--r--intern/cycles/device/device_network.h490
-rw-r--r--intern/cycles/device/device_opencl.cpp245
-rw-r--r--intern/cycles/device/device_optix.cpp1936
-rw-r--r--intern/cycles/device/device_queue.cpp87
-rw-r--r--intern/cycles/device/device_queue.h113
-rw-r--r--intern/cycles/device/device_split_kernel.cpp389
-rw-r--r--intern/cycles/device/device_split_kernel.h145
-rw-r--r--intern/cycles/device/device_task.cpp182
-rw-r--r--intern/cycles/device/device_task.h188
-rw-r--r--intern/cycles/device/dummy/device.cpp (renamed from intern/cycles/device/device_dummy.cpp)24
-rw-r--r--intern/cycles/device/dummy/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl)21
-rw-r--r--intern/cycles/device/multi/device.cpp423
-rw-r--r--intern/cycles/device/multi/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl)21
-rw-r--r--intern/cycles/device/opencl/device_opencl.h658
-rw-r--r--intern/cycles/device/opencl/device_opencl_impl.cpp2113
-rw-r--r--intern/cycles/device/opencl/memory_manager.cpp264
-rw-r--r--intern/cycles/device/opencl/memory_manager.h105
-rw-r--r--intern/cycles/device/opencl/opencl_util.cpp1326
-rw-r--r--intern/cycles/device/optix/device.cpp105
-rw-r--r--intern/cycles/device/optix/device.h35
-rw-r--r--intern/cycles/device/optix/device_impl.cpp1573
-rw-r--r--intern/cycles/device/optix/device_impl.h186
-rw-r--r--intern/cycles/device/optix/queue.cpp144
-rw-r--r--intern/cycles/device/optix/queue.h39
-rw-r--r--intern/cycles/device/optix/util.h45
-rw-r--r--intern/cycles/graph/node.cpp2
-rw-r--r--intern/cycles/graph/node.h18
-rw-r--r--intern/cycles/integrator/CMakeLists.txt76
-rw-r--r--intern/cycles/integrator/adaptive_sampling.cpp71
-rw-r--r--intern/cycles/integrator/adaptive_sampling.h55
-rw-r--r--intern/cycles/integrator/denoiser.cpp204
-rw-r--r--intern/cycles/integrator/denoiser.h135
-rw-r--r--intern/cycles/integrator/denoiser_device.cpp106
-rw-r--r--intern/cycles/integrator/denoiser_device.h40
-rw-r--r--intern/cycles/integrator/denoiser_oidn.cpp628
-rw-r--r--intern/cycles/integrator/denoiser_oidn.h47
-rw-r--r--intern/cycles/integrator/denoiser_optix.cpp (renamed from intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl)26
-rw-r--r--intern/cycles/integrator/denoiser_optix.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl)21
-rw-r--r--intern/cycles/integrator/pass_accessor.cpp318
-rw-r--r--intern/cycles/integrator/pass_accessor.h160
-rw-r--r--intern/cycles/integrator/pass_accessor_cpu.cpp183
-rw-r--r--intern/cycles/integrator/pass_accessor_cpu.h77
-rw-r--r--intern/cycles/integrator/pass_accessor_gpu.cpp118
-rw-r--r--intern/cycles/integrator/pass_accessor_gpu.h68
-rw-r--r--intern/cycles/integrator/path_trace.cpp1147
-rw-r--r--intern/cycles/integrator/path_trace.h324
-rw-r--r--intern/cycles/integrator/path_trace_work.cpp203
-rw-r--r--intern/cycles/integrator/path_trace_work.h194
-rw-r--r--intern/cycles/integrator/path_trace_work_cpu.cpp281
-rw-r--r--intern/cycles/integrator/path_trace_work_cpu.h82
-rw-r--r--intern/cycles/integrator/path_trace_work_gpu.cpp933
-rw-r--r--intern/cycles/integrator/path_trace_work_gpu.h165
-rw-r--r--intern/cycles/integrator/render_scheduler.cpp1187
-rw-r--r--intern/cycles/integrator/render_scheduler.h466
-rw-r--r--intern/cycles/integrator/shader_eval.cpp173
-rw-r--r--intern/cycles/integrator/shader_eval.h61
-rw-r--r--intern/cycles/integrator/tile.cpp108
-rw-r--r--intern/cycles/integrator/tile.h56
-rw-r--r--intern/cycles/integrator/work_balancer.cpp99
-rw-r--r--intern/cycles/integrator/work_balancer.h42
-rw-r--r--intern/cycles/integrator/work_tile_scheduler.cpp138
-rw-r--r--intern/cycles/integrator/work_tile_scheduler.h98
-rw-r--r--intern/cycles/kernel/CMakeLists.txt314
-rw-r--r--intern/cycles/kernel/bvh/bvh.h32
-rw-r--r--intern/cycles/kernel/bvh/bvh_embree.h21
-rw-r--r--intern/cycles/kernel/bvh/bvh_local.h8
-rw-r--r--intern/cycles/kernel/bvh/bvh_nodes.h10
-rw-r--r--intern/cycles/kernel/bvh/bvh_shadow_all.h105
-rw-r--r--intern/cycles/kernel/bvh/bvh_traversal.h26
-rw-r--r--intern/cycles/kernel/bvh/bvh_types.h5
-rw-r--r--intern/cycles/kernel/bvh/bvh_util.h110
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume.h13
-rw-r--r--intern/cycles/kernel/bvh/bvh_volume_all.h14
-rw-r--r--intern/cycles/kernel/closure/alloc.h2
-rw-r--r--intern/cycles/kernel/closure/bsdf.h91
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h25
-rw-r--r--intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h15
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse.h13
-rw-r--r--intern/cycles/kernel/closure/bsdf_diffuse_ramp.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_hair.h14
-rw-r--r--intern/cycles/kernel/closure/bsdf_hair_principled.h25
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet.h31
-rw-r--r--intern/cycles/kernel/closure/bsdf_microfacet_multi.h6
-rw-r--r--intern/cycles/kernel/closure/bsdf_oren_nayar.h13
-rw-r--r--intern/cycles/kernel/closure/bsdf_phong_ramp.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_diffuse.h15
-rw-r--r--intern/cycles/kernel/closure/bsdf_principled_sheen.h7
-rw-r--r--intern/cycles/kernel/closure/bsdf_reflection.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_refraction.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_toon.h14
-rw-r--r--intern/cycles/kernel/closure/bsdf_transparent.h5
-rw-r--r--intern/cycles/kernel/closure/bsdf_util.h5
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h406
-rw-r--r--intern/cycles/kernel/closure/emissive.h2
-rw-r--r--intern/cycles/kernel/closure/volume.h109
-rw-r--r--intern/cycles/kernel/device/cpu/compat.h (renamed from intern/cycles/kernel/kernel_compat_cpu.h)59
-rw-r--r--intern/cycles/kernel/device/cpu/globals.h61
-rw-r--r--intern/cycles/kernel/device/cpu/image.h (renamed from intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h)9
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel.h (renamed from intern/cycles/kernel/kernel.h)25
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch.h113
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_arch_impl.h235
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_avx.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_avx2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse3.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp)4
-rw-r--r--intern/cycles/kernel/device/cpu/kernel_sse41.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp)4
-rw-r--r--intern/cycles/kernel/device/cuda/compat.h (renamed from intern/cycles/kernel/kernel_compat_cuda.h)139
-rw-r--r--intern/cycles/kernel/device/cuda/config.h114
-rw-r--r--intern/cycles/kernel/device/cuda/globals.h48
-rw-r--r--intern/cycles/kernel/device/cuda/kernel.cu (renamed from intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl)18
-rw-r--r--intern/cycles/kernel/device/gpu/image.h (renamed from intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h)55
-rw-r--r--intern/cycles/kernel/device/gpu/kernel.h843
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_active_index.h83
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_prefix_sum.h46
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_reduce.h83
-rw-r--r--intern/cycles/kernel/device/gpu/parallel_sorted_index.h49
-rw-r--r--intern/cycles/kernel/device/optix/compat.h (renamed from intern/cycles/kernel/kernel_compat_optix.h)90
-rw-r--r--intern/cycles/kernel/device/optix/globals.h59
-rw-r--r--intern/cycles/kernel/device/optix/kernel.cu (renamed from intern/cycles/kernel/kernels/optix/kernel_optix.cu)168
-rw-r--r--intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu29
-rw-r--r--intern/cycles/kernel/filter/filter.h52
-rw-r--r--intern/cycles/kernel/filter/filter_defines.h72
-rw-r--r--intern/cycles/kernel/filter/filter_features.h156
-rw-r--r--intern/cycles/kernel/filter/filter_features_sse.h118
-rw-r--r--intern/cycles/kernel/filter/filter_kernel.h50
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_cpu.h254
-rw-r--r--intern/cycles/kernel/filter/filter_nlm_gpu.h255
-rw-r--r--intern/cycles/kernel/filter/filter_prefilter.h303
-rw-r--r--intern/cycles/kernel/filter/filter_reconstruction.h140
-rw-r--r--intern/cycles/kernel/filter/filter_transform.h120
-rw-r--r--intern/cycles/kernel/filter/filter_transform_gpu.h129
-rw-r--r--intern/cycles/kernel/filter/filter_transform_sse.h129
-rw-r--r--intern/cycles/kernel/geom/geom.h3
-rw-r--r--intern/cycles/kernel/geom/geom_attribute.h12
-rw-r--r--intern/cycles/kernel/geom/geom_curve.h21
-rw-r--r--intern/cycles/kernel/geom/geom_curve_intersect.h68
-rw-r--r--intern/cycles/kernel/geom/geom_motion_curve.h12
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle.h12
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle_intersect.h76
-rw-r--r--intern/cycles/kernel/geom/geom_motion_triangle_shader.h16
-rw-r--r--intern/cycles/kernel/geom/geom_object.h243
-rw-r--r--intern/cycles/kernel/geom/geom_patch.h20
-rw-r--r--intern/cycles/kernel/geom/geom_primitive.h39
-rw-r--r--intern/cycles/kernel/geom/geom_shader_data.h373
-rw-r--r--intern/cycles/kernel/geom/geom_subd_triangle.h29
-rw-r--r--intern/cycles/kernel/geom/geom_triangle.h37
-rw-r--r--intern/cycles/kernel/geom/geom_triangle_intersect.h81
-rw-r--r--intern/cycles/kernel/geom/geom_volume.h6
-rw-r--r--intern/cycles/kernel/integrator/integrator_init_from_bake.h181
-rw-r--r--intern/cycles/kernel/integrator/integrator_init_from_camera.h120
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_closest.h248
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_shadow.h144
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_subsurface.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl)27
-rw-r--r--intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h198
-rw-r--r--intern/cycles/kernel/integrator/integrator_megakernel.h93
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_background.h215
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_light.h126
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_shadow.h182
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_surface.h502
-rw-r--r--intern/cycles/kernel/integrator/integrator_shade_volume.h1015
-rw-r--r--intern/cycles/kernel/integrator/integrator_state.h185
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_flow.h144
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_template.h163
-rw-r--r--intern/cycles/kernel/integrator/integrator_state_util.h273
-rw-r--r--intern/cycles/kernel/integrator/integrator_subsurface.h623
-rw-r--r--intern/cycles/kernel/integrator/integrator_volume_stack.h223
-rw-r--r--intern/cycles/kernel/kernel_accumulate.h972
-rw-r--r--intern/cycles/kernel/kernel_adaptive_sampling.h274
-rw-r--r--intern/cycles/kernel/kernel_bake.h514
-rw-r--r--intern/cycles/kernel/kernel_camera.h72
-rw-r--r--intern/cycles/kernel/kernel_color.h9
-rw-r--r--intern/cycles/kernel/kernel_compat_opencl.h177
-rw-r--r--intern/cycles/kernel/kernel_differential.h73
-rw-r--r--intern/cycles/kernel/kernel_emission.h374
-rw-r--r--intern/cycles/kernel/kernel_film.h567
-rw-r--r--intern/cycles/kernel/kernel_globals.h248
-rw-r--r--intern/cycles/kernel/kernel_id_passes.h35
-rw-r--r--intern/cycles/kernel/kernel_jitter.h252
-rw-r--r--intern/cycles/kernel/kernel_light.h406
-rw-r--r--intern/cycles/kernel/kernel_light_background.h25
-rw-r--r--intern/cycles/kernel/kernel_light_common.h6
-rw-r--r--intern/cycles/kernel/kernel_lookup_table.h56
-rw-r--r--intern/cycles/kernel/kernel_math.h5
-rw-r--r--intern/cycles/kernel/kernel_montecarlo.h5
-rw-r--r--intern/cycles/kernel/kernel_passes.h414
-rw-r--r--intern/cycles/kernel/kernel_path.h709
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h556
-rw-r--r--intern/cycles/kernel/kernel_path_common.h48
-rw-r--r--intern/cycles/kernel/kernel_path_state.h383
-rw-r--r--intern/cycles/kernel/kernel_path_subsurface.h139
-rw-r--r--intern/cycles/kernel/kernel_path_surface.h360
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h260
-rw-r--r--intern/cycles/kernel/kernel_profiling.h24
-rw-r--r--intern/cycles/kernel/kernel_projection.h5
-rw-r--r--intern/cycles/kernel/kernel_queues.h147
-rw-r--r--intern/cycles/kernel/kernel_random.h228
-rw-r--r--intern/cycles/kernel/kernel_shader.h1043
-rw-r--r--intern/cycles/kernel/kernel_shadow.h466
-rw-r--r--intern/cycles/kernel/kernel_shadow_catcher.h116
-rw-r--r--intern/cycles/kernel/kernel_subsurface.h724
-rw-r--r--intern/cycles/kernel/kernel_textures.h2
-rw-r--r--intern/cycles/kernel/kernel_types.h1030
-rw-r--r--intern/cycles/kernel/kernel_volume.h1440
-rw-r--r--intern/cycles/kernel/kernel_work_stealing.h87
-rw-r--r--intern/cycles/kernel/kernel_write_passes.h53
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter.cpp61
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_avx2.cpp40
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu.h143
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h331
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse2.cpp34
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse3.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/filter_sse41.cpp38
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu.h100
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h232
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split.cpp62
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp41
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp42
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp36
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp38
-rw-r--r--intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp39
-rw-r--r--intern/cycles/kernel/kernels/cuda/filter.cu413
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel.cu232
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_config.h121
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu156
-rw-r--r--intern/cycles/kernel/kernels/opencl/filter.cl321
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl23
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_background.cl35
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_bake.cl36
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_base.cl88
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_data_init.cl53
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_displace.cl36
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h358
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl26
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl27
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl24
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl34
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_split_function.h67
-rw-r--r--intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl24
-rw-r--r--intern/cycles/kernel/osl/background.cpp2
-rw-r--r--intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp2
-rw-r--r--intern/cycles/kernel/osl/bsdf_phong_ramp.cpp2
-rw-r--r--intern/cycles/kernel/osl/emissive.cpp2
-rw-r--r--intern/cycles/kernel/osl/osl_bssrdf.cpp40
-rw-r--r--intern/cycles/kernel/osl/osl_closures.cpp8
-rw-r--r--intern/cycles/kernel/osl/osl_services.cpp158
-rw-r--r--intern/cycles/kernel/osl/osl_services.h16
-rw-r--r--intern/cycles/kernel/osl/osl_shader.cpp40
-rw-r--r--intern/cycles/kernel/osl/osl_shader.h26
-rw-r--r--intern/cycles/kernel/shaders/node_principled_bsdf.osl31
-rw-r--r--intern/cycles/kernel/shaders/node_subsurface_scattering.osl25
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h43
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_x.h30
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_filter_y.h29
-rw-r--r--intern/cycles/kernel/split/kernel_adaptive_stopping.h37
-rw-r--r--intern/cycles/kernel/split/kernel_branched.h231
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h154
-rw-r--r--intern/cycles/kernel/split/kernel_data_init.h115
-rw-r--r--intern/cycles/kernel/split/kernel_direct_lighting.h152
-rw-r--r--intern/cycles/kernel/split/kernel_do_volume.h227
-rw-r--r--intern/cycles/kernel/split/kernel_enqueue_inactive.h46
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h149
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_background.h69
-rw-r--r--intern/cycles/kernel/split/kernel_indirect_subsurface.h67
-rw-r--r--intern/cycles/kernel/split/kernel_lamp_emission.h67
-rw-r--r--intern/cycles/kernel/split/kernel_next_iteration_setup.h258
-rw-r--r--intern/cycles/kernel/split/kernel_path_init.h78
-rw-r--r--intern/cycles/kernel/split/kernel_queue_enqueue.h87
-rw-r--r--intern/cycles/kernel/split/kernel_scene_intersect.h83
-rw-r--r--intern/cycles/kernel/split/kernel_shader_eval.h69
-rw-r--r--intern/cycles/kernel/split/kernel_shader_setup.h74
-rw-r--r--intern/cycles/kernel/split/kernel_shader_sort.h97
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_ao.h59
-rw-r--r--intern/cycles/kernel/split/kernel_shadow_blocked_dl.h98
-rw-r--r--intern/cycles/kernel/split/kernel_split_common.h106
-rw-r--r--intern/cycles/kernel/split/kernel_split_data.h77
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h180
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h264
-rw-r--r--intern/cycles/kernel/svm/svm.h227
-rw-r--r--intern/cycles/kernel/svm/svm_ao.h53
-rw-r--r--intern/cycles/kernel/svm/svm_aov.h42
-rw-r--r--intern/cycles/kernel/svm/svm_attribute.h57
-rw-r--r--intern/cycles/kernel/svm/svm_bevel.h143
-rw-r--r--intern/cycles/kernel/svm/svm_blackbody.h7
-rw-r--r--intern/cycles/kernel/svm/svm_brick.h11
-rw-r--r--intern/cycles/kernel/svm/svm_brightness.h2
-rw-r--r--intern/cycles/kernel/svm/svm_bump.h16
-rw-r--r--intern/cycles/kernel/svm/svm_camera.h12
-rw-r--r--intern/cycles/kernel/svm/svm_checker.h5
-rw-r--r--intern/cycles/kernel/svm/svm_clamp.h17
-rw-r--r--intern/cycles/kernel/svm/svm_closure.h121
-rw-r--r--intern/cycles/kernel/svm/svm_convert.h4
-rw-r--r--intern/cycles/kernel/svm/svm_displace.h21
-rw-r--r--intern/cycles/kernel/svm/svm_fresnel.h4
-rw-r--r--intern/cycles/kernel/svm/svm_gamma.h2
-rw-r--r--intern/cycles/kernel/svm/svm_geometry.h24
-rw-r--r--intern/cycles/kernel/svm/svm_gradient.h2
-rw-r--r--intern/cycles/kernel/svm/svm_hsv.h6
-rw-r--r--intern/cycles/kernel/svm/svm_ies.h10
-rw-r--r--intern/cycles/kernel/svm/svm_image.h26
-rw-r--r--intern/cycles/kernel/svm/svm_invert.h2
-rw-r--r--intern/cycles/kernel/svm/svm_light_path.h50
-rw-r--r--intern/cycles/kernel/svm/svm_magic.h7
-rw-r--r--intern/cycles/kernel/svm/svm_map_range.h19
-rw-r--r--intern/cycles/kernel/svm/svm_mapping.h41
-rw-r--r--intern/cycles/kernel/svm/svm_math.h30
-rw-r--r--intern/cycles/kernel/svm/svm_mix.h17
-rw-r--r--intern/cycles/kernel/svm/svm_musgrave.h19
-rw-r--r--intern/cycles/kernel/svm/svm_noise.h10
-rw-r--r--intern/cycles/kernel/svm/svm_noisetex.h19
-rw-r--r--intern/cycles/kernel/svm/svm_normal.h17
-rw-r--r--intern/cycles/kernel/svm/svm_ramp.h34
-rw-r--r--intern/cycles/kernel/svm/svm_sepcomb_hsv.h34
-rw-r--r--intern/cycles/kernel/svm/svm_sky.h33
-rw-r--r--intern/cycles/kernel/svm/svm_tex_coord.h55
-rw-r--r--intern/cycles/kernel/svm/svm_types.h43
-rw-r--r--intern/cycles/kernel/svm/svm_value.h9
-rw-r--r--intern/cycles/kernel/svm/svm_vector_rotate.h10
-rw-r--r--intern/cycles/kernel/svm/svm_vector_transform.h8
-rw-r--r--intern/cycles/kernel/svm/svm_vertex_color.h48
-rw-r--r--intern/cycles/kernel/svm/svm_voronoi.h148
-rw-r--r--intern/cycles/kernel/svm/svm_voxel.h11
-rw-r--r--intern/cycles/kernel/svm/svm_wave.h9
-rw-r--r--intern/cycles/kernel/svm/svm_wavelength.h4
-rw-r--r--intern/cycles/kernel/svm/svm_white_noise.h13
-rw-r--r--intern/cycles/kernel/svm/svm_wireframe.h18
-rw-r--r--intern/cycles/render/CMakeLists.txt7
-rw-r--r--intern/cycles/render/background.cpp12
-rw-r--r--intern/cycles/render/background.h4
-rw-r--r--intern/cycles/render/bake.cpp112
-rw-r--r--intern/cycles/render/bake.h6
-rw-r--r--intern/cycles/render/buffers.cpp673
-rw-r--r--intern/cycles/render/buffers.h256
-rw-r--r--intern/cycles/render/camera.cpp19
-rw-r--r--intern/cycles/render/camera.h3
-rw-r--r--intern/cycles/render/coverage.cpp155
-rw-r--r--intern/cycles/render/coverage.h52
-rw-r--r--intern/cycles/render/denoising.cpp31
-rw-r--r--intern/cycles/render/denoising.h35
-rw-r--r--intern/cycles/render/film.cpp726
-rw-r--r--intern/cycles/render/film.h55
-rw-r--r--intern/cycles/render/geometry.cpp14
-rw-r--r--intern/cycles/render/gpu_display.cpp227
-rw-r--r--intern/cycles/render/gpu_display.h247
-rw-r--r--intern/cycles/render/graph.h15
-rw-r--r--intern/cycles/render/integrator.cpp214
-rw-r--r--intern/cycles/render/integrator.h36
-rw-r--r--intern/cycles/render/jitter.cpp6
-rw-r--r--intern/cycles/render/light.cpp140
-rw-r--r--intern/cycles/render/light.h5
-rw-r--r--intern/cycles/render/mesh_displace.cpp165
-rw-r--r--intern/cycles/render/nodes.cpp80
-rw-r--r--intern/cycles/render/nodes.h267
-rw-r--r--intern/cycles/render/object.cpp20
-rw-r--r--intern/cycles/render/osl.cpp58
-rw-r--r--intern/cycles/render/pass.cpp427
-rw-r--r--intern/cycles/render/pass.h106
-rw-r--r--intern/cycles/render/scene.cpp189
-rw-r--r--intern/cycles/render/scene.h48
-rw-r--r--intern/cycles/render/session.cpp1293
-rw-r--r--intern/cycles/render/session.h225
-rw-r--r--intern/cycles/render/shader.cpp60
-rw-r--r--intern/cycles/render/shader.h7
-rw-r--r--intern/cycles/render/stats.cpp73
-rw-r--r--intern/cycles/render/svm.cpp17
-rw-r--r--intern/cycles/render/svm.h3
-rw-r--r--intern/cycles/render/tile.cpp934
-rw-r--r--intern/cycles/render/tile.h236
-rw-r--r--intern/cycles/test/CMakeLists.txt5
-rw-r--r--intern/cycles/test/integrator_adaptive_sampling_test.cpp116
-rw-r--r--intern/cycles/test/integrator_render_scheduler_test.cpp37
-rw-r--r--intern/cycles/test/integrator_tile_test.cpp47
-rw-r--r--intern/cycles/test/render_graph_finalize_test.cpp2
-rw-r--r--intern/cycles/test/util_math_test.cpp61
-rw-r--r--intern/cycles/test/util_string_test.cpp36
-rw-r--r--intern/cycles/util/util_atomic.h50
-rw-r--r--intern/cycles/util/util_debug.cpp83
-rw-r--r--intern/cycles/util/util_debug.h67
-rw-r--r--intern/cycles/util/util_defines.h4
-rw-r--r--intern/cycles/util/util_half.h46
-rw-r--r--intern/cycles/util/util_logging.h1
-rw-r--r--intern/cycles/util/util_math.h97
-rw-r--r--intern/cycles/util/util_math_float2.h5
-rw-r--r--intern/cycles/util/util_math_float3.h128
-rw-r--r--intern/cycles/util/util_math_float4.h145
-rw-r--r--intern/cycles/util/util_math_int2.h4
-rw-r--r--intern/cycles/util/util_math_int3.h40
-rw-r--r--intern/cycles/util/util_path.cpp184
-rw-r--r--intern/cycles/util/util_path.h8
-rw-r--r--intern/cycles/util/util_profiling.cpp8
-rw-r--r--intern/cycles/util/util_profiling.h106
-rw-r--r--intern/cycles/util/util_progress.h22
-rw-r--r--intern/cycles/util/util_simd.h14
-rw-r--r--intern/cycles/util/util_static_assert.h4
-rw-r--r--intern/cycles/util/util_string.cpp36
-rw-r--r--intern/cycles/util/util_string.h12
-rw-r--r--intern/cycles/util/util_system.cpp9
-rw-r--r--intern/cycles/util/util_system.h3
-rw-r--r--intern/cycles/util/util_tbb.h1
-rw-r--r--intern/cycles/util/util_texture.h2
-rw-r--r--intern/cycles/util/util_transform.h34
-rw-r--r--intern/cycles/util/util_types.h10
-rw-r--r--intern/cycles/util/util_unique_ptr.h1
-rw-r--r--release/scripts/modules/rna_manual_reference.py2
-rw-r--r--release/scripts/presets/cycles/sampling/Final.py24
-rw-r--r--release/scripts/presets/cycles/sampling/Preview.py24
-rw-r--r--release/scripts/presets/cycles/viewport_sampling/Final.py11
-rw-r--r--release/scripts/presets/cycles/viewport_sampling/Preview.py11
-rw-r--r--release/scripts/startup/bl_ui/properties_view_layer.py2
-rw-r--r--source/blender/blenkernel/BKE_blender_version.h2
-rw-r--r--source/blender/blenkernel/intern/layer.c1
-rw-r--r--source/blender/blenloader/intern/versioning_270.c16
-rw-r--r--source/blender/blenloader/intern/versioning_280.c2
-rw-r--r--source/blender/blenloader/intern/versioning_290.c1
-rw-r--r--source/blender/blenloader/intern/versioning_300.c33
-rw-r--r--source/blender/blenloader/intern/versioning_cycles.c26
-rw-r--r--source/blender/blenloader/intern/versioning_defaults.c11
-rw-r--r--source/blender/compositor/nodes/COM_IDMaskNode.cc4
-rw-r--r--source/blender/compositor/nodes/COM_ZCombineNode.cc4
-rw-r--r--source/blender/draw/DRW_engine.h3
-rw-r--r--source/blender/draw/engines/eevee/eevee_cryptomatte.c10
-rw-r--r--source/blender/draw/engines/eevee/eevee_engine.c2
-rw-r--r--source/blender/draw/engines/eevee/eevee_private.h1
-rw-r--r--source/blender/draw/engines/external/external_engine.c202
-rw-r--r--source/blender/draw/engines/external/external_engine.h8
-rw-r--r--source/blender/draw/engines/select/select_engine.c2
-rw-r--r--source/blender/draw/engines/workbench/workbench_engine.c2
-rw-r--r--source/blender/draw/intern/DRW_render.h1
-rw-r--r--source/blender/draw/intern/draw_manager.c78
-rw-r--r--source/blender/draw/intern/draw_manager_exec.c5
-rw-r--r--source/blender/editors/object/object_bake_api.c18
-rw-r--r--source/blender/editors/render/render_preview.c9
-rw-r--r--source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp12
-rw-r--r--source/blender/gpu/GPU_material.h5
-rw-r--r--source/blender/gpu/intern/gpu_material.c114
-rw-r--r--source/blender/gpu/intern/gpu_material_library.h2
-rw-r--r--source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl4
-rw-r--r--source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl14
-rw-r--r--source/blender/makesdna/DNA_layer_types.h2
-rw-r--r--source/blender/makesdna/DNA_node_types.h12
-rw-r--r--source/blender/makesdna/DNA_scene_defaults.h2
-rw-r--r--source/blender/makesdna/DNA_scene_types.h19
-rw-r--r--source/blender/makesrna/intern/rna_nodetree.c39
-rw-r--r--source/blender/makesrna/intern/rna_render.c98
-rw-r--r--source/blender/makesrna/intern/rna_scene.c63
-rw-r--r--source/blender/nodes/composite/nodes/node_composite_image.c45
-rw-r--r--source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c29
-rw-r--r--source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c30
-rw-r--r--source/blender/render/CMakeLists.txt1
-rw-r--r--source/blender/render/RE_engine.h46
-rw-r--r--source/blender/render/RE_pipeline.h3
-rw-r--r--source/blender/render/intern/bake.c19
-rw-r--r--source/blender/render/intern/engine.c282
-rw-r--r--source/blender/render/intern/initrender.c91
-rw-r--r--source/blender/render/intern/initrender.h38
-rw-r--r--source/blender/render/intern/pipeline.c64
-rw-r--r--source/blender/render/intern/render_result.c253
-rw-r--r--source/blender/render/intern/render_result.h18
-rw-r--r--source/blender/render/intern/render_types.h30
-rw-r--r--source/blender/windowmanager/intern/wm_window.c13
-rw-r--r--tests/performance/tests/cycles.py24
-rw-r--r--tests/python/CMakeLists.txt1
544 files changed, 34049 insertions, 43427 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47712f0ac1e..8e807b84e22 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -403,7 +403,7 @@ option(WITH_CYCLES_CUDA_BINARIES "Build Cycles CUDA binaries" OFF)
option(WITH_CYCLES_CUBIN_COMPILER "Build cubins with nvrtc based compiler instead of nvcc" OFF)
option(WITH_CYCLES_CUDA_BUILD_SERIAL "Build cubins one after another (useful on machines with limited RAM)" OFF)
mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
-set(CYCLES_TEST_DEVICES CPU CACHE STRING "Run regression tests on the specified device types (CPU CUDA OPTIX OPENCL)" )
+set(CYCLES_TEST_DEVICES CPU CACHE STRING "Run regression tests on the specified device types (CPU CUDA OPTIX)" )
set(CYCLES_CUDA_BINARIES_ARCH sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 sm_70 sm_75 sm_86 compute_75 CACHE STRING "CUDA architectures to build binaries for")
mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
unset(PLATFORM_DEFAULT)
@@ -418,12 +418,8 @@ mark_as_advanced(WITH_CYCLES_DEBUG_NAN)
mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
-option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" OFF)
-option(WITH_CYCLES_DEVICE_OPENCL "Enable Cycles OpenCL compute support" ON)
-option(WITH_CYCLES_NETWORK "Enable Cycles compute over network support (EXPERIMENTAL and unfinished)" OFF)
+option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
-mark_as_advanced(WITH_CYCLES_DEVICE_OPENCL)
-mark_as_advanced(WITH_CYCLES_NETWORK)
option(WITH_CUDA_DYNLOAD "Dynamically load CUDA libraries at runtime" ON)
mark_as_advanced(WITH_CUDA_DYNLOAD)
diff --git a/build_files/cmake/Modules/FindOptiX.cmake b/build_files/cmake/Modules/FindOptiX.cmake
index cfcdd9cd23b..67106740f57 100644
--- a/build_files/cmake/Modules/FindOptiX.cmake
+++ b/build_files/cmake/Modules/FindOptiX.cmake
@@ -33,11 +33,23 @@ FIND_PATH(OPTIX_INCLUDE_DIR
include
)
+IF(EXISTS "${OPTIX_INCLUDE_DIR}/optix.h")
+ FILE(STRINGS "${OPTIX_INCLUDE_DIR}/optix.h" _optix_version REGEX "^#define OPTIX_VERSION[ \t].*$")
+ STRING(REGEX MATCHALL "[0-9]+" _optix_version ${_optix_version})
+
+ MATH(EXPR _optix_version_major "${_optix_version} / 10000")
+ MATH(EXPR _optix_version_minor "(${_optix_version} % 10000) / 100")
+ MATH(EXPR _optix_version_patch "${_optix_version} % 100")
+
+ SET(OPTIX_VERSION "${_optix_version_major}.${_optix_version_minor}.${_optix_version_patch}")
+ENDIF()
+
# handle the QUIETLY and REQUIRED arguments and set OPTIX_FOUND to TRUE if
# all listed variables are TRUE
INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(OptiX DEFAULT_MSG
- OPTIX_INCLUDE_DIR)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OptiX
+ REQUIRED_VARS OPTIX_INCLUDE_DIR
+ VERSION_VAR OPTIX_VERSION)
IF(OPTIX_FOUND)
SET(OPTIX_INCLUDE_DIRS ${OPTIX_INCLUDE_DIR})
@@ -45,6 +57,7 @@ ENDIF()
MARK_AS_ADVANCED(
OPTIX_INCLUDE_DIR
+ OPTIX_VERSION
)
UNSET(_optix_SEARCH_DIRS)
diff --git a/build_files/config/pipeline_config.yaml b/build_files/config/pipeline_config.yaml
index 5d1a24a30f1..8222f2ff0b9 100644
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -55,7 +55,7 @@ buildbot:
cuda11:
version: '11.4.1'
optix:
- version: '7.1.0'
+ version: '7.3.0'
cmake:
default:
version: any
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h
index 0fa0f1291fa..a2142b8f2ba 100644
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -645,7 +645,8 @@ typedef enum CUdevice_P2PAttribute_enum {
CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01,
CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02,
CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03,
- CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED = 0x04,
+ CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04,
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04,
} CUdevice_P2PAttribute;
typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void* userData);
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 381248e9bf1..17096d441f0 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -247,7 +247,7 @@ if(WITH_CYCLES_OSL)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
- find_package(OptiX)
+ find_package(OptiX 7.3.0)
if(OPTIX_FOUND)
add_definitions(-DWITH_OPTIX)
@@ -286,11 +286,17 @@ if(WITH_OPENSUBDIV)
)
endif()
+if(WITH_OPENIMAGEDENOISE)
+ add_definitions(-DWITH_OPENIMAGEDENOISE)
+ add_definitions(-DOIDN_STATIC_LIB)
+ include_directories(
+ SYSTEM
+ ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+ )
+endif()
+
if(WITH_CYCLES_STANDALONE)
- set(WITH_CYCLES_DEVICE_OPENCL TRUE)
set(WITH_CYCLES_DEVICE_CUDA TRUE)
- # Experimental and unfinished.
- set(WITH_CYCLES_NETWORK FALSE)
endif()
# TODO(sergey): Consider removing it, only causes confusion in interface.
set(WITH_CYCLES_DEVICE_MULTI TRUE)
@@ -386,18 +392,12 @@ if(WITH_CYCLES_BLENDER)
add_subdirectory(blender)
endif()
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-
-if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
- add_subdirectory(app)
-endif()
-
+add_subdirectory(app)
add_subdirectory(bvh)
add_subdirectory(device)
add_subdirectory(doc)
add_subdirectory(graph)
+add_subdirectory(integrator)
add_subdirectory(kernel)
add_subdirectory(render)
add_subdirectory(subd)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 7a1e5d62dd2..f9dc5f00802 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -91,24 +91,6 @@ if(WITH_CYCLES_STANDALONE)
endif()
#####################################################################
-# Cycles network server executable
-#####################################################################
-
-if(WITH_CYCLES_NETWORK)
- set(SRC
- cycles_server.cpp
- )
- add_executable(cycles_server ${SRC})
- target_link_libraries(cycles_server ${LIBRARIES})
- cycles_target_link_libraries(cycles_server)
-
- if(UNIX AND NOT APPLE)
- set_target_properties(cycles_server PROPERTIES INSTALL_RPATH $ORIGIN/lib)
- endif()
- unset(SRC)
-endif()
-
-#####################################################################
# Cycles cubin compiler executable
#####################################################################
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 6b3513b065a..270096d70b0 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -126,7 +126,7 @@ static BufferParams &session_buffer_params()
static void scene_init()
{
- options.scene = new Scene(options.scene_params, options.session->device);
+ options.scene = options.session->scene;
/* Read XML */
xml_read_file(options.scene, options.filepath.c_str());
@@ -148,7 +148,7 @@ static void scene_init()
static void session_init()
{
options.session_params.write_render_cb = write_render;
- options.session = new Session(options.session_params);
+ options.session = new Session(options.session_params, options.scene_params);
if (options.session_params.background && !options.quiet)
options.session->progress.set_update_callback(function_bind(&session_print_status));
@@ -159,7 +159,6 @@ static void session_init()
/* load scene */
scene_init();
- options.session->scene = options.scene;
options.session->reset(session_buffer_params(), options.session_params.samples);
options.session->start();
@@ -527,9 +526,6 @@ static void options_parse(int argc, const char **argv)
fprintf(stderr, "No file path specified\n");
exit(EXIT_FAILURE);
}
-
- /* For smoother Viewport */
- options.session_params.start_resolution = 64;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 276d850f1b3..54f97fddbd9 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -703,7 +703,7 @@ void xml_read_file(Scene *scene, const char *filepath)
xml_read_include(state, path_filename(filepath));
- scene->params.bvh_type = SceneParams::BVH_STATIC;
+ scene->params.bvh_type = BVH_TYPE_STATIC;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index ee5c6157338..5bdcfd56a4d 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -33,6 +33,7 @@ set(SRC
blender_device.cpp
blender_image.cpp
blender_geometry.cpp
+ blender_gpu_display.cpp
blender_light.cpp
blender_mesh.cpp
blender_object.cpp
@@ -50,6 +51,7 @@ set(SRC
CCL_api.h
blender_device.h
+ blender_gpu_display.h
blender_id_map.h
blender_image.h
blender_object_cull.h
@@ -93,14 +95,6 @@ set(ADDON_FILES
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_DEVICE_OPENCL)
- add_definitions(-DWITH_OPENCL)
-endif()
-
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-
if(WITH_MOD_FLUID)
add_definitions(-DWITH_FLUID)
endif()
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index f728050a3cf..1ce25a253f9 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -58,7 +58,6 @@ class CyclesRender(bpy.types.RenderEngine):
bl_use_eevee_viewport = True
bl_use_preview = True
bl_use_exclude_layers = True
- bl_use_save_buffers = True
bl_use_spherical_stereo = True
bl_use_custom_freestyle = True
bl_use_alembic_procedural = True
@@ -85,6 +84,12 @@ class CyclesRender(bpy.types.RenderEngine):
def render(self, depsgraph):
engine.render(self, depsgraph)
+ def render_frame_finish(self):
+ engine.render_frame_finish(self)
+
+ def draw(self, context, depsgraph):
+ engine.draw(self, depsgraph, context.space_data)
+
def bake(self, depsgraph, obj, pass_type, pass_filter, width, height):
engine.bake(self, depsgraph, obj, pass_type, pass_filter, width, height)
@@ -98,7 +103,7 @@ class CyclesRender(bpy.types.RenderEngine):
engine.sync(self, depsgraph, context.blend_data)
def view_draw(self, context, depsgraph):
- engine.draw(self, depsgraph, context.region, context.space_data, context.region_data)
+ engine.view_draw(self, depsgraph, context.region, context.space_data, context.region_data)
def update_script_node(self, node):
if engine.with_osl():
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 489a883f098..e0e8ca10bef 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -18,62 +18,17 @@
from __future__ import annotations
-def _is_using_buggy_driver():
- import gpu
- # We need to be conservative here because in multi-GPU systems display card
- # might be quite old, but others one might be just good.
- #
- # So We shouldn't disable possible good dedicated cards just because display
- # card seems weak. And instead we only blacklist configurations which are
- # proven to cause problems.
- if gpu.platform.vendor_get() == "ATI Technologies Inc.":
- import re
- version = gpu.platform.version_get()
- if version.endswith("Compatibility Profile Context"):
- # Old HD 4xxx and 5xxx series drivers did not have driver version
- # in the version string, but those cards do not quite work and
- # causing crashes.
- return True
- regex = re.compile(".*Compatibility Profile Context ([0-9]+(\\.[0-9]+)+)$")
- if not regex.match(version):
- # Skip cards like FireGL
- return False
- version = regex.sub("\\1", version).split('.')
- return int(version[0]) == 8
- return False
-
-
-def _workaround_buggy_drivers():
- if _is_using_buggy_driver():
- import _cycles
- if hasattr(_cycles, "opencl_disable"):
- print("Cycles: OpenGL driver known to be buggy, disabling OpenCL platform.")
- _cycles.opencl_disable()
-
-
def _configure_argument_parser():
import argparse
# No help because it conflicts with general Python scripts argument parsing
parser = argparse.ArgumentParser(description="Cycles Addon argument parser",
add_help=False)
- parser.add_argument("--cycles-resumable-num-chunks",
- help="Number of chunks to split sample range into",
- default=None)
- parser.add_argument("--cycles-resumable-current-chunk",
- help="Current chunk of samples range to render",
- default=None)
- parser.add_argument("--cycles-resumable-start-chunk",
- help="Start chunk to render",
- default=None)
- parser.add_argument("--cycles-resumable-end-chunk",
- help="End chunk to render",
- default=None)
parser.add_argument("--cycles-print-stats",
help="Print rendering statistics to stderr",
action='store_true')
parser.add_argument("--cycles-device",
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
- "Valid options are 'CPU', 'CUDA', 'OPTIX' or 'OPENCL'."
+ "Valid options are 'CPU', 'CUDA' or 'OPTIX'."
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
default=None)
return parser
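(For reference, these addon-level flags are parsed from the arguments following Blender's "--" separator; a hypothetical headless invocation, with the "+CPU" suffix for hybrid rendering as described in the help text above, might look like:

blender -b scene.blend -f 1 -- --cycles-print-stats --cycles-device OPTIX+CPU

where scene.blend is a placeholder path.)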
@@ -89,21 +44,6 @@ def _parse_command_line():
parser = _configure_argument_parser()
args, _ = parser.parse_known_args(argv[argv.index("--") + 1:])
- if args.cycles_resumable_num_chunks is not None:
- if args.cycles_resumable_current_chunk is not None:
- import _cycles
- _cycles.set_resumable_chunk(
- int(args.cycles_resumable_num_chunks),
- int(args.cycles_resumable_current_chunk),
- )
- elif args.cycles_resumable_start_chunk is not None and \
- args.cycles_resumable_end_chunk:
- import _cycles
- _cycles.set_resumable_chunk_range(
- int(args.cycles_resumable_num_chunks),
- int(args.cycles_resumable_start_chunk),
- int(args.cycles_resumable_end_chunk),
- )
if args.cycles_print_stats:
import _cycles
_cycles.enable_print_stats()
@@ -118,23 +58,11 @@ def init():
import _cycles
import os.path
- # Workaround possibly buggy legacy drivers which crashes on the OpenCL
- # device enumeration.
- #
- # This checks are not really correct because they might still fail
- # in the case of multiple GPUs. However, currently buggy drivers
- # are really old and likely to be used in single GPU systems only
- # anyway.
- #
- # Can't do it in the background mode, so we hope OpenCL is no enabled
- # in the user preferences.
- if not bpy.app.background:
- _workaround_buggy_drivers()
-
path = os.path.dirname(__file__)
user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', path='')))
+ temp_path = bpy.app.tempdir
- _cycles.init(path, user_path, bpy.app.background)
+ _cycles.init(path, user_path, temp_path, bpy.app.background)
_parse_command_line()
@@ -177,6 +105,25 @@ def render(engine, depsgraph):
_cycles.render(engine.session, depsgraph.as_pointer())
+def render_frame_finish(engine):
+ if not engine.session:
+ return
+
+ import _cycles
+ _cycles.render_frame_finish(engine.session)
+
+def draw(engine, depsgraph, space_image):
+ if not engine.session:
+ return
+
+ depsgraph_ptr = depsgraph.as_pointer()
+ space_image_ptr = space_image.as_pointer()
+ screen_ptr = space_image.id_data.as_pointer()
+
+ import _cycles
+ _cycles.draw(engine.session, depsgraph_ptr, screen_ptr, space_image_ptr)
+
+
def bake(engine, depsgraph, obj, pass_type, pass_filter, width, height):
import _cycles
session = getattr(engine, "session", None)
@@ -204,14 +151,14 @@ def sync(engine, depsgraph, data):
_cycles.sync(engine.session, depsgraph.as_pointer())
-def draw(engine, depsgraph, region, v3d, rv3d):
+def view_draw(engine, depsgraph, region, v3d, rv3d):
import _cycles
depsgraph = depsgraph.as_pointer()
v3d = v3d.as_pointer()
rv3d = rv3d.as_pointer()
# draw render image
- _cycles.draw(engine.session, depsgraph, v3d, rv3d)
+ _cycles.view_draw(engine.session, depsgraph, v3d, rv3d)
def available_devices():
@@ -224,11 +171,6 @@ def with_osl():
return _cycles.with_osl
-def with_network():
- import _cycles
- return _cycles.with_network
-
-
def system_info():
import _cycles
return _cycles.system_info()
@@ -243,6 +185,7 @@ def list_render_passes(scene, srl):
# Data passes.
if srl.use_pass_z: yield ("Depth", "Z", 'VALUE')
if srl.use_pass_mist: yield ("Mist", "Z", 'VALUE')
+ if srl.use_pass_position: yield ("Position", "XYZ", 'VECTOR')
if srl.use_pass_normal: yield ("Normal", "XYZ", 'VECTOR')
if srl.use_pass_vector: yield ("Vector", "XYZW", 'VECTOR')
if srl.use_pass_uv: yield ("UV", "UVA", 'VECTOR')
@@ -265,6 +208,7 @@ def list_render_passes(scene, srl):
if srl.use_pass_environment: yield ("Env", "RGB", 'COLOR')
if srl.use_pass_shadow: yield ("Shadow", "RGB", 'COLOR')
if srl.use_pass_ambient_occlusion: yield ("AO", "RGB", 'COLOR')
+ if crl.use_pass_shadow_catcher: yield ("Shadow Catcher", "RGB", 'COLOR')
# Debug passes.
if crl.pass_debug_render_time: yield ("Debug Render Time", "X", 'VALUE')
@@ -283,30 +227,20 @@ def list_render_passes(scene, srl):
yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR')
# Denoising passes.
- if (scene.cycles.use_denoising and crl.use_denoising) or crl.denoising_store_passes:
+ if scene.cycles.use_denoising and crl.use_denoising:
yield ("Noisy Image", "RGBA", 'COLOR')
- if crl.denoising_store_passes:
- yield ("Denoising Normal", "XYZ", 'VECTOR')
- yield ("Denoising Albedo", "RGB", 'COLOR')
- yield ("Denoising Depth", "Z", 'VALUE')
-
- if scene.cycles.denoiser == 'NLM':
- yield ("Denoising Shadowing", "X", 'VALUE')
- yield ("Denoising Variance", "RGB", 'COLOR')
- yield ("Denoising Intensity", "X", 'VALUE')
-
- clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
- "denoising_glossy_direct", "denoising_glossy_indirect",
- "denoising_transmission_direct", "denoising_transmission_indirect")
- if any(getattr(crl, option) for option in clean_options):
- yield ("Denoising Clean", "RGB", 'COLOR')
+ if crl.use_pass_shadow_catcher:
+ yield ("Noisy Shadow Catcher", "RGBA", 'COLOR')
+ if crl.denoising_store_passes:
+ yield ("Denoising Normal", "XYZ", 'VECTOR')
+ yield ("Denoising Albedo", "RGB", 'COLOR')
# Custom AOV passes.
for aov in srl.aovs:
if aov.type == 'VALUE':
yield (aov.name, "X", 'VALUE')
else:
- yield (aov.name, "RGBA", 'COLOR')
+ yield (aov.name, "RGB", 'COLOR')
def register_passes(engine, scene, view_layer):
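(The body of register_passes is truncated in this excerpt. As a hedged sketch of how the (name, channel ids, channel type) tuples yielded by list_render_passes are typically consumed via the standard RenderEngine.register_pass API; this is illustrative, not necessarily the exact body used here:

def register_passes_sketch(engine, scene, view_layer):
    # Each yielded tuple is (pass name, channel id string, channel type).
    for name, channel_ids, channel_type in list_render_passes(scene, view_layer):
        engine.register_pass(scene, view_layer, name,
                             len(channel_ids), channel_ids, channel_type)
)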
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index bf33e5dc010..37c39904e30 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -60,32 +60,48 @@ class AddPresetSampling(AddPresetBase, Operator):
]
preset_values = [
+ "cycles.use_adaptive_sampling",
"cycles.samples",
- "cycles.preview_samples",
- "cycles.aa_samples",
- "cycles.preview_aa_samples",
- "cycles.diffuse_samples",
- "cycles.glossy_samples",
- "cycles.transmission_samples",
- "cycles.ao_samples",
- "cycles.mesh_light_samples",
- "cycles.subsurface_samples",
- "cycles.volume_samples",
- "cycles.use_square_samples",
- "cycles.progressive",
- "cycles.seed",
- "cycles.sample_clamp_direct",
- "cycles.sample_clamp_indirect",
- "cycles.sample_all_lights_direct",
- "cycles.sample_all_lights_indirect",
+ "cycles.adaptive_threshold",
+ "cycles.adaptive_min_samples",
+ "cycles.time_limit",
+ "cycles.use_denoising",
+ "cycles.denoiser",
+ "cycles.denoising_input_passes",
+ "cycles.denoising_prefilter",
]
preset_subdir = "cycles/sampling"
+class AddPresetViewportSampling(AddPresetBase, Operator):
+ '''Add a Viewport Sampling Preset'''
+ bl_idname = "render.cycles_viewport_sampling_preset_add"
+ bl_label = "Add Viewport Sampling Preset"
+ preset_menu = "CYCLES_PT_viewport_sampling_presets"
+
+ preset_defines = [
+ "cycles = bpy.context.scene.cycles"
+ ]
+
+ preset_values = [
+ "cycles.use_preview_adaptive_sampling",
+ "cycles.preview_samples",
+ "cycles.preview_adaptive_threshold",
+ "cycles.preview_adaptive_min_samples",
+ "cycles.use_preview_denoising",
+ "cycles.preview_denoiser",
+ "cycles.preview_denoising_input_passes",
+ "cycles.preview_denoising_prefilter",
+ "cycles.preview_denoising_start_sample",
+ ]
+
+ preset_subdir = "cycles/viewport_sampling"
+
classes = (
AddPresetIntegrator,
AddPresetSampling,
+ AddPresetViewportSampling,
)
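(The presets saved by these operators are small Python files; the diffstat above lists release/scripts/presets/cycles/viewport_sampling/Final.py and Preview.py. A minimal sketch of what such a viewport preset might contain, with illustrative values, assuming the usual AddPresetBase output format built from preset_defines and preset_values:

import bpy
cycles = bpy.context.scene.cycles

cycles.use_preview_adaptive_sampling = True
cycles.preview_samples = 1024
cycles.preview_adaptive_threshold = 0.1
cycles.preview_adaptive_min_samples = 0
cycles.use_preview_denoising = False
cycles.preview_denoising_input_passes = 'RGB_ALBEDO'
cycles.preview_denoising_start_sample = 1
)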
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 0c3af3fabeb..c2570e71efd 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -39,11 +39,6 @@ enum_devices = (
('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in the system tab in the user preferences"),
)
-from _cycles import with_network
-if with_network:
- enum_devices += (('NETWORK', "Networked Device", "Use networked device for rendering"),)
-del with_network
-
enum_feature_set = (
('SUPPORTED', "Supported", "Only use finished and supported features"),
('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1),
@@ -84,15 +79,6 @@ enum_curve_shape = (
('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"),
)
-enum_tile_order = (
- ('CENTER', "Center", "Render from center to the edges"),
- ('RIGHT_TO_LEFT', "Right to Left", "Render from right to left"),
- ('LEFT_TO_RIGHT', "Left to Right", "Render from left to right"),
- ('TOP_TO_BOTTOM', "Top to Bottom", "Render from top to bottom"),
- ('BOTTOM_TO_TOP', "Bottom to Top", "Render from bottom to top"),
- ('HILBERT_SPIRAL', "Hilbert Spiral", "Render in a Hilbert Spiral"),
-)
-
enum_use_layer_samples = (
('USE', "Use", "Per render layer number of samples override scene samples"),
('BOUNDED', "Bounded", "Bound per render layer number of samples by global samples"),
@@ -101,15 +87,9 @@ enum_use_layer_samples = (
enum_sampling_pattern = (
('SOBOL', "Sobol", "Use Sobol random sampling pattern"),
- ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"),
('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern"),
)
-enum_integrator = (
- ('BRANCHED_PATH', "Branched Path Tracing", "Path tracing integrator that branches on the first bounce, giving more control over the number of light and material samples"),
- ('PATH', "Path Tracing", "Pure path tracing integrator"),
-)
-
enum_volume_sampling = (
('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"),
('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"),
@@ -131,7 +111,6 @@ enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
('OPTIX', "OptiX", "OptiX", 3),
- ('OPENCL', "OpenCL", "OpenCL", 2)
)
enum_texture_limit = (
@@ -144,39 +123,46 @@ enum_texture_limit = (
('4096', "4096", "Limit texture size to 4096 pixels", 6),
('8192', "8192", "Limit texture size to 8192 pixels", 7),
)
-
+
+# NOTE: Identifiers are expected to be an upper case version of identifiers from `Pass::get_type_enum()`
enum_view3d_shading_render_pass = (
('', "General", ""),
- ('COMBINED', "Combined", "Show the Combined Render pass", 1),
- ('EMISSION', "Emission", "Show the Emission render pass", 33),
- ('BACKGROUND', "Background", "Show the Background render pass", 34),
- ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass", 35),
+ ('COMBINED', "Combined", "Show the Combined Render pass"),
+ ('EMISSION', "Emission", "Show the Emission render pass"),
+ ('BACKGROUND', "Background", "Show the Background render pass"),
+ ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass"),
+ ('SHADOW', "Shadow", "Show the Shadow render pass"),
+ ('SHADOW_CATCHER', "Shadow Catcher", "Show the Shadow Catcher render pass"),
('', "Light", ""),
- ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass", 38),
- ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass", 39),
- ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass", 40),
+ ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass"),
+ ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass"),
+ ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass"),
- ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass", 41),
- ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass", 42),
- ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass", 43),
+ ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass"),
+ ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass"),
+ ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass"),
('', "", ""),
- ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass", 44),
- ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass", 45),
- ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass", 46),
+ ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass"),
+ ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass"),
+ ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass"),
- ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass", 50),
- ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass", 51),
+ ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass"),
+ ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass"),
('', "Data", ""),
- ('NORMAL', "Normal", "Show the Normal render pass", 3),
- ('UV', "UV", "Show the UV render pass", 4),
- ('MIST', "Mist", "Show the Mist render pass", 32),
+ ('POSITION', "Position", "Show the Position render pass"),
+ ('NORMAL', "Normal", "Show the Normal render pass"),
+ ('UV', "UV", "Show the UV render pass"),
+ ('MIST', "Mist", "Show the Mist render pass"),
+ ('DENOISING_ALBEDO', "Denoising Albedo", "Albedo pass used by denoiser"),
+ ('DENOISING_NORMAL', "Denoising Normal", "Normal pass used by denoiser"),
+ ('SAMPLE_COUNT', "Sample Count", "Per-pixel number of samples"),
)
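The pass identifiers above now map by name to `Pass::get_type_enum()` instead of carrying hard-coded integer values. A hypothetical usage sketch (not part of this patch) that switches rendered viewports to one of the new passes through the `shading.cycles.render_pass` property registered by this addon:

import bpy

# Illustration only: show the new Shadow Catcher pass in every rendered
# 3D viewport of the current screen.
for area in bpy.context.screen.areas:
    if area.type == 'VIEW_3D':
        shading = area.spaces.active.shading
        if shading.type == 'RENDERED':
            shading.cycles.render_pass = 'SHADOW_CATCHER'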
@@ -208,18 +194,23 @@ def enum_preview_denoiser(self, context):
def enum_denoiser(self, context):
- items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)]
+ items = []
items += enum_optix_denoiser(self, context)
items += enum_openimagedenoise_denoiser(self, context)
return items
enum_denoising_input_passes = (
- ('RGB', "Color", "Use only color as input", 1),
- ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2),
- ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3),
+ ('RGB', "None", "Don't use utility passes for denoising", 1),
+ ('RGB_ALBEDO', "Albedo", "Use albedo pass for denoising", 2),
+ ('RGB_ALBEDO_NORMAL', "Albedo and Normal", "Use albedo and normal passes for denoising", 3),
)
+enum_denoising_prefilter = (
+ ('NONE', "None", "No prefiltering, use when guiding passes are noise-free", 1),
+ ('FAST', "Fast", "Denoise color and guiding passes together. Improves quality when guiding passes are noisy, using the least amount of extra processing time", 2),
+ ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy, using extra processing time", 3),
+)
def update_render_passes(self, context):
scene = context.scene
@@ -252,13 +243,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
description="Use Open Shading Language (CPU rendering only)",
)
- progressive: EnumProperty(
- name="Integrator",
- description="Method to sample lights and materials",
- items=enum_integrator,
- default='PATH',
- )
-
preview_pause: BoolProperty(
name="Pause Preview",
description="Pause all viewport preview renders",
@@ -268,110 +252,88 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
use_denoising: BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
- default=False,
+ default=True,
update=update_render_passes,
)
- use_preview_denoising: BoolProperty(
- name="Use Viewport Denoising",
- description="Denoise the image in the 3D viewport",
- default=False,
- )
-
denoiser: EnumProperty(
name="Denoiser",
description="Denoise the image with the selected denoiser. "
- "For denoising the image after rendering, denoising data render passes "
- "also adapt to the selected denoiser",
+ "For denoising the image after rendering",
items=enum_denoiser,
- default=1,
+ default=4, # Use integer to avoid error in builds without OpenImageDenoise.
update=update_render_passes,
)
+ denoising_prefilter: EnumProperty(
+ name="Denoising Prefilter",
+ description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser",
+ items=enum_denoising_prefilter,
+ default='ACCURATE',
+ )
+ denoising_input_passes: EnumProperty(
+ name="Denoising Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO_NORMAL',
+ )
+
+ use_preview_denoising: BoolProperty(
+ name="Use Viewport Denoising",
+ description="Denoise the image in the 3D viewport",
+ default=False,
+ )
preview_denoiser: EnumProperty(
name="Viewport Denoiser",
description="Denoise the image after each preview update with the selected denoiser",
items=enum_preview_denoiser,
default=0,
)
-
- use_square_samples: BoolProperty(
- name="Square Samples",
- description="Square sampling values for easier artist control",
- default=False,
+ preview_denoising_prefilter: EnumProperty(
+ name="Viewport Denoising Prefilter",
+ description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser",
+ items=enum_denoising_prefilter,
+ default='FAST',
+ )
+ preview_denoising_input_passes: EnumProperty(
+ name="Viewport Denoising Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO',
+ )
+ preview_denoising_start_sample: IntProperty(
+ name="Start Denoising",
+ description="Sample to start denoising the preview at",
+ min=0, max=(1 << 24),
+ default=1,
)
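Render and viewport denoising are now configured through separate sets of properties. A minimal scripting sketch using the property names defined above (values chosen only for illustration):

import bpy

cycles = bpy.context.scene.cycles

# Final render: guide the denoiser with albedo and normal passes,
# prefiltering them as in the defaults declared above.
cycles.use_denoising = True
cycles.denoising_input_passes = 'RGB_ALBEDO_NORMAL'
cycles.denoising_prefilter = 'ACCURATE'

# Viewport: cheaper settings, denoising from the first sample onwards.
cycles.use_preview_denoising = True
cycles.preview_denoising_input_passes = 'RGB_ALBEDO'
cycles.preview_denoising_prefilter = 'FAST'
cycles.preview_denoising_start_sample = 1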
samples: IntProperty(
name="Samples",
description="Number of samples to render for each pixel",
min=1, max=(1 << 24),
- default=128,
+ default=4096,
)
preview_samples: IntProperty(
name="Viewport Samples",
description="Number of samples to render in the viewport, unlimited if 0",
min=0, max=(1 << 24),
- default=32,
- )
- aa_samples: IntProperty(
- name="AA Samples",
- description="Number of antialiasing samples to render for each pixel",
- min=1, max=2097151,
- default=128,
- )
- preview_aa_samples: IntProperty(
- name="AA Samples",
- description="Number of antialiasing samples to render in the viewport, unlimited if 0",
- min=0, max=2097151,
- default=32,
+ default=1024,
)
- diffuse_samples: IntProperty(
- name="Diffuse Samples",
- description="Number of diffuse bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- glossy_samples: IntProperty(
- name="Glossy Samples",
- description="Number of glossy bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- transmission_samples: IntProperty(
- name="Transmission Samples",
- description="Number of transmission bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- ao_samples: IntProperty(
- name="Ambient Occlusion Samples",
- description="Number of ambient occlusion samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- mesh_light_samples: IntProperty(
- name="Mesh Light Samples",
- description="Number of mesh emission light samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- subsurface_samples: IntProperty(
- name="Subsurface Samples",
- description="Number of subsurface scattering samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- volume_samples: IntProperty(
- name="Volume Samples",
- description="Number of volume scattering samples to render for each AA sample",
- min=1, max=1024,
- default=1,
+ time_limit: FloatProperty(
+ name="Time Limit",
+ description="Limit the render time (excluding synchronization time)."
+ "Zero disables the limit",
+ min=0.0,
+ default=0.0,
+ step=100.0,
+ unit='TIME_ABSOLUTE',
)
sampling_pattern: EnumProperty(
name="Sampling Pattern",
description="Random sampling pattern used by the integrator",
items=enum_sampling_pattern,
- default='SOBOL',
+ default='PROGRESSIVE_MUTI_JITTER',
)
use_layer_samples: EnumProperty(
@@ -381,17 +343,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default='USE',
)
- sample_all_lights_direct: BoolProperty(
- name="Sample All Direct Lights",
- description="Sample all lights (for direct samples), rather than randomly picking one",
- default=True,
- )
-
- sample_all_lights_indirect: BoolProperty(
- name="Sample All Indirect Lights",
- description="Sample all lights (for indirect samples), rather than randomly picking one",
- default=True,
- )
light_sampling_threshold: FloatProperty(
name="Light Sampling Threshold",
description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). "
@@ -403,19 +354,39 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
use_adaptive_sampling: BoolProperty(
name="Use Adaptive Sampling",
description="Automatically reduce the number of samples per pixel based on estimated noise level",
- default=False,
+ default=True,
)
-
adaptive_threshold: FloatProperty(
name="Adaptive Sampling Threshold",
description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples",
min=0.0, max=1.0,
- default=0.0,
+ soft_min=0.001,
+ default=0.01,
precision=4,
)
adaptive_min_samples: IntProperty(
name="Adaptive Min Samples",
- description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on number of AA samples",
+ description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold",
+ min=0, max=4096,
+ default=0,
+ )
+
+ use_preview_adaptive_sampling: BoolProperty(
+ name="Use Adaptive Sampling",
+ description="Automatically reduce the number of samples per pixel based on estimated noise level, for viewport renders",
+ default=True,
+ )
+ preview_adaptive_threshold: FloatProperty(
+ name="Adaptive Sampling Threshold",
+ description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples, for viewport renders",
+ min=0.0, max=1.0,
+ soft_min=0.001,
+ default=0.1,
+ precision=4,
+ )
+ preview_adaptive_min_samples: IntProperty(
+ name="Adaptive Min Samples",
+ description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold, for viewport renders",
min=0, max=4096,
default=0,
)
@@ -632,53 +603,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=10.0,
)
- debug_tile_size: IntProperty(
- name="Tile Size",
- description="",
- min=1, max=4096,
- default=1024,
- )
-
- preview_start_resolution: IntProperty(
- name="Start Resolution",
- description="Resolution to start rendering preview at, "
- "progressively increasing it to the full viewport size",
- min=8, max=16384,
- default=64,
- subtype='PIXEL'
- )
- preview_denoising_start_sample: IntProperty(
- name="Start Denoising",
- description="Sample to start denoising the preview at",
- min=0, max=(1 << 24),
- default=1,
- )
- preview_denoising_input_passes: EnumProperty(
- name="Viewport Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO',
- )
-
- debug_reset_timeout: FloatProperty(
- name="Reset timeout",
- description="",
- min=0.01, max=10.0,
- default=0.1,
- )
- debug_cancel_timeout: FloatProperty(
- name="Cancel timeout",
- description="",
- min=0.01, max=10.0,
- default=0.1,
- )
- debug_text_timeout: FloatProperty(
- name="Text timeout",
- description="",
- min=0.01, max=10.0,
- default=1.0,
- )
-
debug_bvh_type: EnumProperty(
name="Viewport BVH Type",
description="Choose between faster updates, or faster render",
@@ -701,38 +625,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=0,
min=0, max=16,
)
- tile_order: EnumProperty(
- name="Tile Order",
- description="Tile order for rendering",
- items=enum_tile_order,
- default='HILBERT_SPIRAL',
- options=set(), # Not animatable!
- )
- use_progressive_refine: BoolProperty(
- name="Progressive Refine",
- description="Instead of rendering each tile until it is finished, "
- "refine the whole image progressively "
- "(this renders somewhat slower, "
- "but time can be saved by manually stopping the render when the noise is low enough)",
- default=False,
- )
bake_type: EnumProperty(
name="Bake Type",
default='COMBINED',
description="Type of pass to bake",
items=(
- ('COMBINED', "Combined", ""),
- ('AO', "Ambient Occlusion", ""),
- ('SHADOW', "Shadow", ""),
- ('NORMAL', "Normal", ""),
- ('UV', "UV", ""),
- ('ROUGHNESS', "Roughness", ""),
- ('EMIT', "Emit", ""),
- ('ENVIRONMENT', "Environment", ""),
- ('DIFFUSE', "Diffuse", ""),
- ('GLOSSY', "Glossy", ""),
- ('TRANSMISSION', "Transmission", ""),
+ ('COMBINED', "Combined", "", 0),
+ ('AO', "Ambient Occlusion", "", 1),
+ ('SHADOW', "Shadow", "", 2),
+ ('POSITION', "Position", "", 11),
+ ('NORMAL', "Normal", "", 3),
+ ('UV', "UV", "", 4),
+ ('ROUGHNESS', "Roughness", "", 5),
+ ('EMIT', "Emit", "", 6),
+ ('ENVIRONMENT', "Environment", "", 7),
+ ('DIFFUSE', "Diffuse", "", 8),
+ ('GLOSSY', "Glossy", "", 9),
+ ('TRANSMISSION', "Transmission", "", 10),
),
)
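The bake types now carry explicit integer values so that inserting POSITION (value 11) does not shift the integers stored for existing entries in saved files. Selecting the new type from a script is simply, for example:

import bpy

# Illustration: bake the new Position pass added above.
bpy.context.scene.cycles.bake_type = 'POSITION'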
@@ -827,6 +737,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=0, max=1024,
)
+ use_auto_tile: BoolProperty(
+ name="Auto Tiles",
+ description="Automatically split image into tiles",
+ default=True,
+ )
+ tile_size: IntProperty(
+ name="Tile Size",
+ default=2048,
+ description="",
+ min=0, max=16384,
+ )
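A single tile edge length plus an automatic-split toggle replaces the old per-axis tile sizes. A short sketch using the two properties defined above (values illustrative):

import bpy

cycles = bpy.context.scene.cycles
cycles.use_auto_tile = True   # split large renders into tiles automatically
cycles.tile_size = 2048       # tile edge length, in pixels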
+
# Various fine-tuning debug flags
def _devices_update_callback(self, context):
@@ -844,45 +766,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
items=enum_bvh_layouts,
default='EMBREE',
)
- debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False)
debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False)
- debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)
-
- debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
- debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False)
-
- debug_opencl_kernel_type: EnumProperty(
- name="OpenCL Kernel Type",
- default='DEFAULT',
- items=(
- ('DEFAULT', "Default", ""),
- ('MEGA', "Mega", ""),
- ('SPLIT', "Split", ""),
- ),
- update=CyclesRenderSettings._devices_update_callback
- )
- debug_opencl_device_type: EnumProperty(
- name="OpenCL Device Type",
- default='ALL',
- items=(
- ('NONE', "None", ""),
- ('ALL', "All", ""),
- ('DEFAULT', "Default", ""),
- ('CPU', "CPU", ""),
- ('GPU', "GPU", ""),
- ('ACCELERATOR', "Accelerator", ""),
- ),
- update=CyclesRenderSettings._devices_update_callback
- )
-
- debug_use_opencl_debug: BoolProperty(name="Debug OpenCL", default=False)
-
- debug_opencl_mem_limit: IntProperty(
- name="Memory limit",
- default=0,
- description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)"
+ debug_use_optix_debug: BoolProperty(
+ name="OptiX Module Debug",
+ description="Load OptiX module in debug mode: lower logging verbosity level, enable validations, and lower optimization level",
+ default=False
)
@classmethod
@@ -1031,12 +921,6 @@ class CyclesLightSettings(bpy.types.PropertyGroup):
description="Light casts shadows",
default=True,
)
- samples: IntProperty(
- name="Samples",
- description="Number of light samples to render for each AA sample",
- min=1, max=10000,
- default=1,
- )
max_bounces: IntProperty(
name="Max Bounces",
description="Maximum number of bounces the light will contribute to the render",
@@ -1084,12 +968,6 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
min=4, max=8192,
default=1024,
)
- samples: IntProperty(
- name="Samples",
- description="Number of light samples to render for each AA sample",
- min=1, max=10000,
- default=1,
- )
max_bounces: IntProperty(
name="Max Bounces",
description="Maximum number of bounces the background light will contribute to the render",
@@ -1343,91 +1221,25 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
update=update_render_passes,
)
+ use_pass_shadow_catcher: BoolProperty(
+ name="Shadow Catcher",
+ description="Pass containing shadows and light which is to be multiplied into backdrop",
+ default=False,
+ update=update_render_passes,
+ )
+
use_denoising: BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
default=True,
update=update_render_passes,
)
- denoising_diffuse_direct: BoolProperty(
- name="Diffuse Direct",
- description="Denoise the direct diffuse lighting",
- default=True,
- )
- denoising_diffuse_indirect: BoolProperty(
- name="Diffuse Indirect",
- description="Denoise the indirect diffuse lighting",
- default=True,
- )
- denoising_glossy_direct: BoolProperty(
- name="Glossy Direct",
- description="Denoise the direct glossy lighting",
- default=True,
- )
- denoising_glossy_indirect: BoolProperty(
- name="Glossy Indirect",
- description="Denoise the indirect glossy lighting",
- default=True,
- )
- denoising_transmission_direct: BoolProperty(
- name="Transmission Direct",
- description="Denoise the direct transmission lighting",
- default=True,
- )
- denoising_transmission_indirect: BoolProperty(
- name="Transmission Indirect",
- description="Denoise the indirect transmission lighting",
- default=True,
- )
- denoising_strength: FloatProperty(
- name="Denoising Strength",
- description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
- min=0.0, max=1.0,
- default=0.5,
- )
- denoising_feature_strength: FloatProperty(
- name="Denoising Feature Strength",
- description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
- min=0.0, max=1.0,
- default=0.5,
- )
- denoising_radius: IntProperty(
- name="Denoising Radius",
- description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
- min=1, max=25,
- default=8,
- subtype="PIXEL",
- )
- denoising_relative_pca: BoolProperty(
- name="Relative Filter",
- description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
- default=False,
- )
denoising_store_passes: BoolProperty(
name="Store Denoising Passes",
description="Store the denoising feature passes and the noisy image. The passes adapt to the denoiser selected for rendering",
default=False,
update=update_render_passes,
)
- denoising_neighbor_frames: IntProperty(
- name="Neighbor Frames",
- description="Number of neighboring frames to use for denoising animations (more frames produce smoother results at the cost of performance)",
- min=0, max=7,
- default=0,
- )
-
- denoising_optix_input_passes: EnumProperty(
- name="Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO',
- )
- denoising_openimagedenoise_input_passes: EnumProperty(
- name="Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO_NORMAL',
- )
@classmethod
def register(cls):
@@ -1454,14 +1266,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
- has_cuda, has_optix, has_opencl = _cycles.get_device_types()
+ has_cuda, has_optix = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
if has_optix:
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
- if has_opencl:
- list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2))
return list
compute_device_type: EnumProperty(
@@ -1486,7 +1296,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
- if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}:
+ if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1520,22 +1330,23 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
- if compute_device_type in {'CUDA', 'OPTIX', 'OPENCL'}:
+ if compute_device_type != 'CPU':
devices.extend(cpu_devices)
return devices
- # For backwards compatibility, only returns CUDA and OpenCL but still
- # refreshes all devices.
- def get_devices(self, compute_device_type=''):
+ # Refresh device list. This does not happen automatically on Blender
+ # startup due to unstable OpenCL implementations that can cause crashes.
+ def refresh_devices(self):
import _cycles
# Ensure `self.devices` is not re-allocated when the second call to
# get_devices_for_type is made, freeing items from the first list.
for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
self.update_device_entries(_cycles.available_devices(device_type))
- cuda_devices = self.get_devices_for_type('CUDA')
- opencl_devices = self.get_devices_for_type('OPENCL')
- return cuda_devices, opencl_devices
+ # Deprecated: use refresh_devices instead.
+ def get_devices(self, compute_device_type=''):
+ self.refresh_devices()
+ return None
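A usage sketch for the preferences API above: refresh_devices() repopulates the device list, while get_devices() is kept only as a deprecated shim. The addon package name "cycles" is assumed here:

import bpy

prefs = bpy.context.preferences.addons["cycles"].preferences
prefs.refresh_devices()
for device in prefs.devices:
    print(device.type, device.name, device.use)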
def get_num_gpu_devices(self):
import _cycles
@@ -1601,6 +1412,10 @@ class CyclesView3DShadingSettings(bpy.types.PropertyGroup):
items=enum_view3d_shading_render_pass,
default='COMBINED',
)
+ show_active_pixels: BoolProperty(
+ name="Show Active Pixels",
+ description="When using adaptive sampling highlight pixels which are being sampled",
+ )
def register():
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 47f7b4c6d73..d02627b9936 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -34,6 +34,12 @@ class CYCLES_PT_sampling_presets(PresetPanel, Panel):
preset_add_operator = "render.cycles_sampling_preset_add"
COMPAT_ENGINES = {'CYCLES'}
+class CYCLES_PT_viewport_sampling_presets(PresetPanel, Panel):
+ bl_label = "Viewport Sampling Presets"
+ preset_subdir = "cycles/viewport_sampling"
+ preset_operator = "script.execute_preset"
+ preset_add_operator = "render.cycles_viewport_sampling_preset_add"
+ COMPAT_ENGINES = {'CYCLES'}
class CYCLES_PT_integrator_presets(PresetPanel, Panel):
bl_label = "Integrator Presets"
@@ -54,6 +60,15 @@ class CyclesButtonsPanel:
return context.engine in cls.COMPAT_ENGINES
+class CyclesDebugButtonsPanel(CyclesButtonsPanel):
+ @classmethod
+ def poll(cls, context):
+ prefs = bpy.context.preferences
+ return (CyclesButtonsPanel.poll(context)
+ and prefs.experimental.use_cycles_debug
+ and prefs.view.show_developer_ui)
+
+
# Adapt properties editor panel to display in node editor. We have to
# copy the class rather than inherit due to the way bpy registration works.
def node_panel(cls):
@@ -78,12 +93,6 @@ def use_cpu(context):
return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')
-def use_opencl(context):
- cscene = context.scene.cycles
-
- return (get_device_type(context) == 'OPENCL' and cscene.device == 'GPU')
-
-
def use_cuda(context):
cscene = context.scene.cycles
@@ -96,12 +105,6 @@ def use_optix(context):
return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU')
-def use_branched_path(context):
- cscene = context.scene.cycles
-
- return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context))
-
-
def use_sample_all_lights(context):
cscene = context.scene.cycles
@@ -115,57 +118,33 @@ def show_device_active(context):
return context.preferences.addons[__package__].preferences.has_active_device()
-def draw_samples_info(layout, context):
- cscene = context.scene.cycles
- integrator = cscene.progressive
+def get_effective_preview_denoiser(context):
+ scene = context.scene
+ cscene = scene.cycles
+
+ if cscene.preview_denoiser != "AUTO":
+ return cscene.preview_denoiser
+
+ if context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'):
+ return 'OPTIX'
+
+ return 'OIDN'
- # Calculate sample values
- if integrator == 'PATH':
- aa = cscene.samples
- if cscene.use_square_samples:
- aa = aa * aa
- else:
- aa = cscene.aa_samples
- d = cscene.diffuse_samples
- g = cscene.glossy_samples
- t = cscene.transmission_samples
- ao = cscene.ao_samples
- ml = cscene.mesh_light_samples
- sss = cscene.subsurface_samples
- vol = cscene.volume_samples
-
- if cscene.use_square_samples:
- aa = aa * aa
- d = d * d
- g = g * g
- t = t * t
- ao = ao * ao
- ml = ml * ml
- sss = sss * sss
- vol = vol * vol
-
- # Draw interface
- # Do not draw for progressive, when Square Samples are disabled
- if use_branched_path(context) or (cscene.use_square_samples and integrator == 'PATH'):
- col = layout.column(align=True)
- col.scale_y = 0.6
- col.label(text="Total Samples:")
- col.separator()
- if integrator == 'PATH':
- col.label(text="%s AA" % aa)
- else:
- col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" %
- (aa, d * aa, g * aa, t * aa))
- col.separator()
- col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" %
- (ao * aa, ml * aa, sss * aa, vol * aa))
class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
bl_label = "Sampling"
+ def draw(self, context):
+ pass
+
+
+class CYCLES_RENDER_PT_sampling_viewport(CyclesButtonsPanel, Panel):
+ bl_label = "Viewport"
+ bl_parent_id = "CYCLES_RENDER_PT_sampling"
+
def draw_header_preset(self, context):
- CYCLES_PT_sampling_presets.draw_panel_header(self.layout)
+ CYCLES_PT_viewport_sampling_presets.draw_panel_header(self.layout)
def draw(self, context):
layout = self.layout
@@ -176,29 +155,31 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
- if not use_optix(context):
- layout.prop(cscene, "progressive")
+ heading = layout.column(align=True, heading="Noise Threshold")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_preview_adaptive_sampling", text="")
+ sub = row.row()
+ sub.active = cscene.use_preview_adaptive_sampling
+ sub.prop(cscene, "preview_adaptive_threshold", text="")
- if not use_branched_path(context):
+ if cscene.use_preview_adaptive_sampling:
col = layout.column(align=True)
- col.prop(cscene, "samples", text="Render")
- col.prop(cscene, "preview_samples", text="Viewport")
+ col.prop(cscene, "preview_samples", text=" Max Samples")
+ col.prop(cscene, "preview_adaptive_min_samples", text="Min Samples")
else:
- col = layout.column(align=True)
- col.prop(cscene, "aa_samples", text="Render")
- col.prop(cscene, "preview_aa_samples", text="Viewport")
+ layout.prop(cscene, "preview_samples", text="Samples")
- if not use_branched_path(context):
- draw_samples_info(layout, context)
+class CYCLES_RENDER_PT_sampling_viewport_denoise(CyclesButtonsPanel, Panel):
+ bl_label = "Denoise"
+ bl_parent_id = 'CYCLES_RENDER_PT_sampling_viewport'
+ bl_options = {'DEFAULT_CLOSED'}
-class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
- bl_label = "Sub Samples"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
+ def draw_header(self, context):
+ scene = context.scene
+ cscene = scene.cycles
- @classmethod
- def poll(cls, context):
- return use_branched_path(context)
+ self.layout.prop(context.scene.cycles, "use_preview_denoising", text="")
def draw(self, context):
layout = self.layout
@@ -208,53 +189,61 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- col = layout.column(align=True)
- col.prop(cscene, "diffuse_samples", text="Diffuse")
- col.prop(cscene, "glossy_samples", text="Glossy")
- col.prop(cscene, "transmission_samples", text="Transmission")
- col.prop(cscene, "ao_samples", text="AO")
+ col = layout.column()
+ col.active = cscene.use_preview_denoising
+ col.prop(cscene, "preview_denoiser", text="Denoiser")
+ col.prop(cscene, "preview_denoising_input_passes", text="Passes")
- sub = col.row(align=True)
- sub.active = use_sample_all_lights(context)
- sub.prop(cscene, "mesh_light_samples", text="Mesh Light")
- col.prop(cscene, "subsurface_samples", text="Subsurface")
- col.prop(cscene, "volume_samples", text="Volume")
+ effective_preview_denoiser = get_effective_preview_denoiser(context)
+ if effective_preview_denoiser == 'OPENIMAGEDENOISE':
+ col.prop(cscene, "preview_denoising_prefilter", text="Prefilter")
- draw_samples_info(layout, context)
+ col.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
-class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel):
- bl_label = "Adaptive Sampling"
+class CYCLES_RENDER_PT_sampling_render(CyclesButtonsPanel, Panel):
+ bl_label = "Render"
bl_parent_id = "CYCLES_RENDER_PT_sampling"
- bl_options = {'DEFAULT_CLOSED'}
- def draw_header(self, context):
- layout = self.layout
- scene = context.scene
- cscene = scene.cycles
-
- layout.prop(cscene, "use_adaptive_sampling", text="")
+ def draw_header_preset(self, context):
+ CYCLES_PT_sampling_presets.draw_panel_header(self.layout)
def draw(self, context):
layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
scene = context.scene
cscene = scene.cycles
- layout.active = cscene.use_adaptive_sampling
+ layout.use_property_split = True
+ layout.use_property_decorate = False
+
+ heading = layout.column(align=True, heading="Noise Threshold")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_adaptive_sampling", text="")
+ sub = row.row()
+ sub.active = cscene.use_adaptive_sampling
+ sub.prop(cscene, "adaptive_threshold", text="")
col = layout.column(align=True)
- col.prop(cscene, "adaptive_threshold", text="Noise Threshold")
- col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+ if cscene.use_adaptive_sampling:
+ col.prop(cscene, "samples", text=" Max Samples")
+ col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+ else:
+ col.prop(cscene, "samples", text="Samples")
+ col.prop(cscene, "time_limit")
-class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
- bl_label = "Denoising"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
+class CYCLES_RENDER_PT_sampling_render_denoise(CyclesButtonsPanel, Panel):
+ bl_label = "Denoise"
+ bl_parent_id = 'CYCLES_RENDER_PT_sampling_render'
bl_options = {'DEFAULT_CLOSED'}
+ def draw_header(self, context):
+ scene = context.scene
+ cscene = scene.cycles
+
+ self.layout.prop(context.scene.cycles, "use_denoising", text="")
+
def draw(self, context):
layout = self.layout
layout.use_property_split = True
@@ -263,33 +252,12 @@ class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- heading = layout.column(align=True, heading="Render")
- row = heading.row(align=True)
- row.prop(cscene, "use_denoising", text="")
- sub = row.row()
-
- sub.active = cscene.use_denoising
- for view_layer in scene.view_layers:
- if view_layer.cycles.denoising_store_passes:
- sub.active = True
-
- sub.prop(cscene, "denoiser", text="")
-
- layout.separator()
-
- heading = layout.column(align=False, heading="Viewport")
- row = heading.row(align=True)
- row.prop(cscene, "use_preview_denoising", text="")
- sub = row.row()
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoiser", text="")
-
- sub = heading.row(align=True)
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
- sub = heading.row(align=True)
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoising_input_passes", text="Input Passes")
+ col = layout.column()
+ col.active = cscene.use_denoising
+ col.prop(cscene, "denoiser", text="Denoiser")
+ col.prop(cscene, "denoising_input_passes", text="Passes")
+ if cscene.denoiser == 'OPENIMAGEDENOISE':
+ col.prop(cscene, "denoising_prefilter", text="Prefilter")
class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
@@ -313,8 +281,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
col.active = not(cscene.use_adaptive_sampling)
col.prop(cscene, "sampling_pattern", text="Pattern")
- layout.prop(cscene, "use_square_samples")
-
layout.separator()
col = layout.column(align=True)
@@ -322,11 +288,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
col.prop(cscene, "min_transparent_bounces")
col.prop(cscene, "light_sampling_threshold", text="Light Threshold")
- if cscene.progressive != 'PATH' and use_branched_path(context):
- col = layout.column(align=True)
- col.prop(cscene, "sample_all_lights_direct")
- col.prop(cscene, "sample_all_lights_indirect")
-
for view_layer in scene.view_layers:
if view_layer.samples > 0:
layout.separator()
@@ -334,62 +295,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
break
-class CYCLES_RENDER_PT_sampling_total(CyclesButtonsPanel, Panel):
- bl_label = "Total Samples"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
-
- @classmethod
- def poll(cls, context):
- scene = context.scene
- cscene = scene.cycles
-
- if cscene.use_square_samples:
- return True
-
- return cscene.progressive != 'PATH' and use_branched_path(context)
-
- def draw(self, context):
- layout = self.layout
- cscene = context.scene.cycles
- integrator = cscene.progressive
-
- # Calculate sample values
- if integrator == 'PATH':
- aa = cscene.samples
- if cscene.use_square_samples:
- aa = aa * aa
- else:
- aa = cscene.aa_samples
- d = cscene.diffuse_samples
- g = cscene.glossy_samples
- t = cscene.transmission_samples
- ao = cscene.ao_samples
- ml = cscene.mesh_light_samples
- sss = cscene.subsurface_samples
- vol = cscene.volume_samples
-
- if cscene.use_square_samples:
- aa = aa * aa
- d = d * d
- g = g * g
- t = t * t
- ao = ao * ao
- ml = ml * ml
- sss = sss * sss
- vol = vol * vol
-
- col = layout.column(align=True)
- col.scale_y = 0.6
- if integrator == 'PATH':
- col.label(text="%s AA" % aa)
- else:
- col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" %
- (aa, d * aa, g * aa, t * aa))
- col.separator()
- col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" %
- (ao * aa, ml * aa, sss * aa, vol * aa))
-
-
class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel):
bl_label = "Subdivision"
bl_options = {'DEFAULT_CLOSED'}
@@ -548,6 +453,8 @@ class CYCLES_RENDER_PT_light_paths_fast_gi(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
+ layout.active = cscene.use_fast_gi
+
col = layout.column(align=True)
col.prop(cscene, "ao_bounces", text="Viewport Bounces")
col.prop(cscene, "ao_bounces_render", text="Render Bounces")
@@ -716,19 +623,13 @@ class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
layout.use_property_decorate = False
scene = context.scene
- rd = scene.render
cscene = scene.cycles
col = layout.column()
-
- sub = col.column(align=True)
- sub.prop(rd, "tile_x", text="Tiles X")
- sub.prop(rd, "tile_y", text="Y")
- col.prop(cscene, "tile_order", text="Order")
-
+ col.prop(cscene, "use_auto_tile")
sub = col.column()
- sub.active = not rd.use_save_buffers and not cscene.use_adaptive_sampling
- sub.prop(cscene, "use_progressive_refine")
+ sub.active = cscene.use_auto_tile
+ sub.prop(cscene, "tile_size")
class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Panel):
@@ -778,7 +679,6 @@ class CYCLES_RENDER_PT_performance_final_render(CyclesButtonsPanel, Panel):
col = layout.column()
- col.prop(rd, "use_save_buffers")
col.prop(rd, "use_persistent_data", text="Persistent Data")
@@ -797,7 +697,6 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel):
col = layout.column()
col.prop(rd, "preview_pixel_size", text="Pixel Size")
- col.prop(cscene, "preview_start_resolution", text="Start Pixels")
class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
@@ -818,7 +717,6 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
col = layout.column(heading="Include")
col.prop(view_layer, "use_sky", text="Environment")
- col.prop(view_layer, "use_ao", text="Ambient Occlusion")
col.prop(view_layer, "use_solid", text="Surfaces")
col.prop(view_layer, "use_strand", text="Hair")
col.prop(view_layer, "use_volumes", text="Volumes")
@@ -827,6 +725,9 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
sub = col.row()
sub.prop(view_layer, "use_motion_blur", text="Motion Blur")
sub.active = rd.use_motion_blur
+ sub = col.row()
+ sub.prop(view_layer.cycles, 'use_denoising', text='Denoising')
+ sub.active = scene.cycles.use_denoising
class CYCLES_RENDER_PT_override(CyclesButtonsPanel, Panel):
@@ -872,6 +773,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
col.prop(view_layer, "use_pass_combined")
col.prop(view_layer, "use_pass_z")
col.prop(view_layer, "use_pass_mist")
+ col.prop(view_layer, "use_pass_position")
col.prop(view_layer, "use_pass_normal")
sub = col.column()
sub.active = not rd.use_motion_blur
@@ -928,6 +830,7 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel):
col.prop(view_layer, "use_pass_environment")
col.prop(view_layer, "use_pass_shadow")
col.prop(view_layer, "use_pass_ambient_occlusion", text="Ambient Occlusion")
+ col.prop(cycles_view_layer, "use_pass_shadow_catcher")
class CYCLES_RENDER_PT_passes_crypto(CyclesButtonsPanel, ViewLayerCryptomattePanel, Panel):
@@ -942,70 +845,6 @@ class CYCLES_RENDER_PT_passes_aov(CyclesButtonsPanel, ViewLayerAOVPanel):
bl_parent_id = "CYCLES_RENDER_PT_passes"
-class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
- bl_label = "Denoising"
- bl_context = "view_layer"
- bl_options = {'DEFAULT_CLOSED'}
-
- @classmethod
- def poll(cls, context):
- cscene = context.scene.cycles
- return CyclesButtonsPanel.poll(context) and cscene.use_denoising
-
- def draw_header(self, context):
- scene = context.scene
- view_layer = context.view_layer
- cycles_view_layer = view_layer.cycles
-
- layout = self.layout
- layout.prop(cycles_view_layer, "use_denoising", text="")
-
- def draw(self, context):
- layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
-
- scene = context.scene
- view_layer = context.view_layer
- cycles_view_layer = view_layer.cycles
- denoiser = scene.cycles.denoiser
-
- layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising
-
- col = layout.column()
-
- if denoiser == 'OPTIX':
- col.prop(cycles_view_layer, "denoising_optix_input_passes")
- return
- elif denoiser == 'OPENIMAGEDENOISE':
- col.prop(cycles_view_layer, "denoising_openimagedenoise_input_passes")
- return
-
- col.prop(cycles_view_layer, "denoising_radius", text="Radius")
-
- col = layout.column()
- col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
- col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
- col.prop(cycles_view_layer, "denoising_relative_pca")
-
- layout.separator()
-
- col = layout.column()
- col.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
-
- row = col.row(heading="Diffuse", align=True)
- row.prop(cycles_view_layer, "denoising_diffuse_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_diffuse_indirect", text="Indirect", toggle=True)
-
- row = col.row(heading="Glossy", align=True)
- row.prop(cycles_view_layer, "denoising_glossy_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_glossy_indirect", text="Indirect", toggle=True)
-
- row = col.row(heading="Transmission", align=True)
- row.prop(cycles_view_layer, "denoising_transmission_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_transmission_indirect", text="Indirect", toggle=True)
-
-
class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
bl_options = {'DEFAULT_CLOSED'}
@@ -1417,10 +1256,6 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel):
if not (light.type == 'AREA' and clamp.is_portal):
sub = col.column()
- if use_branched_path(context):
- subsub = sub.row(align=True)
- subsub.active = use_sample_all_lights(context)
- subsub.prop(clamp, "samples")
sub.prop(clamp, "max_bounces")
sub = col.column(align=True)
@@ -1526,34 +1361,6 @@ class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel):
panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume')
-class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
- bl_label = "Ambient Occlusion"
- bl_context = "world"
- bl_options = {'DEFAULT_CLOSED'}
-
- @classmethod
- def poll(cls, context):
- return context.world and CyclesButtonsPanel.poll(context)
-
- def draw_header(self, context):
- light = context.world.light_settings
- self.layout.prop(light, "use_ambient_occlusion", text="")
-
- def draw(self, context):
- layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
-
- light = context.world.light_settings
- scene = context.scene
-
- col = layout.column()
- sub = col.column()
- sub.active = light.use_ambient_occlusion or scene.render.use_simplify
- sub.prop(light, "ao_factor", text="Factor")
- col.prop(light, "distance", text="Distance")
-
-
class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel):
bl_label = "Mist Pass"
bl_context = "world"
@@ -1650,10 +1457,6 @@ class CYCLES_WORLD_PT_settings_surface(CyclesButtonsPanel, Panel):
subsub = sub.row(align=True)
subsub.active = cworld.sampling_method == 'MANUAL'
subsub.prop(cworld, "sample_map_resolution")
- if use_branched_path(context):
- subsub = sub.column(align=True)
- subsub.active = use_sample_all_lights(context)
- subsub.prop(cworld, "samples")
sub.prop(cworld, "max_bounces")
@@ -1677,8 +1480,7 @@ class CYCLES_WORLD_PT_settings_volume(CyclesButtonsPanel, Panel):
col = layout.column()
sub = col.column()
- sub.active = use_cpu(context)
- sub.prop(cworld, "volume_sampling", text="Sampling")
+ col.prop(cworld, "volume_sampling", text="Sampling")
col.prop(cworld, "volume_interpolation", text="Interpolation")
col.prop(cworld, "homogeneous_volume", text="Homogeneous")
sub = col.column()
@@ -1817,8 +1619,7 @@ class CYCLES_MATERIAL_PT_settings_volume(CyclesButtonsPanel, Panel):
col = layout.column()
sub = col.column()
- sub.active = use_cpu(context)
- sub.prop(cmat, "volume_sampling", text="Sampling")
+ col.prop(cmat, "volume_sampling", text="Sampling")
col.prop(cmat, "volume_interpolation", text="Interpolation")
col.prop(cmat, "homogeneous_volume", text="Homogeneous")
sub = col.column()
@@ -1845,9 +1646,6 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel):
cbk = scene.render.bake
rd = scene.render
- if use_optix(context):
- layout.label(text="Baking is performed using CUDA instead of OptiX", icon='INFO')
-
if rd.use_bake_multires:
layout.operator("object.bake_image", icon='RENDER_STILL')
layout.prop(rd, "use_bake_multires")
@@ -1905,7 +1703,6 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel):
col.prop(cbk, "use_pass_diffuse")
col.prop(cbk, "use_pass_glossy")
col.prop(cbk, "use_pass_transmission")
- col.prop(cbk, "use_pass_ambient_occlusion")
col.prop(cbk, "use_pass_emit")
elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}:
@@ -1989,19 +1786,12 @@ class CYCLES_RENDER_PT_bake_output(CyclesButtonsPanel, Panel):
layout.prop(cbk, "use_clear", text="Clear Image")
-class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
bl_label = "Debug"
bl_context = "render"
bl_options = {'DEFAULT_CLOSED'}
COMPAT_ENGINES = {'CYCLES'}
- @classmethod
- def poll(cls, context):
- prefs = bpy.context.preferences
- return (CyclesButtonsPanel.poll(context)
- and prefs.experimental.use_cycles_debug
- and prefs.view.show_developer_ui)
-
def draw(self, context):
layout = self.layout
@@ -2018,29 +1808,18 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_bvh_layout")
- col.prop(cscene, "debug_use_cpu_split_kernel")
col.separator()
col = layout.column()
col.label(text="CUDA Flags:")
col.prop(cscene, "debug_use_cuda_adaptive_compile")
- col.prop(cscene, "debug_use_cuda_split_kernel")
col.separator()
col = layout.column()
col.label(text="OptiX Flags:")
- col.prop(cscene, "debug_optix_cuda_streams")
- col.prop(cscene, "debug_optix_curves_api")
-
- col.separator()
-
- col = layout.column()
- col.label(text="OpenCL Flags:")
- col.prop(cscene, "debug_opencl_device_type", text="Device")
- col.prop(cscene, "debug_use_opencl_debug", text="Debug")
- col.prop(cscene, "debug_opencl_mem_limit")
+ col.prop(cscene, "debug_use_optix_debug")
col.separator()
@@ -2141,20 +1920,22 @@ class CYCLES_RENDER_PT_simplify_culling(CyclesButtonsPanel, Panel):
sub.prop(cscene, "distance_cull_margin", text="")
-class CYCLES_VIEW3D_PT_shading_render_pass(Panel):
+class CyclesShadingButtonsPanel(CyclesButtonsPanel):
bl_space_type = 'VIEW_3D'
bl_region_type = 'HEADER'
- bl_label = "Render Pass"
bl_parent_id = 'VIEW3D_PT_shading'
- COMPAT_ENGINES = {'CYCLES'}
@classmethod
def poll(cls, context):
return (
- context.engine in cls.COMPAT_ENGINES and
+ CyclesButtonsPanel.poll(context) and
context.space_data.shading.type == 'RENDERED'
)
+
+class CYCLES_VIEW3D_PT_shading_render_pass(CyclesShadingButtonsPanel, Panel):
+ bl_label = "Render Pass"
+
def draw(self, context):
shading = context.space_data.shading
@@ -2162,6 +1943,26 @@ class CYCLES_VIEW3D_PT_shading_render_pass(Panel):
layout.prop(shading.cycles, "render_pass", text="")
+class CYCLES_VIEW3D_PT_shading_debug(CyclesDebugButtonsPanel,
+ CyclesShadingButtonsPanel,
+ Panel):
+ bl_label = "Debug"
+
+ @classmethod
+ def poll(cls, context):
+ return (
+ CyclesDebugButtonsPanel.poll(context) and
+ CyclesShadingButtonsPanel.poll(context)
+ )
+
+ def draw(self, context):
+ shading = context.space_data.shading
+
+ layout = self.layout
+ layout.active = context.scene.cycles.use_preview_adaptive_sampling
+ layout.prop(shading.cycles, "show_active_pixels")
+
+
class CYCLES_VIEW3D_PT_shading_lighting(Panel):
bl_space_type = 'VIEW_3D'
bl_region_type = 'HEADER'
@@ -2275,11 +2076,13 @@ def get_panels():
classes = (
CYCLES_PT_sampling_presets,
+ CYCLES_PT_viewport_sampling_presets,
CYCLES_PT_integrator_presets,
CYCLES_RENDER_PT_sampling,
- CYCLES_RENDER_PT_sampling_sub_samples,
- CYCLES_RENDER_PT_sampling_adaptive,
- CYCLES_RENDER_PT_sampling_denoising,
+ CYCLES_RENDER_PT_sampling_viewport,
+ CYCLES_RENDER_PT_sampling_viewport_denoise,
+ CYCLES_RENDER_PT_sampling_render,
+ CYCLES_RENDER_PT_sampling_render_denoise,
CYCLES_RENDER_PT_sampling_advanced,
CYCLES_RENDER_PT_light_paths,
CYCLES_RENDER_PT_light_paths_max_bounces,
@@ -2296,6 +2099,7 @@ classes = (
CYCLES_VIEW3D_PT_simplify_greasepencil,
CYCLES_VIEW3D_PT_shading_lighting,
CYCLES_VIEW3D_PT_shading_render_pass,
+ CYCLES_VIEW3D_PT_shading_debug,
CYCLES_RENDER_PT_motion_blur,
CYCLES_RENDER_PT_motion_blur_curve,
CYCLES_RENDER_PT_film,
@@ -2314,7 +2118,6 @@ classes = (
CYCLES_RENDER_PT_passes_aov,
CYCLES_RENDER_PT_filter,
CYCLES_RENDER_PT_override,
- CYCLES_RENDER_PT_denoising,
CYCLES_PT_post_processing,
CYCLES_CAMERA_PT_dof,
CYCLES_CAMERA_PT_dof_aperture,
@@ -2333,7 +2136,6 @@ classes = (
CYCLES_WORLD_PT_preview,
CYCLES_WORLD_PT_surface,
CYCLES_WORLD_PT_volume,
- CYCLES_WORLD_PT_ambient_occlusion,
CYCLES_WORLD_PT_mist,
CYCLES_WORLD_PT_ray_visibility,
CYCLES_WORLD_PT_settings,
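The sampling UI above is assembled from nested panels: CYCLES_RENDER_PT_sampling draws nothing itself and only acts as a parent, with the Viewport and Render panels (and their Denoise children) attaching through bl_parent_id. A stripped-down sketch of that pattern; the EXAMPLE_* class names are made up for illustration:

import bpy
from bpy.types import Panel


class EXAMPLE_RENDER_PT_parent(Panel):
    """Empty parent panel; the children supply the actual content."""
    bl_label = "Example Sampling"
    bl_space_type = 'PROPERTIES'
    bl_region_type = 'WINDOW'
    bl_context = "render"

    def draw(self, context):
        pass


class EXAMPLE_RENDER_PT_child(Panel):
    bl_label = "Viewport"
    bl_parent_id = "EXAMPLE_RENDER_PT_parent"
    bl_space_type = 'PROPERTIES'
    bl_region_type = 'WINDOW'
    bl_context = "render"

    def draw(self, context):
        self.layout.prop(context.scene.cycles, "preview_samples")


bpy.utils.register_class(EXAMPLE_RENDER_PT_parent)
bpy.utils.register_class(EXAMPLE_RENDER_PT_child)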
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 827f84b9873..57da7d7995c 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -109,7 +109,7 @@ def do_versions(self):
library_versions.setdefault(library.version, []).append(library)
# Do versioning per library, since they might have different versions.
- max_need_versioning = (2, 93, 7)
+ max_need_versioning = (3, 0, 25)
for version, libraries in library_versions.items():
if version > max_need_versioning:
continue
@@ -166,10 +166,6 @@ def do_versions(self):
if not cscene.is_property_set("filter_type"):
cscene.pixel_filter_type = 'GAUSSIAN'
- # Tile Order
- if not cscene.is_property_set("tile_order"):
- cscene.tile_order = 'CENTER'
-
if version <= (2, 76, 10):
cscene = scene.cycles
if cscene.is_property_set("filter_type"):
@@ -186,10 +182,6 @@ def do_versions(self):
if version <= (2, 79, 0):
cscene = scene.cycles
# Default changes
- if not cscene.is_property_set("aa_samples"):
- cscene.aa_samples = 4
- if not cscene.is_property_set("preview_aa_samples"):
- cscene.preview_aa_samples = 4
if not cscene.is_property_set("blur_glossy"):
cscene.blur_glossy = 0.0
if not cscene.is_property_set("sample_clamp_indirect"):
@@ -203,7 +195,6 @@ def do_versions(self):
view_layer.use_pass_cryptomatte_material = cview_layer.get("use_pass_crypto_material", False)
view_layer.use_pass_cryptomatte_asset = cview_layer.get("use_pass_crypto_asset", False)
view_layer.pass_cryptomatte_depth = cview_layer.get("pass_crypto_depth", 6)
- view_layer.use_pass_cryptomatte_accurate = cview_layer.get("pass_crypto_accurate", True)
if version <= (2, 93, 7):
if scene.render.engine == 'CYCLES':
@@ -229,6 +220,35 @@ def do_versions(self):
cscene.ao_bounces = 1
cscene.ao_bounces_render = 1
+ if version <= (3, 0, 25):
+ cscene = scene.cycles
+
+ # Default changes.
+ if not cscene.is_property_set("samples"):
+ cscene.samples = 128
+ if not cscene.is_property_set("preview_samples"):
+ cscene.preview_samples = 32
+ if not cscene.is_property_set("use_adaptive_sampling"):
+ cscene.use_adaptive_sampling = False
+ cscene.use_preview_adaptive_sampling = False
+ if not cscene.is_property_set("use_denoising"):
+ cscene.use_denoising = False
+ if not cscene.is_property_set("use_preview_denoising"):
+ cscene.use_preview_denoising = False
+ if not cscene.is_property_set("sampling_pattern"):
+ cscene.sampling_pattern = 'PROGRESSIVE_MUTI_JITTER'
+
+ # Removal of square samples.
+ cscene = scene.cycles
+ use_square_samples = cscene.get("use_square_samples", False)
+
+ if use_square_samples:
+ cscene.samples *= cscene.samples
+ cscene.preview_samples *= cscene.preview_samples
+ for layer in scene.view_layers:
+ layer.samples *= layer.samples
+ cscene["use_square_samples"] = False
+
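A worked example of the conversion above: a file saved with use_square_samples enabled and samples = 16 effectively rendered 16 x 16 paths per pixel, so versioning stores the squared value and clears the old flag.

# Hypothetical numbers following the versioning logic above.
old_samples = 16                       # value stored in the old .blend file
migrated = old_samples * old_samples   # 256 effective samples after migration
assert migrated == 256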
# Lamps
for light in bpy.data.lights:
if light.library not in libraries:
@@ -249,10 +269,6 @@ def do_versions(self):
if version <= (2, 76, 9):
cworld = world.cycles
- # World MIS Samples
- if not cworld.is_property_set("samples"):
- cworld.samples = 4
-
# World MIS Resolution
if not cworld.is_property_set("sample_map_resolution"):
cworld.sample_map_resolution = 256
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index 6954c5c2f26..4e8df5a99a6 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -894,12 +894,8 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d,
}
}
-BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d,
- BL::RegionView3D &b_rv3d,
- Camera *cam,
- int width,
- int height,
- const bool use_denoiser)
+BufferParams BlenderSync::get_buffer_params(
+ BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height)
{
BufferParams params;
bool use_border = false;
@@ -931,11 +927,6 @@ BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d,
params.height = height;
}
- PassType display_pass = update_viewport_display_passes(b_v3d, params.passes);
-
- /* Can only denoise the combined image pass */
- params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser;
-
return params;
}
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index d51b31de638..ce1770f18a3 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -25,7 +25,6 @@ CCL_NAMESPACE_BEGIN
enum ComputeDevice {
COMPUTE_DEVICE_CPU = 0,
COMPUTE_DEVICE_CUDA = 1,
- COMPUTE_DEVICE_OPENCL = 2,
COMPUTE_DEVICE_OPTIX = 3,
COMPUTE_DEVICE_NUM
@@ -68,13 +67,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
device = Device::get_multi_device(devices, threads, background);
}
}
- else if (get_enum(cscene, "device") == 2) {
- /* Find network device. */
- vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_NETWORK);
- if (!devices.empty()) {
- device = devices.front();
- }
- }
else if (get_enum(cscene, "device") == 1) {
/* Test if we are using GPU devices. */
ComputeDevice compute_device = (ComputeDevice)get_enum(
@@ -89,9 +81,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
else if (compute_device == COMPUTE_DEVICE_OPTIX) {
mask |= DEVICE_MASK_OPTIX;
}
- else if (compute_device == COMPUTE_DEVICE_OPENCL) {
- mask |= DEVICE_MASK_OPENCL;
- }
vector<DeviceInfo> devices = Device::available_devices(mask);
/* Match device preferences and available devices. */
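With the NETWORK and OPENCL entries removed, the mask resolved here can only select CPU, CUDA or OptiX devices. The equivalent configuration from the Python side is roughly (addon package name "cycles" assumed):

import bpy

prefs = bpy.context.preferences.addons["cycles"].preferences
prefs.compute_device_type = 'OPTIX'   # or 'CUDA'; 'OPENCL' is no longer available
bpy.context.scene.cycles.device = 'GPU'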
diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_gpu_display.cpp
new file mode 100644
index 00000000000..a79232af71f
--- /dev/null
+++ b/intern/cycles/blender/blender_gpu_display.cpp
@@ -0,0 +1,761 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "blender/blender_gpu_display.h"
+
+#include "device/device.h"
+#include "util/util_logging.h"
+#include "util/util_opengl.h"
+
+extern "C" {
+struct RenderEngine;
+
+bool RE_engine_has_render_context(struct RenderEngine *engine);
+void RE_engine_render_context_enable(struct RenderEngine *engine);
+void RE_engine_render_context_disable(struct RenderEngine *engine);
+
+bool DRW_opengl_context_release();
+void DRW_opengl_context_activate(bool drw_state);
+
+void *WM_opengl_context_create();
+void WM_opengl_context_activate(void *gl_context);
+void WM_opengl_context_dispose(void *gl_context);
+void WM_opengl_context_release(void *context);
+}
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * BlenderDisplayShader.
+ */
+
+unique_ptr<BlenderDisplayShader> BlenderDisplayShader::create(BL::RenderEngine &b_engine,
+ BL::Scene &b_scene)
+{
+ if (b_engine.support_display_space_shader(b_scene)) {
+ return make_unique<BlenderDisplaySpaceShader>(b_engine, b_scene);
+ }
+
+ return make_unique<BlenderFallbackDisplayShader>();
+}
+
+int BlenderDisplayShader::get_position_attrib_location()
+{
+ if (position_attribute_location_ == -1) {
+ const uint shader_program = get_shader_program();
+ position_attribute_location_ = glGetAttribLocation(shader_program, position_attribute_name);
+ }
+ return position_attribute_location_;
+}
+
+int BlenderDisplayShader::get_tex_coord_attrib_location()
+{
+ if (tex_coord_attribute_location_ == -1) {
+ const uint shader_program = get_shader_program();
+ tex_coord_attribute_location_ = glGetAttribLocation(shader_program, tex_coord_attribute_name);
+ }
+ return tex_coord_attribute_location_;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderFallbackDisplayShader.
+ */
+
+/* TODO move shaders to standalone .glsl file. */
+static const char *FALLBACK_VERTEX_SHADER =
+ "#version 330\n"
+ "uniform vec2 fullscreen;\n"
+ "in vec2 texCoord;\n"
+ "in vec2 pos;\n"
+ "out vec2 texCoord_interp;\n"
+ "\n"
+ "vec2 normalize_coordinates()\n"
+ "{\n"
+ " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
+ "}\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
+ " texCoord_interp = texCoord;\n"
+ "}\n\0";
+
+static const char *FALLBACK_FRAGMENT_SHADER =
+ "#version 330\n"
+ "uniform sampler2D image_texture;\n"
+ "in vec2 texCoord_interp;\n"
+ "out vec4 fragColor;\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " fragColor = texture(image_texture, texCoord_interp);\n"
+ "}\n\0";
+
+static void shader_print_errors(const char *task, const char *log, const char *code)
+{
+ LOG(ERROR) << "Shader: " << task << " error:";
+ LOG(ERROR) << "===== shader string ====";
+
+ stringstream stream(code);
+ string partial;
+
+ int line = 1;
+ while (getline(stream, partial, '\n')) {
+ if (line < 10) {
+ LOG(ERROR) << " " << line << " " << partial;
+ }
+ else {
+ LOG(ERROR) << line << " " << partial;
+ }
+ line++;
+ }
+ LOG(ERROR) << log;
+}
+
+static int compile_fallback_shader(void)
+{
+ const struct Shader {
+ const char *source;
+ const GLenum type;
+ } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
+ {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
+
+ const GLuint program = glCreateProgram();
+
+ for (int i = 0; i < 2; i++) {
+ const GLuint shader = glCreateShader(shaders[i].type);
+
+ string source_str = shaders[i].source;
+ const char *c_str = source_str.c_str();
+
+ glShaderSource(shader, 1, &c_str, NULL);
+ glCompileShader(shader);
+
+ GLint compile_status;
+ glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status);
+
+ if (!compile_status) {
+ GLchar log[5000];
+ GLsizei length = 0;
+ glGetShaderInfoLog(shader, sizeof(log), &length, log);
+ shader_print_errors("compile", log, c_str);
+ return 0;
+ }
+
+ glAttachShader(program, shader);
+ }
+
+ /* Link output. */
+ glBindFragDataLocation(program, 0, "fragColor");
+
+ /* Link and error check. */
+ glLinkProgram(program);
+
+ /* TODO(sergey): Find a way to nicely de-duplicate the error checking. */
+ GLint link_status;
+ glGetProgramiv(program, GL_LINK_STATUS, &link_status);
+ if (!link_status) {
+ GLchar log[5000];
+ GLsizei length = 0;
+ /* TODO(sergey): Is it really program passed to glGetShaderInfoLog? */
+ glGetShaderInfoLog(program, sizeof(log), &length, log);
+ shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
+ shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
+ return 0;
+ }
+
+ return program;
+}
+
+void BlenderFallbackDisplayShader::bind(int width, int height)
+{
+ create_shader_if_needed();
+
+ if (!shader_program_) {
+ return;
+ }
+
+ glUseProgram(shader_program_);
+ glUniform1i(image_texture_location_, 0);
+ glUniform2f(fullscreen_location_, width, height);
+}
+
+void BlenderFallbackDisplayShader::unbind()
+{
+}
+
+uint BlenderFallbackDisplayShader::get_shader_program()
+{
+ return shader_program_;
+}
+
+void BlenderFallbackDisplayShader::create_shader_if_needed()
+{
+ if (shader_program_ || shader_compile_attempted_) {
+ return;
+ }
+
+ shader_compile_attempted_ = true;
+
+ shader_program_ = compile_fallback_shader();
+ if (!shader_program_) {
+ return;
+ }
+
+ glUseProgram(shader_program_);
+
+ image_texture_location_ = glGetUniformLocation(shader_program_, "image_texture");
+ if (image_texture_location_ < 0) {
+ LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
+ destroy_shader();
+ return;
+ }
+
+ fullscreen_location_ = glGetUniformLocation(shader_program_, "fullscreen");
+ if (fullscreen_location_ < 0) {
+ LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
+ destroy_shader();
+ return;
+ }
+}
+
+void BlenderFallbackDisplayShader::destroy_shader()
+{
+ glDeleteProgram(shader_program_);
+ shader_program_ = 0;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderDisplaySpaceShader.
+ */
+
+BlenderDisplaySpaceShader::BlenderDisplaySpaceShader(BL::RenderEngine &b_engine,
+ BL::Scene &b_scene)
+ : b_engine_(b_engine), b_scene_(b_scene)
+{
+ DCHECK(b_engine_.support_display_space_shader(b_scene_));
+}
+
+void BlenderDisplaySpaceShader::bind(int /*width*/, int /*height*/)
+{
+ b_engine_.bind_display_space_shader(b_scene_);
+}
+
+void BlenderDisplaySpaceShader::unbind()
+{
+ b_engine_.unbind_display_space_shader();
+}
+
+uint BlenderDisplaySpaceShader::get_shader_program()
+{
+ if (!shader_program_) {
+ glGetIntegerv(GL_CURRENT_PROGRAM, reinterpret_cast<int *>(&shader_program_));
+ }
+
+ if (!shader_program_) {
+ LOG(ERROR) << "Error retrieving shader program for display space shader.";
+ }
+
+ return shader_program_;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderGPUDisplay.
+ */
+
+BlenderGPUDisplay::BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene)
+ : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene))
+{
+ /* Create context while on the main thread. */
+ gl_context_create();
+}
+
+BlenderGPUDisplay::~BlenderGPUDisplay()
+{
+ gl_resources_destroy();
+}
+
+/* --------------------------------------------------------------------
+ * Update procedure.
+ */
+
+bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height)
+{
+ /* Note that it's the responsibility of BlenderGPUDisplay to ensure updating and drawing
+ * the texture does not happen at the same time. This is achieved indirectly.
+ *
+ * When enabling the OpenGL context, it uses an internal mutex lock DST.gl_context_lock.
+ * This same lock is also held when do_draw() is called, which together ensure mutual
+ * exclusion.
+ *
+ * This locking is not performed at the GPU display level, because that would cause lock
+ * inversion. */
+ if (!gl_context_enable()) {
+ return false;
+ }
+
+ if (gl_render_sync_) {
+ glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
+ }
+
+ if (!gl_texture_resources_ensure()) {
+ gl_context_disable();
+ return false;
+ }
+
+ /* Update texture dimensions if needed. */
+ if (texture_.width != texture_width || texture_.height != texture_height) {
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+ glTexImage2D(
+ GL_TEXTURE_2D, 0, GL_RGBA16F, texture_width, texture_height, 0, GL_RGBA, GL_HALF_FLOAT, 0);
+ texture_.width = texture_width;
+ texture_.height = texture_height;
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ /* Texture did change, and no pixel storage was provided. Tag for an explicit zeroing out to
+ * avoid undefined content. */
+ texture_.need_clear = true;
+ }
+
+ /* Update PBO dimensions if needed.
+ *
+ * NOTE: Allocate the PBO for the size which will fit the final render resolution (as in,
+ * at a resolution divider of 1). This way we don't need to recreate graphics interoperability
+ * objects which are costly and which are tied to the specific underlying buffer size.
+ * The downside of this approach is that when graphics interoperability is not used we are
+ * sending too much data to the GPU when the resolution divider is not 1. */
+ /* TODO(sergey): Investigate whether keeping the PBO the exact size of the texture makes
+ * non-interop mode faster. */
+ const int buffer_width = params.full_size.x;
+ const int buffer_height = params.full_size.y;
+ if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) {
+ const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height;
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+ glBufferData(GL_PIXEL_UNPACK_BUFFER, size_in_bytes, 0, GL_DYNAMIC_DRAW);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ texture_.buffer_width = buffer_width;
+ texture_.buffer_height = buffer_height;
+ }
+
+ /* New content will be provided to the texture in one way or another, so mark this in a
+ * centralized place. */
+ texture_.need_update = true;
+
+ return true;
+}
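
The NOTE above explains why the PBO is allocated at the final render resolution rather than the (possibly divided) texture resolution. Below is a minimal standalone sketch of that allocation policy, with a plain std::vector standing in for the GL pixel buffer object; all names are illustrative and not part of Cycles.

#include <cstdio>
#include <vector>

/* Illustrative stand-ins for the texture and PBO bookkeeping. */
struct DisplayBuffers {
  int tex_width = 0, tex_height = 0;       /* follows the divided resolution */
  int buffer_width = 0, buffer_height = 0; /* always the full render resolution */
  std::vector<float> pbo;                  /* stand-in for the GL PBO storage */
};

/* Reallocate the buffer only when the full-resolution size changes, so objects
 * tied to the underlying buffer (such as graphics interop handles) stay valid
 * while the resolution divider varies. */
static void update_dimensions(DisplayBuffers &b, int full_w, int full_h, int resolution_divider)
{
  b.tex_width = full_w / resolution_divider;
  b.tex_height = full_h / resolution_divider;

  if (b.buffer_width != full_w || b.buffer_height != full_h) {
    b.pbo.assign(size_t(full_w) * full_h * 4, 0.0f); /* 4 channels per pixel */
    b.buffer_width = full_w;
    b.buffer_height = full_h;
    std::printf("buffer reallocated: %dx%d\n", full_w, full_h);
  }
}

int main()
{
  DisplayBuffers buffers;
  update_dimensions(buffers, 1920, 1080, 4); /* navigation: divider 4, buffer allocated once */
  update_dimensions(buffers, 1920, 1080, 1); /* final resolution: texture grows, buffer reused */
  return 0;
}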
+
+void BlenderGPUDisplay::do_update_end()
+{
+ gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ glFlush();
+
+ gl_context_disable();
+}
+
+/* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ */
+
+void BlenderGPUDisplay::do_copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
+{
+ /* This call copies pixels to a Pixel Buffer Object (PBO), which is much cheaper from a CPU time
+ * point of view than copying data directly to the OpenGL texture.
+ *
+ * The possible downside of this approach is that it might require a higher peak memory when
+ * doing partial updates of the texture (although, in practice even partial updates might peak
+ * with a full-frame buffer stored on the CPU if the GPU is currently occupied). */
+
+ half4 *mapped_rgba_pixels = map_texture_buffer();
+ if (!mapped_rgba_pixels) {
+ return;
+ }
+
+ if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width &&
+ pixels_height == texture_.height) {
+ const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height;
+ memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
+ }
+ else {
+ const half4 *rgba_row = rgba_pixels;
+ half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x;
+ for (int y = 0; y < pixels_height;
+ ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) {
+ memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
+ }
+ }
+
+ unmap_texture_buffer();
+}
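
The partial-update branch above writes one row at a time into the mapped buffer, advancing the source pointer by the update width and the destination pointer by the full texture width. A minimal, self-contained sketch of the same copy, with plain float arrays standing in for the half4 pixels and the mapped PBO (names are illustrative):

#include <cstring>
#include <vector>

/* Copy a pixels_width x pixels_height block into a destination image of
 * dest_width columns at offset (x, y); mirrors the row-by-row path above. */
static void copy_block(const float *src, float *dest, int dest_width,
                       int x, int y, int pixels_width, int pixels_height, int channels)
{
  const float *src_row = src;
  float *dest_row = dest + (size_t(y) * dest_width + x) * channels;
  for (int row = 0; row < pixels_height; ++row) {
    std::memcpy(dest_row, src_row, sizeof(float) * pixels_width * channels);
    src_row += size_t(pixels_width) * channels;
    dest_row += size_t(dest_width) * channels;
  }
}

int main()
{
  std::vector<float> dest(64 * 64 * 4, 0.0f); /* stand-in for the mapped PBO */
  std::vector<float> tile(16 * 16 * 4, 1.0f); /* partial update from the renderer */
  copy_block(tile.data(), dest.data(), 64, 8, 8, 16, 16, 4);
  return 0;
}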
+
+/* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ */
+
+half4 *BlenderGPUDisplay::do_map_texture_buffer()
+{
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+
+ half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>(
+ glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY));
+ if (!mapped_rgba_pixels) {
+ LOG(ERROR) << "Error mapping BlenderGPUDisplay pixel buffer object.";
+ }
+
+ if (texture_.need_clear) {
+ const int64_t texture_width = texture_.width;
+ const int64_t texture_height = texture_.height;
+ memset(reinterpret_cast<void *>(mapped_rgba_pixels),
+ 0,
+ texture_width * texture_height * sizeof(half4));
+ texture_.need_clear = false;
+ }
+
+ return mapped_rgba_pixels;
+}
+
+void BlenderGPUDisplay::do_unmap_texture_buffer()
+{
+ glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+}
+
+/* --------------------------------------------------------------------
+ * Graphics interoperability.
+ */
+
+DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get()
+{
+ DeviceGraphicsInteropDestination interop_dst;
+
+ interop_dst.buffer_width = texture_.buffer_width;
+ interop_dst.buffer_height = texture_.buffer_height;
+ interop_dst.opengl_pbo_id = texture_.gl_pbo_id;
+
+ interop_dst.need_clear = texture_.need_clear;
+ texture_.need_clear = false;
+
+ return interop_dst;
+}
+
+void BlenderGPUDisplay::graphics_interop_activate()
+{
+ gl_context_enable();
+}
+
+void BlenderGPUDisplay::graphics_interop_deactivate()
+{
+ gl_context_disable();
+}
+
+/* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+void BlenderGPUDisplay::clear()
+{
+ texture_.need_clear = true;
+}
+
+void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
+{
+ /* See do_update_begin() for why no locking is required here. */
+ const bool transparent = true; // TODO(sergey): Derive this from Film.
+
+ if (texture_.need_clear) {
+ /* The texture is requested to be cleared and was not yet cleared.
+ * Return early, which should be equivalent to drawing an all-zero texture. */
+ return;
+ }
+
+ if (!gl_draw_resources_ensure()) {
+ return;
+ }
+
+ if (use_gl_context_) {
+ gl_context_mutex_.lock();
+ }
+
+ if (gl_upload_sync_) {
+ glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
+ }
+
+ if (transparent) {
+ glEnable(GL_BLEND);
+ glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+ }
+
+ display_shader_->bind(params.full_size.x, params.full_size.y);
+
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+
+ glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_);
+
+ texture_update_if_needed();
+ vertex_buffer_update(params);
+
+ /* TODO(sergey): Does it make sense/possible to cache/reuse the VAO? */
+ GLuint vertex_array_object;
+ glGenVertexArrays(1, &vertex_array_object);
+ glBindVertexArray(vertex_array_object);
+
+ const int texcoord_attribute = display_shader_->get_tex_coord_attrib_location();
+ const int position_attribute = display_shader_->get_position_attrib_location();
+
+ glEnableVertexAttribArray(texcoord_attribute);
+ glEnableVertexAttribArray(position_attribute);
+
+ glVertexAttribPointer(
+ texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+ glVertexAttribPointer(position_attribute,
+ 2,
+ GL_FLOAT,
+ GL_FALSE,
+ 4 * sizeof(float),
+ (const GLvoid *)(sizeof(float) * 2));
+
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+ glBindBuffer(GL_ARRAY_BUFFER, 0);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ glDeleteVertexArrays(1, &vertex_array_object);
+
+ display_shader_->unbind();
+
+ if (transparent) {
+ glDisable(GL_BLEND);
+ }
+
+ gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ glFlush();
+
+ if (use_gl_context_) {
+ gl_context_mutex_.unlock();
+ }
+}
+
+void BlenderGPUDisplay::gl_context_create()
+{
+ /* When rendering in the viewport there is no render context available via the engine.
+ * Check here whether our own context is to be created.
+ *
+ * NOTE: If the `b_engine_`'s context is not available, we are expected to be on the main thread
+ * here. */
+ use_gl_context_ = !RE_engine_has_render_context(
+ reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+
+ if (use_gl_context_) {
+ const bool drw_state = DRW_opengl_context_release();
+ gl_context_ = WM_opengl_context_create();
+ if (gl_context_) {
+ /* On Windows an old context is restored after creation, and a subsequent release of the
+ * context generates a Win32 error. This is harmless for users, but it is annoying to have
+ * possibly misleading error prints in the console. */
+#ifndef _WIN32
+ WM_opengl_context_release(gl_context_);
+#endif
+ }
+ else {
+ LOG(ERROR) << "Error creating OpenGL context.";
+ }
+
+ DRW_opengl_context_activate(drw_state);
+ }
+}
+
+bool BlenderGPUDisplay::gl_context_enable()
+{
+ if (use_gl_context_) {
+ if (!gl_context_) {
+ return false;
+ }
+ gl_context_mutex_.lock();
+ WM_opengl_context_activate(gl_context_);
+ return true;
+ }
+
+ RE_engine_render_context_enable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+ return true;
+}
+
+void BlenderGPUDisplay::gl_context_disable()
+{
+ if (use_gl_context_) {
+ if (gl_context_) {
+ WM_opengl_context_release(gl_context_);
+ gl_context_mutex_.unlock();
+ }
+ return;
+ }
+
+ RE_engine_render_context_disable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+}
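
gl_context_enable() and gl_context_disable() pair either the engine's render context or the display's own context guarded by gl_context_mutex_. The sketch below models only that pairing and locking structure with a std::mutex; the ContextGuard type is hypothetical, and the real code calls the enable/disable functions explicitly rather than through RAII.

#include <mutex>

/* Minimal model of the two activation paths: when the engine has no render
 * context of its own, the display guards its private context with a mutex;
 * otherwise it defers to the engine's enable/disable calls. */
struct ContextGuard {
  ContextGuard(bool use_own_context, std::mutex &own_context_mutex)
      : use_own_(use_own_context), mutex_(own_context_mutex)
  {
    if (use_own_) {
      mutex_.lock(); /* corresponds to gl_context_mutex_.lock() + WM_opengl_context_activate() */
    }
    /* else: RE_engine_render_context_enable() in the real code */
  }
  ~ContextGuard()
  {
    if (use_own_) {
      mutex_.unlock(); /* corresponds to WM_opengl_context_release() + unlock */
    }
    /* else: RE_engine_render_context_disable() in the real code */
  }
  bool use_own_;
  std::mutex &mutex_;
};

int main()
{
  std::mutex gl_mutex;
  {
    ContextGuard guard(true, gl_mutex); /* viewport case: own context */
  }
  {
    ContextGuard guard(false, gl_mutex); /* final render case: engine context */
  }
  return 0;
}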
+
+void BlenderGPUDisplay::gl_context_dispose()
+{
+ if (gl_context_) {
+ const bool drw_state = DRW_opengl_context_release();
+
+ WM_opengl_context_activate(gl_context_);
+ WM_opengl_context_dispose(gl_context_);
+
+ DRW_opengl_context_activate(drw_state);
+ }
+}
+
+bool BlenderGPUDisplay::gl_draw_resources_ensure()
+{
+ if (!texture_.gl_id) {
+ /* If there is no texture allocated, there is nothing to draw. Inform the draw call that it
+ * cannot continue. Note that this is not an unrecoverable error, so once the texture is known
+ * we will come back here and create all the GPU resources needed for drawing. */
+ return false;
+ }
+
+ if (gl_draw_resource_creation_attempted_) {
+ return gl_draw_resources_created_;
+ }
+ gl_draw_resource_creation_attempted_ = true;
+
+ if (!vertex_buffer_) {
+ glGenBuffers(1, &vertex_buffer_);
+ if (!vertex_buffer_) {
+ LOG(ERROR) << "Error creating vertex buffer.";
+ return false;
+ }
+ }
+
+ gl_draw_resources_created_ = true;
+
+ return true;
+}
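
Both gl_draw_resources_ensure() and gl_texture_resources_ensure() use the same attempt-once pattern: remember that creation was attempted so a failure is not retried on every redraw. A minimal standalone sketch of that pattern (function and variable names are illustrative):

#include <cstdio>

static bool resources_created = false;
static bool creation_attempted = false;

static bool try_create() /* stand-in for the GL allocation calls */
{
  return true;
}

static bool resources_ensure()
{
  if (creation_attempted) {
    return resources_created; /* do not retry after a failed attempt */
  }
  creation_attempted = true;
  resources_created = try_create();
  return resources_created;
}

int main()
{
  std::printf("%d\n", resources_ensure()); /* attempts creation */
  std::printf("%d\n", resources_ensure()); /* returns the cached result */
  return 0;
}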
+
+void BlenderGPUDisplay::gl_resources_destroy()
+{
+ gl_context_enable();
+
+ if (vertex_buffer_ != 0) {
+ glDeleteBuffers(1, &vertex_buffer_);
+ }
+
+ if (texture_.gl_pbo_id) {
+ glDeleteBuffers(1, &texture_.gl_pbo_id);
+ texture_.gl_pbo_id = 0;
+ }
+
+ if (texture_.gl_id) {
+ glDeleteTextures(1, &texture_.gl_id);
+ texture_.gl_id = 0;
+ }
+
+ gl_context_disable();
+
+ gl_context_dispose();
+}
+
+bool BlenderGPUDisplay::gl_texture_resources_ensure()
+{
+ if (texture_.creation_attempted) {
+ return texture_.is_created;
+ }
+ texture_.creation_attempted = true;
+
+ DCHECK(!texture_.gl_id);
+ DCHECK(!texture_.gl_pbo_id);
+
+ /* Create texture. */
+ glGenTextures(1, &texture_.gl_id);
+ if (!texture_.gl_id) {
+ LOG(ERROR) << "Error creating texture.";
+ return false;
+ }
+
+ /* Configure the texture. */
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ /* Create PBO for the texture. */
+ glGenBuffers(1, &texture_.gl_pbo_id);
+ if (!texture_.gl_pbo_id) {
+ LOG(ERROR) << "Error creating texture pixel buffer object.";
+ return false;
+ }
+
+ /* Creation finished with a success. */
+ texture_.is_created = true;
+
+ return true;
+}
+
+void BlenderGPUDisplay::texture_update_if_needed()
+{
+ if (!texture_.need_update) {
+ return;
+ }
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+ glTexSubImage2D(
+ GL_TEXTURE_2D, 0, 0, 0, texture_.width, texture_.height, GL_RGBA, GL_HALF_FLOAT, 0);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ texture_.need_update = false;
+}
+
+void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams &params)
+{
+ /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
+ * rendered. */
+ glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+ float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
+ if (!vpointer) {
+ return;
+ }
+
+ vpointer[0] = 0.0f;
+ vpointer[1] = 0.0f;
+ vpointer[2] = params.offset.x;
+ vpointer[3] = params.offset.y;
+
+ vpointer[4] = 1.0f;
+ vpointer[5] = 0.0f;
+ vpointer[6] = (float)params.size.x + params.offset.x;
+ vpointer[7] = params.offset.y;
+
+ vpointer[8] = 1.0f;
+ vpointer[9] = 1.0f;
+ vpointer[10] = (float)params.size.x + params.offset.x;
+ vpointer[11] = (float)params.size.y + params.offset.y;
+
+ vpointer[12] = 0.0f;
+ vpointer[13] = 1.0f;
+ vpointer[14] = params.offset.x;
+ vpointer[15] = (float)params.size.y + params.offset.y;
+
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+}
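
The buffer filled above interleaves four floats per vertex, (u, v, x, y), which matches the two glVertexAttribPointer calls in do_draw(): a stride of four floats, texture coordinates at offset 0 and positions at an offset of two floats. A small standalone sketch that builds the same quad for a given display rectangle (struct and function names are illustrative):

#include <cstdio>

/* One vertex of the display quad: texture coordinate followed by window-space position. */
struct DisplayVertex {
  float u, v;
  float x, y;
};

static void fill_quad(DisplayVertex verts[4], float offset_x, float offset_y,
                      float size_x, float size_y)
{
  verts[0] = {0.0f, 0.0f, offset_x, offset_y};
  verts[1] = {1.0f, 0.0f, offset_x + size_x, offset_y};
  verts[2] = {1.0f, 1.0f, offset_x + size_x, offset_y + size_y};
  verts[3] = {0.0f, 1.0f, offset_x, offset_y + size_y};
}

int main()
{
  DisplayVertex quad[4];
  fill_quad(quad, 10.0f, 20.0f, 640.0f, 480.0f);
  for (const DisplayVertex &v : quad) {
    std::printf("uv=(%.0f, %.0f) pos=(%.0f, %.0f)\n", v.u, v.v, v.x, v.y);
  }
  return 0;
}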
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_gpu_display.h
new file mode 100644
index 00000000000..b7eddf0afa7
--- /dev/null
+++ b/intern/cycles/blender/blender_gpu_display.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "MEM_guardedalloc.h"
+
+#include "RNA_blender_cpp.h"
+
+#include "render/gpu_display.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Base class of shader used for GPU display rendering. */
+class BlenderDisplayShader {
+ public:
+ static constexpr const char *position_attribute_name = "pos";
+ static constexpr const char *tex_coord_attribute_name = "texCoord";
+
+ /* Create shader implementation suitable for the given render engine and scene configuration. */
+ static unique_ptr<BlenderDisplayShader> create(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+
+ BlenderDisplayShader() = default;
+ virtual ~BlenderDisplayShader() = default;
+
+ virtual void bind(int width, int height) = 0;
+ virtual void unbind() = 0;
+
+ /* Get attribute location for position and texture coordinate respectively.
+ * NOTE: The shader needs to be bound to have access to those. */
+ virtual int get_position_attrib_location();
+ virtual int get_tex_coord_attrib_location();
+
+ protected:
+ /* Get program of this display shader.
+ * NOTE: The shader needs to be bound to have access to this. */
+ virtual uint get_shader_program() = 0;
+
+ /* Cached values of various OpenGL resources. */
+ int position_attribute_location_ = -1;
+ int tex_coord_attribute_location_ = -1;
+};
+
+/* Implementation of the display rendering shader used when the render engine does not support a
+ * display space shader. */
+class BlenderFallbackDisplayShader : public BlenderDisplayShader {
+ public:
+ virtual void bind(int width, int height) override;
+ virtual void unbind() override;
+
+ protected:
+ virtual uint get_shader_program() override;
+
+ void create_shader_if_needed();
+ void destroy_shader();
+
+ uint shader_program_ = 0;
+ int image_texture_location_ = -1;
+ int fullscreen_location_ = -1;
+
+ /* Whether shader compilation was attempted. If it was and the shader program is 0, then
+ * compilation or linking has failed; do not attempt to re-compile the shader. */
+ bool shader_compile_attempted_ = false;
+};
+
+class BlenderDisplaySpaceShader : public BlenderDisplayShader {
+ public:
+ BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+
+ virtual void bind(int width, int height) override;
+ virtual void unbind() override;
+
+ protected:
+ virtual uint get_shader_program() override;
+
+ BL::RenderEngine b_engine_;
+ BL::Scene &b_scene_;
+
+ /* Cached values of various OpenGL resources. */
+ uint shader_program_ = 0;
+};
+
+/* GPU display implementation which is specific for Blender viewport integration. */
+class BlenderGPUDisplay : public GPUDisplay {
+ public:
+ BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+ ~BlenderGPUDisplay();
+
+ virtual void graphics_interop_activate() override;
+ virtual void graphics_interop_deactivate() override;
+
+ virtual void clear() override;
+
+ protected:
+ virtual bool do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height) override;
+ virtual void do_update_end() override;
+
+ virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
+ int texture_x,
+ int texture_y,
+ int pixels_width,
+ int pixels_height) override;
+ virtual void do_draw(const GPUDisplayParams &params) override;
+
+ virtual half4 *do_map_texture_buffer() override;
+ virtual void do_unmap_texture_buffer() override;
+
+ virtual DeviceGraphicsInteropDestination do_graphics_interop_get() override;
+
+ /* Helper function which allocates new GPU context. */
+ void gl_context_create();
+ bool gl_context_enable();
+ void gl_context_disable();
+ void gl_context_dispose();
+
+ /* Make sure texture is allocated and its initial configuration is performed. */
+ bool gl_texture_resources_ensure();
+
+ /* Ensure all runtime GPU resources needed for drawing are allocated.
+ * Returns true if all resources needed for drawing are available. */
+ bool gl_draw_resources_ensure();
+
+ /* Destroy all GPU resources which are being used by this object. */
+ void gl_resources_destroy();
+
+ /* Update GPU texture dimensions and content if needed (new pixel data was provided).
+ *
+ * NOTE: The texture needs to be bound. */
+ void texture_update_if_needed();
+
+ /* Update vertex buffer with new coordinates of vertex positions and texture coordinates.
+ * This buffer is used to render texture in the viewport.
+ *
+ * NOTE: The buffer needs to be bound. */
+ void vertex_buffer_update(const GPUDisplayParams &params);
+
+ BL::RenderEngine b_engine_;
+
+ /* OpenGL context which is used when the render engine doesn't have its own. */
+ void *gl_context_ = nullptr;
+ /* True when the Blender RenderEngine side context is not available and the GPUDisplay is to
+ * create its own context. */
+ bool use_gl_context_ = false;
+ /* Mutex used to guard the `gl_context_`. */
+ thread_mutex gl_context_mutex_;
+
+ /* Texture which contains pixels of the render result. */
+ struct {
+ /* Indicates whether texture creation was attempted and whether it succeeded.
+ * Used to avoid multiple attempts at texture creation on GPU issues or GPU context
+ * misconfiguration. */
+ bool creation_attempted = false;
+ bool is_created = false;
+
+ /* OpenGL resource IDs of the texture itself and Pixel Buffer Object (PBO) used to write
+ * pixels to it.
+ *
+ * NOTE: Allocated on the engine's context. */
+ uint gl_id = 0;
+ uint gl_pbo_id = 0;
+
+ /* Is true when new data was written to the PBO, meaning the texture might need to be resized
+ * and new data uploaded to the GPU. */
+ bool need_update = false;
+
+ /* Content of the texture is to be filled with zeroes. */
+ std::atomic<bool> need_clear = true;
+
+ /* Dimensions of the texture in pixels. */
+ int width = 0;
+ int height = 0;
+
+ /* Dimensions of the underlying PBO. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+ } texture_;
+
+ unique_ptr<BlenderDisplayShader> display_shader_;
+
+ /* Tracks whether creation of the GPU draw resources was attempted, to avoid re-attempting the
+ * creation on every redraw after a failure. */
+ bool gl_draw_resource_creation_attempted_ = false;
+ bool gl_draw_resources_created_ = false;
+
+ /* Vertex buffer which holds vertices of a triangle fan that is textured with the texture
+ * holding the render result. */
+ uint vertex_buffer_ = 0;
+
+ void *gl_render_sync_ = nullptr;
+ void *gl_upload_sync_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_light.cpp b/intern/cycles/blender/blender_light.cpp
index 542028f4b2f..4df1e720dde 100644
--- a/intern/cycles/blender/blender_light.cpp
+++ b/intern/cycles/blender/blender_light.cpp
@@ -125,17 +125,10 @@ void BlenderSync::sync_light(BL::Object &b_parent,
light->set_shader(static_cast<Shader *>(used_shaders[0]));
/* shadow */
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles");
light->set_cast_shadow(get_boolean(clight, "cast_shadow"));
light->set_use_mis(get_boolean(clight, "use_multiple_importance_sampling"));
- int samples = get_int(clight, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->set_samples(samples * samples);
- else
- light->set_samples(samples);
-
light->set_max_bounces(get_int(clight, "max_bounces"));
if (b_ob_info.real_object != b_ob_info.iter_object) {
@@ -155,10 +148,12 @@ void BlenderSync::sync_light(BL::Object &b_parent,
/* visibility */
uint visibility = object_ray_visibility(b_ob_info.real_object);
+ light->set_use_camera((visibility & PATH_RAY_CAMERA) != 0);
light->set_use_diffuse((visibility & PATH_RAY_DIFFUSE) != 0);
light->set_use_glossy((visibility & PATH_RAY_GLOSSY) != 0);
light->set_use_transmission((visibility & PATH_RAY_TRANSMIT) != 0);
light->set_use_scatter((visibility & PATH_RAY_VOLUME_SCATTER) != 0);
+ light->set_is_shadow_catcher(b_ob_info.real_object.is_shadow_catcher());
/* tag */
light->tag_update(scene);
@@ -169,7 +164,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
BL::World b_world = b_scene.world();
if (b_world) {
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM };
@@ -197,12 +191,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
/* force enable light again when world is resynced */
light->set_is_enabled(true);
- int samples = get_int(cworld, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->set_samples(samples * samples);
- else
- light->set_samples(samples);
-
light->tag_update(scene);
light_map.set_recalc(b_world);
}
@@ -211,7 +199,7 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
world_map = b_world.ptr.data;
world_recalc = false;
- viewport_parameters = BlenderViewportParameters(b_v3d);
+ viewport_parameters = BlenderViewportParameters(b_v3d, use_developer_ui);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 22d6edeb099..95da4a2df84 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -568,7 +568,7 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
/* object loop */
bool cancel = false;
bool use_portal = false;
- const bool show_lights = BlenderViewportParameters(b_v3d).use_scene_lights;
+ const bool show_lights = BlenderViewportParameters(b_v3d, use_developer_ui).use_scene_lights;
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
BL::Depsgraph::object_instances_iterator b_instance_iter;
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 6e06b6a468f..694d8454422 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -45,10 +45,6 @@
# include <OSL/oslquery.h>
#endif
-#ifdef WITH_OPENCL
-# include "device/device_intern.h"
-#endif
-
CCL_NAMESPACE_BEGIN
namespace {
@@ -72,12 +68,10 @@ PyObject *pyunicode_from_string(const char *str)
/* Synchronize debug flags from a given Blender scene.
* Return truth when device list needs invalidation.
*/
-bool debug_flags_sync_from_scene(BL::Scene b_scene)
+static void debug_flags_sync_from_scene(BL::Scene b_scene)
{
DebugFlagsRef flags = DebugFlags();
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
- /* Backup some settings for comparison. */
- DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
/* Synchronize shared flags. */
flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type");
/* Synchronize CPU flags. */
@@ -87,50 +81,19 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
- flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
- flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
/* Synchronize OptiX flags. */
- flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
- flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api");
- /* Synchronize OpenCL device type. */
- switch (get_enum(cscene, "debug_opencl_device_type")) {
- case 0:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE;
- break;
- case 1:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ALL;
- break;
- case 2:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_DEFAULT;
- break;
- case 3:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_CPU;
- break;
- case 4:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_GPU;
- break;
- case 5:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR;
- break;
- }
- /* Synchronize other OpenCL flags. */
- flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
- flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit")) * 1024 * 1024;
- return flags.opencl.device_type != opencl_device_type;
+ flags.optix.use_debug = get_boolean(cscene, "debug_use_optix_debug");
}
/* Reset debug flags to default values.
* Return truth when device list needs invalidation.
*/
-bool debug_flags_reset()
+static void debug_flags_reset()
{
DebugFlagsRef flags = DebugFlags();
- /* Backup some settings for comparison. */
- DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
flags.reset();
- return flags.opencl.device_type != opencl_device_type;
}
} /* namespace */
@@ -175,18 +138,20 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
static PyObject *init_func(PyObject * /*self*/, PyObject *args)
{
- PyObject *path, *user_path;
+ PyObject *path, *user_path, *temp_path;
int headless;
- if (!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) {
- return NULL;
+ if (!PyArg_ParseTuple(args, "OOOi", &path, &user_path, &temp_path, &headless)) {
+ return nullptr;
}
- PyObject *path_coerce = NULL, *user_path_coerce = NULL;
+ PyObject *path_coerce = nullptr, *user_path_coerce = nullptr, *temp_path_coerce = nullptr;
path_init(PyC_UnicodeAsByte(path, &path_coerce),
- PyC_UnicodeAsByte(user_path, &user_path_coerce));
+ PyC_UnicodeAsByte(user_path, &user_path_coerce),
+ PyC_UnicodeAsByte(temp_path, &temp_path_coerce));
Py_XDECREF(path_coerce);
Py_XDECREF(user_path_coerce);
+ Py_XDECREF(temp_path_coerce);
BlenderSession::headless = headless;
@@ -299,6 +264,50 @@ static PyObject *render_func(PyObject * /*self*/, PyObject *args)
Py_RETURN_NONE;
}
+static PyObject *render_frame_finish_func(PyObject * /*self*/, PyObject *args)
+{
+ PyObject *pysession;
+
+ if (!PyArg_ParseTuple(args, "O", &pysession)) {
+ return nullptr;
+ }
+
+ BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(pysession);
+
+ /* Allow Blender to execute other Python scripts. */
+ python_thread_state_save(&session->python_thread_state);
+
+ session->render_frame_finish();
+
+ python_thread_state_restore(&session->python_thread_state);
+
+ Py_RETURN_NONE;
+}
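
render_frame_finish_func() follows the same convention as the other entry points: release the Python thread state around long-running C++ work so Blender can keep executing other scripts, then restore it before returning. A minimal sketch of that convention expressed directly with the CPython API (the real code goes through the python_thread_state_save/restore helpers, whose exact implementation is not shown here; example_func is a hypothetical name):

#include <Python.h>

static void long_running_render_work()
{
  /* Placeholder for work such as a session render call. */
}

static PyObject *example_func(PyObject * /*self*/, PyObject * /*args*/)
{
  /* Release the GIL so other Python threads can run while we work. */
  PyThreadState *state = PyEval_SaveThread();

  long_running_render_work();

  /* Re-acquire the GIL before touching Python objects again. */
  PyEval_RestoreThread(state);

  Py_RETURN_NONE;
}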
+
+static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
+{
+ PyObject *py_session, *py_graph, *py_screen, *py_space_image;
+
+ if (!PyArg_ParseTuple(args, "OOOO", &py_session, &py_graph, &py_screen, &py_space_image)) {
+ return nullptr;
+ }
+
+ BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(py_session);
+
+ ID *b_screen = (ID *)PyLong_AsVoidPtr(py_screen);
+
+ PointerRNA b_space_image_ptr;
+ RNA_pointer_create(b_screen,
+ &RNA_SpaceImageEditor,
+ pylong_as_voidptr_typesafe(py_space_image),
+ &b_space_image_ptr);
+ BL::SpaceImageEditor b_space_image(b_space_image_ptr);
+
+ session->draw(b_space_image);
+
+ Py_RETURN_NONE;
+}
+
/* pixel_array and result passed as pointers */
static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
{
@@ -336,7 +345,7 @@ static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
Py_RETURN_NONE;
}
-static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
+static PyObject *view_draw_func(PyObject * /*self*/, PyObject *args)
{
PyObject *pysession, *pygraph, *pyv3d, *pyrv3d;
@@ -350,7 +359,7 @@ static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
int viewport[4];
glGetIntegerv(GL_VIEWPORT, viewport);
- session->draw(viewport[2], viewport[3]);
+ session->view_draw(viewport[2], viewport[3]);
}
Py_RETURN_NONE;
@@ -697,40 +706,6 @@ static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/)
return pyunicode_from_string(system_info.c_str());
}
-#ifdef WITH_OPENCL
-static PyObject *opencl_disable_func(PyObject * /*self*/, PyObject * /*value*/)
-{
- VLOG(2) << "Disabling OpenCL platform.";
- DebugFlags().opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE;
- Py_RETURN_NONE;
-}
-
-static PyObject *opencl_compile_func(PyObject * /*self*/, PyObject *args)
-{
- PyObject *sequence = PySequence_Fast(args, "Arguments must be a sequence");
- if (sequence == NULL) {
- Py_RETURN_FALSE;
- }
-
- vector<string> parameters;
- for (Py_ssize_t i = 0; i < PySequence_Fast_GET_SIZE(sequence); i++) {
- PyObject *item = PySequence_Fast_GET_ITEM(sequence, i);
- PyObject *item_as_string = PyObject_Str(item);
- const char *parameter_string = PyUnicode_AsUTF8(item_as_string);
- parameters.push_back(parameter_string);
- Py_DECREF(item_as_string);
- }
- Py_DECREF(sequence);
-
- if (device_opencl_compile_kernel(parameters)) {
- Py_RETURN_TRUE;
- }
- else {
- Py_RETURN_FALSE;
- }
-}
-#endif
-
static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepaths)
{
if (PyUnicode_Check(pyfilepaths)) {
@@ -762,6 +737,10 @@ static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepat
static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *keywords)
{
+#if 1
+ (void)args;
+ (void)keywords;
+#else
static const char *keyword_list[] = {
"preferences", "scene", "view_layer", "input", "output", "tile_size", "samples", NULL};
PyObject *pypreferences, *pyscene, *pyviewlayer;
@@ -835,7 +814,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
}
/* Create denoiser. */
- Denoiser denoiser(device);
+ DenoiserPipeline denoiser(device);
denoiser.params = params;
denoiser.input = input;
denoiser.output = output;
@@ -852,6 +831,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
PyErr_SetString(PyExc_ValueError, denoiser.error.c_str());
return NULL;
}
+#endif
Py_RETURN_NONE;
}
@@ -903,10 +883,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)
RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr);
BL::Scene b_scene(sceneptr);
- if (debug_flags_sync_from_scene(b_scene)) {
- VLOG(2) << "Tagging device list for update.";
- Device::tag_update();
- }
+ debug_flags_sync_from_scene(b_scene);
VLOG(2) << "Debug flags set to:\n" << DebugFlags();
@@ -917,10 +894,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)
static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/)
{
- if (debug_flags_reset()) {
- VLOG(2) << "Tagging device list for update.";
- Device::tag_update();
- }
+ debug_flags_reset();
if (debug_flags_set) {
VLOG(2) << "Debug flags reset to:\n" << DebugFlags();
debug_flags_set = false;
@@ -928,84 +902,6 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
Py_RETURN_NONE;
}
-static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
-{
- int num_resumable_chunks, current_resumable_chunk;
- if (!PyArg_ParseTuple(args, "ii", &num_resumable_chunks, &current_resumable_chunk)) {
- Py_RETURN_NONE;
- }
-
- if (num_resumable_chunks <= 0) {
- fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (current_resumable_chunk < 1 || current_resumable_chunk > num_resumable_chunks) {
- fprintf(stderr, "Cycles: Bad value for current resumable chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
-
- VLOG(1) << "Initialized resumable render: "
- << "num_resumable_chunks=" << num_resumable_chunks << ", "
- << "current_resumable_chunk=" << current_resumable_chunk;
- BlenderSession::num_resumable_chunks = num_resumable_chunks;
- BlenderSession::current_resumable_chunk = current_resumable_chunk;
-
- printf("Cycles: Will render chunk %d of %d\n", current_resumable_chunk, num_resumable_chunks);
-
- Py_RETURN_NONE;
-}
-
-static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
-{
- int num_chunks, start_chunk, end_chunk;
- if (!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) {
- Py_RETURN_NONE;
- }
-
- if (num_chunks <= 0) {
- fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (start_chunk < 1 || start_chunk > num_chunks) {
- fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (end_chunk < 1 || end_chunk > num_chunks) {
- fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (start_chunk > end_chunk) {
- fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
- abort();
- Py_RETURN_NONE;
- }
-
- VLOG(1) << "Initialized resumable render: "
- << "num_resumable_chunks=" << num_chunks << ", "
- << "start_resumable_chunk=" << start_chunk << "end_resumable_chunk=" << end_chunk;
- BlenderSession::num_resumable_chunks = num_chunks;
- BlenderSession::start_resumable_chunk = start_chunk;
- BlenderSession::end_resumable_chunk = end_chunk;
-
- printf("Cycles: Will render chunks %d to %d of %d\n", start_chunk, end_chunk, num_chunks);
-
- Py_RETURN_NONE;
-}
-
-static PyObject *clear_resumable_chunk_func(PyObject * /*self*/, PyObject * /*value*/)
-{
- VLOG(1) << "Clear resumable render";
- BlenderSession::num_resumable_chunks = 0;
- BlenderSession::current_resumable_chunk = 0;
-
- Py_RETURN_NONE;
-}
-
static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*/)
{
BlenderSession::print_render_stats = true;
@@ -1015,16 +911,14 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
- bool has_cuda = false, has_optix = false, has_opencl = false;
+ bool has_cuda = false, has_optix = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX);
- has_opencl |= (device_type == DEVICE_OPENCL);
}
- PyObject *list = PyTuple_New(3);
+ PyObject *list = PyTuple_New(2);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
- PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl));
return list;
}
@@ -1044,9 +938,6 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
if (override == "CPU") {
BlenderSession::device_override = DEVICE_MASK_CPU;
}
- else if (override == "OPENCL") {
- BlenderSession::device_override = DEVICE_MASK_OPENCL;
- }
else if (override == "CUDA") {
BlenderSession::device_override = DEVICE_MASK_CUDA;
}
@@ -1072,8 +963,10 @@ static PyMethodDef methods[] = {
{"create", create_func, METH_VARARGS, ""},
{"free", free_func, METH_O, ""},
{"render", render_func, METH_VARARGS, ""},
- {"bake", bake_func, METH_VARARGS, ""},
+ {"render_frame_finish", render_frame_finish_func, METH_VARARGS, ""},
{"draw", draw_func, METH_VARARGS, ""},
+ {"bake", bake_func, METH_VARARGS, ""},
+ {"view_draw", view_draw_func, METH_VARARGS, ""},
{"sync", sync_func, METH_VARARGS, ""},
{"reset", reset_func, METH_VARARGS, ""},
#ifdef WITH_OSL
@@ -1082,10 +975,6 @@ static PyMethodDef methods[] = {
#endif
{"available_devices", available_devices_func, METH_VARARGS, ""},
{"system_info", system_info_func, METH_NOARGS, ""},
-#ifdef WITH_OPENCL
- {"opencl_disable", opencl_disable_func, METH_NOARGS, ""},
- {"opencl_compile", opencl_compile_func, METH_VARARGS, ""},
-#endif
/* Standalone denoising */
{"denoise", (PyCFunction)denoise_func, METH_VARARGS | METH_KEYWORDS, ""},
@@ -1098,11 +987,6 @@ static PyMethodDef methods[] = {
/* Statistics. */
{"enable_print_stats", enable_print_stats_func, METH_NOARGS, ""},
- /* Resumable render */
- {"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
- {"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
- {"clear_resumable_chunk", clear_resumable_chunk_func, METH_NOARGS, ""},
-
/* Compute Device selection */
{"get_device_types", get_device_types_func, METH_VARARGS, ""},
{"set_device_override", set_device_override_func, METH_O, ""},
@@ -1153,14 +1037,6 @@ void *CCL_python_module_init()
PyModule_AddStringConstant(mod, "osl_version_string", "unknown");
#endif
-#ifdef WITH_NETWORK
- PyModule_AddObject(mod, "with_network", Py_True);
- Py_INCREF(Py_True);
-#else /* WITH_NETWORK */
- PyModule_AddObject(mod, "with_network", Py_False);
- Py_INCREF(Py_False);
-#endif /* WITH_NETWORK */
-
#ifdef WITH_EMBREE
PyModule_AddObject(mod, "with_embree", Py_True);
Py_INCREF(Py_True);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 29de886e4ff..5aafa605526 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -38,9 +38,11 @@
#include "util/util_hash.h"
#include "util/util_logging.h"
#include "util/util_murmurhash.h"
+#include "util/util_path.h"
#include "util/util_progress.h"
#include "util/util_time.h"
+#include "blender/blender_gpu_display.h"
#include "blender/blender_session.h"
#include "blender/blender_sync.h"
#include "blender/blender_util.h"
@@ -49,10 +51,6 @@ CCL_NAMESPACE_BEGIN
DeviceTypeMask BlenderSession::device_override = DEVICE_MASK_ALL;
bool BlenderSession::headless = false;
-int BlenderSession::num_resumable_chunks = 0;
-int BlenderSession::current_resumable_chunk = 0;
-int BlenderSession::start_resumable_chunk = 0;
-int BlenderSession::end_resumable_chunk = 0;
bool BlenderSession::print_render_stats = false;
BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
@@ -103,7 +101,9 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
width(width),
height(height),
preview_osl(false),
- python_thread_state(NULL)
+ python_thread_state(NULL),
+ use_developer_ui(b_userpref.experimental().use_cycles_debug() &&
+ b_userpref.view().show_developer_ui())
{
/* 3d view render */
background = false;
@@ -119,10 +119,10 @@ BlenderSession::~BlenderSession()
void BlenderSession::create_session()
{
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
/* reset status/progress */
last_status = "";
@@ -131,20 +131,18 @@ void BlenderSession::create_session()
start_resize_time = 0.0;
/* create session */
- session = new Session(session_params);
- session->scene = scene;
+ session = new Session(session_params, scene_params);
session->progress.set_update_callback(function_bind(&BlenderSession::tag_redraw, this));
session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this));
session->set_pause(session_pause);
/* create scene */
- scene = new Scene(scene_params, session->device);
+ scene = session->scene;
scene->name = b_scene.name();
- session->scene = scene;
-
/* create sync */
- sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
+ sync = new BlenderSync(
+ b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress);
BL::Object b_camera_override(b_engine.camera_override());
if (b_v3d) {
sync->sync_view(b_v3d, b_rv3d, width, height);
@@ -154,13 +152,23 @@ void BlenderSession::create_session()
}
/* set buffer parameters */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
- session->reset(buffer_params, session_params.samples);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
+ session->reset(session_params, buffer_params);
- b_engine.use_highlight_tiles(session_params.progressive_refine == false);
+ /* Create GPU display. */
+ if (!b_engine.is_preview() && !headless) {
+ session->set_gpu_display(make_unique<BlenderGPUDisplay>(b_engine, b_scene));
+ }
- update_resumable_tile_manager(session_params.samples);
+ /* Viewport and preview (as in, material preview) rendering is not tiled, so the engine can be
+ * informed that no tracking of the tile state is needed.
+ * Offline rendering will make a decision when a tile is being written. The penalty of asking
+ * the engine to keep track of the tile state is minimal, so a possible single-tiled final
+ * render is nothing to worry about here. */
+ if (!b_engine.is_preview() && !b_v3d) {
+ b_engine.use_highlight_tiles(true);
+ }
}
void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsgraph)
@@ -202,9 +210,9 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
return;
}
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
if (scene->params.modified(scene_params) || session->params.modified(session_params) ||
!this->b_render.use_persistent_data()) {
@@ -220,8 +228,6 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
session->progress.reset();
- session->tile_manager.set_tile_order(session_params.tile_order);
-
/* peak memory usage should show current render peak, not peak for all renders
* made by this render session
*/
@@ -230,7 +236,8 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
if (is_new_session) {
/* Sync object should be re-created for new scene. */
delete sync;
- sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
+ sync = new BlenderSync(
+ b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress);
}
else {
/* Sync recalculations to do just the required updates. */
@@ -242,103 +249,85 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL);
BL::RegionView3D b_null_region_view3d(PointerRNA_NULL);
- BufferParams buffer_params = BlenderSync::get_buffer_params(b_null_space_view3d,
- b_null_region_view3d,
- scene->camera,
- width,
- height,
- session_params.denoising.use);
- session->reset(buffer_params, session_params.samples);
-
- b_engine.use_highlight_tiles(session_params.progressive_refine == false);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_null_space_view3d, b_null_region_view3d, scene->camera, width, height);
+ session->reset(session_params, buffer_params);
/* reset time */
start_resize_time = 0.0;
+
+ {
+ thread_scoped_lock lock(draw_state_.mutex);
+ draw_state_.last_pass_index = -1;
+ }
}
void BlenderSession::free_session()
{
- session->cancel();
+ if (session) {
+ session->cancel(true);
+ }
delete sync;
+ sync = nullptr;
+
delete session;
+ session = nullptr;
}
-static ShaderEvalType get_shader_type(const string &pass_type)
+void BlenderSession::read_render_tile()
{
- const char *shader_type = pass_type.c_str();
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
- /* data passes */
- if (strcmp(shader_type, "NORMAL") == 0)
- return SHADER_EVAL_NORMAL;
- else if (strcmp(shader_type, "UV") == 0)
- return SHADER_EVAL_UV;
- else if (strcmp(shader_type, "ROUGHNESS") == 0)
- return SHADER_EVAL_ROUGHNESS;
- else if (strcmp(shader_type, "DIFFUSE_COLOR") == 0)
- return SHADER_EVAL_DIFFUSE_COLOR;
- else if (strcmp(shader_type, "GLOSSY_COLOR") == 0)
- return SHADER_EVAL_GLOSSY_COLOR;
- else if (strcmp(shader_type, "TRANSMISSION_COLOR") == 0)
- return SHADER_EVAL_TRANSMISSION_COLOR;
- else if (strcmp(shader_type, "EMIT") == 0)
- return SHADER_EVAL_EMISSION;
+ /* get render result */
+ BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
+ tile_offset.y,
+ tile_size.x,
+ tile_size.y,
+ b_rlay_name.c_str(),
+ b_rview_name.c_str());
- /* light passes */
- else if (strcmp(shader_type, "AO") == 0)
- return SHADER_EVAL_AO;
- else if (strcmp(shader_type, "COMBINED") == 0)
- return SHADER_EVAL_COMBINED;
- else if (strcmp(shader_type, "SHADOW") == 0)
- return SHADER_EVAL_SHADOW;
- else if (strcmp(shader_type, "DIFFUSE") == 0)
- return SHADER_EVAL_DIFFUSE;
- else if (strcmp(shader_type, "GLOSSY") == 0)
- return SHADER_EVAL_GLOSSY;
- else if (strcmp(shader_type, "TRANSMISSION") == 0)
- return SHADER_EVAL_TRANSMISSION;
+ /* can happen if the intersected rectangle gives 0 width or height */
+ if (b_rr.ptr.data == NULL) {
+ return;
+ }
- /* extra */
- else if (strcmp(shader_type, "ENVIRONMENT") == 0)
- return SHADER_EVAL_ENVIRONMENT;
+ BL::RenderResult::layers_iterator b_single_rlay;
+ b_rr.layers.begin(b_single_rlay);
- else
- return SHADER_EVAL_BAKE;
-}
+ /* layer will be missing if it was disabled in the UI */
+ if (b_single_rlay == b_rr.layers.end())
+ return;
-static BL::RenderResult begin_render_result(BL::RenderEngine &b_engine,
- int x,
- int y,
- int w,
- int h,
- const char *layername,
- const char *viewname)
-{
- return b_engine.begin_result(x, y, w, h, layername, viewname);
-}
+ BL::RenderLayer b_rlay = *b_single_rlay;
-static void end_render_result(BL::RenderEngine &b_engine,
- BL::RenderResult &b_rr,
- bool cancel,
- bool highlight,
- bool do_merge_results)
-{
- b_engine.end_result(b_rr, (int)cancel, (int)highlight, (int)do_merge_results);
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
+
+ /* Copy each pass.
+ * TODO: Copy only the required ones for better performance? */
+ for (BL::RenderPass &b_pass : b_rlay.passes) {
+ session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect());
+ }
}
-void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
- bool do_update_only,
- bool do_read_only,
- bool highlight)
+void BlenderSession::write_render_tile()
{
- int x = rtile.x - session->tile_manager.params.full_x;
- int y = rtile.y - session->tile_manager.params.full_y;
- int w = rtile.w;
- int h = rtile.h;
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
+
+ const string_view render_layer_name = session->get_render_tile_layer();
+ const string_view render_view_name = session->get_render_tile_view();
+
+ b_engine.tile_highlight_clear_all();
/* get render result */
- BL::RenderResult b_rr = begin_render_result(
- b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str());
+ BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
+ tile_offset.y,
+ tile_size.x,
+ tile_size.y,
+ render_layer_name.c_str(),
+ render_view_name.c_str());
/* can happen if the intersected rectangle gives 0 width or height */
if (b_rr.ptr.data == NULL) {
@@ -349,64 +338,34 @@ void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
b_rr.layers.begin(b_single_rlay);
/* layer will be missing if it was disabled in the UI */
- if (b_single_rlay == b_rr.layers.end())
+ if (b_single_rlay == b_rr.layers.end()) {
return;
+ }
BL::RenderLayer b_rlay = *b_single_rlay;
- if (do_read_only) {
- /* copy each pass */
- for (BL::RenderPass &b_pass : b_rlay.passes) {
- /* find matching pass type */
- PassType pass_type = BlenderSync::get_pass_type(b_pass);
- int components = b_pass.channels();
-
- rtile.buffers->set_pass_rect(
- pass_type, components, (float *)b_pass.rect(), rtile.num_samples);
- }
-
- end_render_result(b_engine, b_rr, false, false, false);
- }
- else if (do_update_only) {
- /* Sample would be zero at initial tile update, which is only needed
- * to tag tile form blender side as IN PROGRESS for proper highlight
- * no buffers should be sent to blender yet. For denoise we also
- * keep showing the noisy buffers until denoise is done. */
- bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE);
+ write_render_result(b_rlay);
- if (merge) {
- update_render_result(b_rlay, rtile);
- }
-
- end_render_result(b_engine, b_rr, true, highlight, merge);
- }
- else {
- /* Write final render result. */
- write_render_result(b_rlay, rtile);
- end_render_result(b_engine, b_rr, false, false, true);
- }
+ b_engine.end_result(b_rr, true, false, true);
}
-void BlenderSession::read_render_tile(RenderTile &rtile)
+void BlenderSession::update_render_tile()
{
- do_write_update_render_tile(rtile, false, true, false);
-}
+ if (!session->has_multiple_render_tiles()) {
+ /* Don't highlight full-frame tile. */
+ return;
+ }
-void BlenderSession::write_render_tile(RenderTile &rtile)
-{
- do_write_update_render_tile(rtile, false, false, false);
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
+
+ b_engine.tile_highlight_clear_all();
+ b_engine.tile_highlight_set(tile_offset.x, tile_offset.y, tile_size.x, tile_size.y, true);
}
-void BlenderSession::update_render_tile(RenderTile &rtile, bool highlight)
+void BlenderSession::full_buffer_written(string_view filename)
{
- /* use final write for preview renders, otherwise render result wouldn't be
- * be updated in blender side
- * would need to be investigated a bit further, but for now shall be fine
- */
- if (!b_engine.is_preview())
- do_write_update_render_tile(rtile, true, false, highlight);
- else
- do_write_update_render_tile(rtile, false, false, false);
+ full_buffer_files_.emplace_back(filename);
}
static void add_cryptomatte_layer(BL::RenderResult &b_rr, string name, string manifest)
@@ -430,12 +389,15 @@ void BlenderSession::stamp_view_layer_metadata(Scene *scene, const string &view_
to_string(session->params.samples).c_str());
/* Store ranged samples information. */
+ /* TODO(sergey): Need to bring this information back. */
+#if 0
if (session->tile_manager.range_num_samples != -1) {
b_rr.stamp_data_add_field((prefix + "range_start_sample").c_str(),
to_string(session->tile_manager.range_start_sample).c_str());
b_rr.stamp_data_add_field((prefix + "range_num_samples").c_str(),
to_string(session->tile_manager.range_num_samples).c_str());
}
+#endif
/* Write cryptomatte metadata. */
if (scene->film->get_cryptomatte_passes() & CRYPT_OBJECT) {
@@ -475,38 +437,44 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
}
/* set callback to write out render results */
- session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
- session->update_render_tile_cb = function_bind(
- &BlenderSession::update_render_tile, this, _1, _2);
+ session->write_render_tile_cb = [&]() { write_render_tile(); };
+
+ /* Use final write for preview renders, otherwise the render result wouldn't be updated on the
+ * Blender side. */
+ /* TODO(sergey): Investigate whether GPUDisplay can be used for the preview as well. */
+ if (b_engine.is_preview()) {
+ session->update_render_tile_cb = [&]() { write_render_tile(); };
+ }
+ else {
+ session->update_render_tile_cb = [&]() { update_render_tile(); };
+ }
+
+ session->full_buffer_written_cb = [&](string_view filename) { full_buffer_written(filename); };
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
/* get buffer parameters */
- SessionParams session_params = BlenderSync::get_session_params(
- b_engine, b_userpref, b_scene, background, b_view_layer);
+ const SessionParams session_params = BlenderSync::get_session_params(
+ b_engine, b_userpref, b_scene, background);
BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
+ b_v3d, b_rv3d, scene->camera, width, height);
/* temporary render result to find needed passes and views */
- BL::RenderResult b_rr = begin_render_result(
- b_engine, 0, 0, 1, 1, b_view_layer.name().c_str(), NULL);
+ BL::RenderResult b_rr = b_engine.begin_result(0, 0, 1, 1, b_view_layer.name().c_str(), NULL);
BL::RenderResult::layers_iterator b_single_rlay;
b_rr.layers.begin(b_single_rlay);
BL::RenderLayer b_rlay = *b_single_rlay;
- b_rlay_name = b_view_layer.name();
- /* Update denoising parameters. */
- session->set_denoising(session_params.denoising);
+ {
+ thread_scoped_lock lock(draw_state_.mutex);
+ b_rlay_name = b_view_layer.name();
- /* Compute render passes and film settings. */
- vector<Pass> passes = sync->sync_render_passes(
- b_scene, b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising);
+ /* Signal that the display pass is to be updated. */
+ draw_state_.last_pass_index = -1;
+ }
- /* Set buffer params, using film settings from sync_render_passes. */
- buffer_params.passes = passes;
- buffer_params.denoising_data_pass = scene->film->get_denoising_data_pass();
- buffer_params.denoising_clean_pass = scene->film->get_denoising_clean_pass();
- buffer_params.denoising_prefiltered_pass = scene->film->get_denoising_prefiltered_pass();
+ /* Compute render passes and film settings. */
+ sync->sync_render_passes(b_rlay, b_view_layer);
BL::RenderResult::views_iterator b_view_iter;
@@ -520,6 +488,9 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
++b_view_iter, ++view_index) {
b_rview_name = b_view_iter->name();
+ buffer_params.layer = b_view_layer.name();
+ buffer_params.view = b_rview_name;
+
/* set the current view */
b_engine.active_view_set(b_rview_name.c_str());
@@ -549,20 +520,16 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
}
/* Update number of samples per layer. */
- int samples = sync->get_layer_samples();
- bool bound_samples = sync->get_layer_bound_samples();
- int effective_layer_samples;
+ const int samples = sync->get_layer_samples();
+ const bool bound_samples = sync->get_layer_bound_samples();
- if (samples != 0 && (!bound_samples || (samples < session_params.samples)))
- effective_layer_samples = samples;
- else
- effective_layer_samples = session_params.samples;
-
- /* Update tile manager if we're doing resumable render. */
- update_resumable_tile_manager(effective_layer_samples);
+ SessionParams effective_session_params = session_params;
+ if (samples != 0 && (!bound_samples || (samples < session_params.samples))) {
+ effective_session_params.samples = samples;
+ }
/* Update session itself. */
- session->reset(buffer_params, effective_layer_samples);
+ session->reset(effective_session_params, buffer_params);
/* render */
if (!b_engine.is_preview() && background && print_render_stats) {
@@ -586,65 +553,146 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
stamp_view_layer_metadata(scene, b_rlay_name);
/* free result without merging */
- end_render_result(b_engine, b_rr, true, true, false);
+ b_engine.end_result(b_rr, true, false, false);
double total_time, render_time;
session->progress.get_time(total_time, render_time);
VLOG(1) << "Total render time: " << total_time;
VLOG(1) << "Render time (without synchronization): " << render_time;
+}
+
+void BlenderSession::render_frame_finish()
+{
+ /* Processing of all layers and views is done. Clear the strings so that we can communicate
+ * progress about reading files and denoising them. */
+ b_rlay_name = "";
+ b_rview_name = "";
+
+ if (!b_render.use_persistent_data()) {
+ /* Free the sync object so that it can properly dereference nodes from the scene graph before
+ * the graph is freed. */
+ delete sync;
+ sync = nullptr;
+
+ session->device_free();
+ }
+
+ for (string_view filename : full_buffer_files_) {
+ session->process_full_buffer_from_disk(filename);
+ path_remove(filename);
+ }
/* clear callback */
session->write_render_tile_cb = function_null;
session->update_render_tile_cb = function_null;
+ session->full_buffer_written_cb = function_null;
}
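
    A minimal standalone sketch (not part of this patch) of the deferred full-buffer handling shown above: during rendering the session callback only records the written file names, and render_frame_finish() later reads each file back, post-processes it and removes it. The class name, file path and printf stand-ins are illustrative; only the callback/loop shape mirrors the code above, where the real work is done by Session::process_full_buffer_from_disk() and path_remove().

    #include <cstdio>
    #include <functional>
    #include <string>
    #include <vector>

    /* Stand-ins for Session::process_full_buffer_from_disk() and path_remove(). */
    static void process_full_buffer_from_disk(const std::string &filename)
    {
      printf("processing %s\n", filename.c_str());
    }
    static void path_remove(const std::string &filename)
    {
      printf("removing %s\n", filename.c_str());
    }

    class FrameRender {
     public:
      /* Wired up the same way as session->full_buffer_written_cb above. */
      std::function<void(const std::string &)> full_buffer_written_cb;

      FrameRender()
      {
        full_buffer_written_cb = [&](const std::string &filename) {
          /* Only record the file; heavy processing is deferred to frame finish. */
          full_buffer_files_.emplace_back(filename);
        };
      }

      void render_frame_finish()
      {
        /* All layers/views rendered: read files back, post-process, then delete them. */
        for (const std::string &filename : full_buffer_files_) {
          process_full_buffer_from_disk(filename);
          path_remove(filename);
        }
        full_buffer_files_.clear();
        full_buffer_written_cb = nullptr;
      }

     private:
      std::vector<std::string> full_buffer_files_;
    };

    int main()
    {
      FrameRender render;
      render.full_buffer_written_cb("/tmp/layer_view_0001.exr");
      render.render_frame_finish();
      return 0;
    }
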
-static int bake_pass_filter_get(const int pass_filter)
+static PassType bake_type_to_pass(const string &bake_type_str, const int bake_filter)
{
- int flag = BAKE_FILTER_NONE;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_DIRECT) != 0)
- flag |= BAKE_FILTER_DIRECT;
- if ((pass_filter & BL::BakeSettings::pass_filter_INDIRECT) != 0)
- flag |= BAKE_FILTER_INDIRECT;
- if ((pass_filter & BL::BakeSettings::pass_filter_COLOR) != 0)
- flag |= BAKE_FILTER_COLOR;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0)
- flag |= BAKE_FILTER_DIFFUSE;
- if ((pass_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0)
- flag |= BAKE_FILTER_GLOSSY;
- if ((pass_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0)
- flag |= BAKE_FILTER_TRANSMISSION;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_EMIT) != 0)
- flag |= BAKE_FILTER_EMISSION;
- if ((pass_filter & BL::BakeSettings::pass_filter_AO) != 0)
- flag |= BAKE_FILTER_AO;
-
- return flag;
+ const char *bake_type = bake_type_str.c_str();
+
+ /* data passes */
+ if (strcmp(bake_type, "POSITION") == 0) {
+ return PASS_POSITION;
+ }
+ else if (strcmp(bake_type, "NORMAL") == 0) {
+ return PASS_NORMAL;
+ }
+ else if (strcmp(bake_type, "UV") == 0) {
+ return PASS_UV;
+ }
+ else if (strcmp(bake_type, "ROUGHNESS") == 0) {
+ return PASS_ROUGHNESS;
+ }
+ else if (strcmp(bake_type, "EMIT") == 0) {
+ return PASS_EMISSION;
+ }
+ /* light passes */
+ else if (strcmp(bake_type, "AO") == 0) {
+ return PASS_AO;
+ }
+ else if (strcmp(bake_type, "COMBINED") == 0) {
+ return PASS_COMBINED;
+ }
+ else if (strcmp(bake_type, "SHADOW") == 0) {
+ return PASS_SHADOW;
+ }
+ else if (strcmp(bake_type, "DIFFUSE") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_DIFFUSE;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_DIFFUSE_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_DIFFUSE_INDIRECT;
+ }
+ else {
+ return PASS_DIFFUSE_COLOR;
+ }
+ }
+ else if (strcmp(bake_type, "GLOSSY") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_GLOSSY;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_GLOSSY_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_GLOSSY_INDIRECT;
+ }
+ else {
+ return PASS_GLOSSY_COLOR;
+ }
+ }
+ else if (strcmp(bake_type, "TRANSMISSION") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_TRANSMISSION;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_TRANSMISSION_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_TRANSMISSION_INDIRECT;
+ }
+ else {
+ return PASS_TRANSMISSION_COLOR;
+ }
+ }
+ /* extra */
+ else if (strcmp(bake_type, "ENVIRONMENT") == 0) {
+ return PASS_BACKGROUND;
+ }
+
+ return PASS_COMBINED;
}
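
    For readers new to this mapping: bake_type_to_pass() above turns Blender's bake type string plus the pass-filter bitmask into a single Cycles pass. A small standalone sketch (not part of this patch) mirrors the DIFFUSE branch to show how the DIRECT/INDIRECT bits select between the full, direct-only, indirect-only and color variants; the enum values and filter bits are stand-ins, since the real PassType and BL::BakeSettings definitions are only available inside the Blender/Cycles build.

    #include <cassert>
    #include <cstring>

    /* Stand-ins for the Cycles pass types and the RNA pass_filter bits. */
    enum PassType { PASS_DIFFUSE, PASS_DIFFUSE_DIRECT, PASS_DIFFUSE_INDIRECT, PASS_DIFFUSE_COLOR };
    enum { FILTER_DIRECT = 1 << 0, FILTER_INDIRECT = 1 << 1 };

    static PassType diffuse_bake_pass(const char *bake_type, int bake_filter)
    {
      assert(strcmp(bake_type, "DIFFUSE") == 0);
      const bool direct = bake_filter & FILTER_DIRECT;
      const bool indirect = bake_filter & FILTER_INDIRECT;
      if (direct && indirect) {
        return PASS_DIFFUSE; /* Full diffuse contribution. */
      }
      if (direct) {
        return PASS_DIFFUSE_DIRECT;
      }
      if (indirect) {
        return PASS_DIFFUSE_INDIRECT;
      }
      return PASS_DIFFUSE_COLOR; /* Neither bit set: bake the color-only pass. */
    }

    int main()
    {
      assert(diffuse_bake_pass("DIFFUSE", FILTER_DIRECT | FILTER_INDIRECT) == PASS_DIFFUSE);
      assert(diffuse_bake_pass("DIFFUSE", FILTER_DIRECT) == PASS_DIFFUSE_DIRECT);
      assert(diffuse_bake_pass("DIFFUSE", 0) == PASS_DIFFUSE_COLOR);
      return 0;
    }
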
void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
BL::Object &b_object,
- const string &pass_type,
- const int pass_filter,
+ const string &bake_type,
+ const int bake_filter,
const int bake_width,
const int bake_height)
{
b_depsgraph = b_depsgraph_;
- ShaderEvalType shader_type = get_shader_type(pass_type);
- int bake_pass_filter = bake_pass_filter_get(pass_filter);
-
/* Initialize bake manager, before we load the baking kernels. */
- scene->bake_manager->set(scene, b_object.name(), shader_type, bake_pass_filter);
+ scene->bake_manager->set(scene, b_object.name());
- /* Passes are identified by name, so in order to return the combined pass we need to set the
- * name. */
- Pass::add(PASS_COMBINED, scene->passes, "Combined");
+ /* Add the render pass that we want to bake, and name it "Combined" so that the
+ * Blender side treats it as the combined pass. */
+ Pass *pass = scene->create_node<Pass>();
+ pass->set_name(ustring("Combined"));
+ pass->set_type(bake_type_to_pass(bake_type, bake_filter));
+ pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR));
- session->read_bake_tile_cb = function_bind(&BlenderSession::read_render_tile, this, _1);
- session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
+ session->read_render_tile_cb = [&]() { read_render_tile(); };
+ session->write_render_tile_cb = [&]() { write_render_tile(); };
+ session->set_gpu_display(nullptr);
if (!session->progress.get_cancel()) {
/* Sync scene. */
@@ -667,18 +715,15 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
if (object_found && !session->progress.get_cancel()) {
/* Get session and buffer parameters. */
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- session_params.progressive_refine = false;
BufferParams buffer_params;
buffer_params.width = bake_width;
buffer_params.height = bake_height;
- buffer_params.passes = scene->passes;
/* Update session. */
- session->tile_manager.set_samples(session_params.samples);
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
session->progress.set_update_callback(
function_bind(&BlenderSession::update_bake_progress, this));
@@ -690,71 +735,43 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
session->wait();
}
- session->read_bake_tile_cb = function_null;
+ session->read_render_tile_cb = function_null;
session->write_render_tile_cb = function_null;
}
-void BlenderSession::do_write_update_render_result(BL::RenderLayer &b_rlay,
- RenderTile &rtile,
- bool do_update_only)
+void BlenderSession::write_render_result(BL::RenderLayer &b_rlay)
{
- RenderBuffers *buffers = rtile.buffers;
-
- /* copy data from device */
- if (!buffers->copy_from_device())
+ if (!session->copy_render_tile_from_device()) {
return;
-
- float exposure = scene->film->get_exposure();
-
- vector<float> pixels(rtile.w * rtile.h * 4);
-
- /* Adjust absolute sample number to the range. */
- int sample = rtile.sample;
- const int range_start_sample = session->tile_manager.range_start_sample;
- if (range_start_sample != -1) {
- sample -= range_start_sample;
}
- if (!do_update_only) {
- /* copy each pass */
- for (BL::RenderPass &b_pass : b_rlay.passes) {
- int components = b_pass.channels();
-
- /* Copy pixels from regular render passes. */
- bool read = buffers->get_pass_rect(b_pass.name(), exposure, sample, components, &pixels[0]);
-
- /* If denoising pass, */
- if (!read) {
- int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
- if (denoising_offset >= 0) {
- read = buffers->get_denoising_pass_rect(
- denoising_offset, exposure, sample, components, &pixels[0]);
- }
- }
+ const int2 tile_size = session->get_render_tile_size();
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
- if (!read) {
- memset(&pixels[0], 0, pixels.size() * sizeof(float));
- }
-
- b_pass.rect(&pixels[0]);
+ /* Copy each pass. */
+ for (BL::RenderPass &b_pass : b_rlay.passes) {
+ if (!session->get_render_tile_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) {
+ memset(&pixels[0], 0, pixels.size() * sizeof(float));
}
- }
- else {
- /* copy combined pass */
- BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
- if (buffers->get_pass_rect("Combined", exposure, sample, 4, &pixels[0]))
- b_combined_pass.rect(&pixels[0]);
+
+ b_pass.rect(&pixels[0]);
}
}
-void BlenderSession::write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
+void BlenderSession::update_render_result(BL::RenderLayer &b_rlay)
{
- do_write_update_render_result(b_rlay, rtile, false);
-}
+ if (!session->copy_render_tile_from_device()) {
+ return;
+ }
-void BlenderSession::update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
-{
- do_write_update_render_result(b_rlay, rtile, true);
+ const int2 tile_size = session->get_render_tile_size();
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
+
+ /* Copy combined pass. */
+ BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
+ if (session->get_render_tile_pixels("Combined", b_combined_pass.channels(), &pixels[0])) {
+ b_combined_pass.rect(&pixels[0]);
+ }
}
void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
@@ -764,19 +781,19 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
return;
/* on session/scene parameter changes, we recreate session entirely */
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if (session->params.modified(session_params) || scene->params.modified(scene_params)) {
free_session();
create_session();
}
- /* increase samples, but never decrease */
+ /* increase samples and render time, but never decrease */
session->set_samples(session_params.samples);
- session->set_denoising_start_sample(session_params.denoising.start_sample);
+ session->set_time_limit(session_params.time_limit);
session->set_pause(session_pause);
/* copy recalc flags, outside of mutex so we can decide to do the real
@@ -808,21 +825,12 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
sync->sync_camera(b_render, b_camera_override, width, height, "");
/* get buffer parameters */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
-
- if (!buffer_params.denoising_data_pass) {
- session_params.denoising.use = false;
- }
-
- session->set_denoising(session_params.denoising);
-
- /* Update film if denoising data was enabled or disabled. */
- scene->film->set_denoising_data_pass(buffer_params.denoising_data_pass);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
/* reset if needed */
if (scene->need_reset()) {
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
/* After session reset, so device is not accessing image data anymore. */
builtin_images_load();
@@ -839,7 +847,41 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
session->start();
}
-bool BlenderSession::draw(int w, int h)
+void BlenderSession::draw(BL::SpaceImageEditor &space_image)
+{
+ if (!session || !session->scene) {
+ /* Offline render drawing does not force the render engine update, which means it's possible
+ * that the Session is not created yet. */
+ return;
+ }
+
+ thread_scoped_lock lock(draw_state_.mutex);
+
+ const int pass_index = space_image.image_user().multilayer_pass();
+ if (pass_index != draw_state_.last_pass_index) {
+ BL::RenderPass b_display_pass(b_engine.pass_by_index_get(b_rlay_name.c_str(), pass_index));
+ if (!b_display_pass) {
+ return;
+ }
+
+ Scene *scene = session->scene;
+
+ thread_scoped_lock lock(scene->mutex);
+
+ const Pass *pass = Pass::find(scene->passes, b_display_pass.name());
+ if (!pass) {
+ return;
+ }
+
+ scene->film->set_display_pass(pass->get_type());
+
+ draw_state_.last_pass_index = pass_index;
+ }
+
+ session->draw();
+}
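
    The new draw() above caches the last displayed pass index under draw_state_.mutex so the film's display pass is only switched when the user actually picks a different pass in the image editor. A minimal standalone sketch (not part of this patch) of that update-only-on-change pattern, with set_display_pass() as a stand-in for scene->film->set_display_pass():

    #include <mutex>

    /* Stand-in for switching which pass the film displays. */
    static void set_display_pass(int pass_index)
    {
      (void)pass_index;
    }

    struct DrawState {
      std::mutex mutex;
      int last_pass_index = -1; /* -1 means "needs update", as in the render() hunk above. */
    };

    static void draw(DrawState &state, int requested_pass_index)
    {
      std::lock_guard<std::mutex> lock(state.mutex);
      if (requested_pass_index != state.last_pass_index) {
        /* Only touch the scene when the user actually switched passes. */
        set_display_pass(requested_pass_index);
        state.last_pass_index = requested_pass_index;
      }
      /* ...then draw the (possibly unchanged) result. */
    }

    int main()
    {
      DrawState state;
      draw(state, 0); /* Updates the display pass. */
      draw(state, 0); /* Cached: no scene update. */
      draw(state, 2); /* User picked another pass: updates again. */
      return 0;
    }
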
+
+void BlenderSession::view_draw(int w, int h)
{
/* pause in redraw in case update is not being called due to final render */
session->set_pause(BlenderSync::get_session_pause(b_scene, background));
@@ -885,14 +927,14 @@ bool BlenderSession::draw(int w, int h)
/* reset if requested */
if (reset) {
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if (session_pause == false) {
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
start_resize_time = 0.0;
}
}
@@ -905,18 +947,7 @@ bool BlenderSession::draw(int w, int h)
update_status_progress();
/* draw */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use);
- DeviceDrawParams draw_params;
-
- if (session->params.display_buffer_linear) {
- draw_params.bind_display_space_shader_cb = function_bind(
- &BL::RenderEngine::bind_display_space_shader, &b_engine, b_scene);
- draw_params.unbind_display_space_shader_cb = function_bind(
- &BL::RenderEngine::unbind_display_space_shader, &b_engine);
- }
-
- return !session->draw(buffer_params, draw_params);
+ session->draw();
}
void BlenderSession::get_status(string &status, string &substatus)
@@ -924,11 +955,6 @@ void BlenderSession::get_status(string &status, string &substatus)
session->progress.get_status(status, substatus);
}
-void BlenderSession::get_kernel_status(string &kernel_status)
-{
- session->progress.get_kernel_status(kernel_status);
-}
-
void BlenderSession::get_progress(float &progress, double &total_time, double &render_time)
{
session->progress.get_time(total_time, render_time);
@@ -947,7 +973,7 @@ void BlenderSession::update_bake_progress()
void BlenderSession::update_status_progress()
{
- string timestatus, status, substatus, kernel_status;
+ string timestatus, status, substatus;
string scene_status = "";
float progress;
double total_time, remaining_time = 0, render_time;
@@ -955,7 +981,6 @@ void BlenderSession::update_status_progress()
float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f;
get_status(status, substatus);
- get_kernel_status(kernel_status);
get_progress(progress, total_time, render_time);
if (progress > 0)
@@ -980,14 +1005,12 @@ void BlenderSession::update_status_progress()
status = " | " + status;
if (substatus.size() > 0)
status += " | " + substatus;
- if (kernel_status.size() > 0)
- status += " | " + kernel_status;
}
double current_time = time_dt();
- /* When rendering in a window, redraw the status at least once per second to keep the elapsed and
- * remaining time up-to-date. For headless rendering, only report when something significant
- * changes to keep the console output readable. */
+ /* When rendering in a window, redraw the status at least once per second to keep the elapsed
+ * and remaining time up-to-date. For headless rendering, only report when something
+ * significant changes to keep the console output readable. */
if (status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
b_engine.update_stats("", (timestatus + scene_status + status).c_str());
b_engine.update_memory_stats(mem_used, mem_peak);
@@ -1048,56 +1071,6 @@ void BlenderSession::test_cancel()
session->progress.set_cancel("Cancelled");
}
-void BlenderSession::update_resumable_tile_manager(int num_samples)
-{
- const int num_resumable_chunks = BlenderSession::num_resumable_chunks,
- current_resumable_chunk = BlenderSession::current_resumable_chunk;
- if (num_resumable_chunks == 0) {
- return;
- }
-
- if (num_resumable_chunks > num_samples) {
- fprintf(stderr,
- "Cycles warning: more sample chunks (%d) than samples (%d), "
- "this will cause some samples to be included in multiple chunks.\n",
- num_resumable_chunks,
- num_samples);
- }
-
- const float num_samples_per_chunk = (float)num_samples / num_resumable_chunks;
-
- float range_start_sample, range_num_samples;
- if (current_resumable_chunk != 0) {
- /* Single chunk rendering. */
- range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
- range_num_samples = num_samples_per_chunk;
- }
- else {
- /* Ranged-chunks. */
- const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
- range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
- range_num_samples = num_chunks * num_samples_per_chunk;
- }
-
- /* Round after doing the multiplications with num_chunks and num_samples_per_chunk
- * to allow for many small chunks. */
- int rounded_range_start_sample = (int)floorf(range_start_sample + 0.5f);
- int rounded_range_num_samples = max((int)floorf(range_num_samples + 0.5f), 1);
-
- /* Make sure we don't overshoot. */
- if (rounded_range_start_sample + rounded_range_num_samples > num_samples) {
- rounded_range_num_samples = num_samples - rounded_range_num_samples;
- }
-
- VLOG(1) << "Samples range start is " << range_start_sample << ", "
- << "number of samples to render is " << range_num_samples;
-
- scene->integrator->set_start_sample(rounded_range_start_sample);
-
- session->tile_manager.range_start_sample = rounded_range_start_sample;
- session->tile_manager.range_num_samples = rounded_range_num_samples;
-}
-
void BlenderSession::free_blender_memory_if_possible()
{
if (!background) {
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index d967b81c854..cf52359ea5d 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -33,8 +33,6 @@ class BlenderSync;
class ImageMetaData;
class Scene;
class Session;
-class RenderBuffers;
-class RenderTile;
class BlenderSession {
public:
@@ -62,6 +60,8 @@ class BlenderSession {
/* offline render */
void render(BL::Depsgraph &b_depsgraph);
+ void render_frame_finish();
+
void bake(BL::Depsgraph &b_depsgrah,
BL::Object &b_object,
const string &pass_type,
@@ -69,24 +69,29 @@ class BlenderSession {
const int bake_width,
const int bake_height);
- void write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
- void write_render_tile(RenderTile &rtile);
- void read_render_tile(RenderTile &rtile);
+ void write_render_result(BL::RenderLayer &b_rlay);
+ void write_render_tile();
+
+ void update_render_tile();
+
+ void full_buffer_written(string_view filename);
/* update functions are used to update display buffer only after sample was rendered
* only needed for better visual feedback */
- void update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
- void update_render_tile(RenderTile &rtile, bool highlight);
+ void update_render_result(BL::RenderLayer &b_rlay);
+
+ /* read functions for baking input */
+ void read_render_tile();
/* interactive updates */
void synchronize(BL::Depsgraph &b_depsgraph);
/* drawing */
- bool draw(int w, int h);
+ void draw(BL::SpaceImageEditor &space_image);
+ void view_draw(int w, int h);
void tag_redraw();
void tag_update();
void get_status(string &status, string &substatus);
- void get_kernel_status(string &kernel_status);
void get_progress(float &progress, double &total_time, double &render_time);
void test_cancel();
void update_status_progress();
@@ -123,6 +128,8 @@ class BlenderSession {
void *python_thread_state;
+ bool use_developer_ui;
+
/* Global state which is common for all render sessions created from Blender.
* Usually denotes command line arguments.
*/
@@ -134,41 +141,25 @@ class BlenderSession {
*/
static bool headless;
- /* ** Resumable render ** */
-
- /* Overall number of chunks in which the sample range is to be divided. */
- static int num_resumable_chunks;
-
- /* Current resumable chunk index to render. */
- static int current_resumable_chunk;
-
- /* Alternative to single-chunk rendering to render a range of chunks. */
- static int start_resumable_chunk;
- static int end_resumable_chunk;
-
static bool print_render_stats;
protected:
void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name);
- void do_write_update_render_result(BL::RenderLayer &b_rlay,
- RenderTile &rtile,
- bool do_update_only);
- void do_write_update_render_tile(RenderTile &rtile,
- bool do_update_only,
- bool do_read_only,
- bool highlight);
-
void builtin_images_load();
- /* Update tile manager to reflect resumable render settings. */
- void update_resumable_tile_manager(int num_samples);
-
/* Is used after each render layer synchronization is done with the goal
* of freeing render engine data which is held from Blender side (for
* example, dependency graph).
*/
void free_blender_memory_if_possible();
+
+ struct {
+ thread_mutex mutex;
+ int last_pass_index = -1;
+ } draw_state_;
+
+ vector<string> full_buffer_files_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index de7b2761d00..8c4f789ffd0 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -17,6 +17,7 @@
#include "render/background.h"
#include "render/colorspace.h"
#include "render/graph.h"
+#include "render/integrator.h"
#include "render/light.h"
#include "render/nodes.h"
#include "render/osl.h"
@@ -475,17 +476,11 @@ static ShaderNode *add_node(Scene *scene,
SubsurfaceScatteringNode *subsurface = graph->create_node<SubsurfaceScatteringNode>();
switch (b_subsurface_node.falloff()) {
- case BL::ShaderNodeSubsurfaceScattering::falloff_CUBIC:
- subsurface->set_falloff(CLOSURE_BSSRDF_CUBIC_ID);
- break;
- case BL::ShaderNodeSubsurfaceScattering::falloff_GAUSSIAN:
- subsurface->set_falloff(CLOSURE_BSSRDF_GAUSSIAN_ID);
- break;
- case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY:
- subsurface->set_falloff(CLOSURE_BSSRDF_BURLEY_ID);
+ case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK_FIXED_RADIUS:
+ subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
break;
case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK:
- subsurface->set_falloff(CLOSURE_BSSRDF_RANDOM_WALK_ID);
+ subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_ID);
break;
}
@@ -597,11 +592,11 @@ static ShaderNode *add_node(Scene *scene,
break;
}
switch (b_principled_node.subsurface_method()) {
- case BL::ShaderNodeBsdfPrincipled::subsurface_method_BURLEY:
- principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_ID);
+ case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK_FIXED_RADIUS:
+ principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
break;
case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK:
- principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+ principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_ID);
break;
}
node = principled;
@@ -1360,10 +1355,11 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all)
{
Background *background = scene->background;
+ Integrator *integrator = scene->integrator;
BL::World b_world = b_scene.world();
- BlenderViewportParameters new_viewport_parameters(b_v3d);
+ BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
if (world_recalc || update_all || b_world.ptr.data != world_map ||
viewport_parameters.shader_modified(new_viewport_parameters)) {
@@ -1455,9 +1451,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
/* AO */
BL::WorldLighting b_light = b_world.light_settings();
- background->set_use_ao(b_light.use_ambient_occlusion());
- background->set_ao_factor(b_light.ao_factor());
- background->set_ao_distance(b_light.distance());
+ integrator->set_ao_factor(b_light.ao_factor());
+ integrator->set_ao_distance(b_light.distance());
/* visibility */
PointerRNA cvisibility = RNA_pointer_get(&b_world.ptr, "cycles_visibility");
@@ -1472,9 +1467,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
background->set_visibility(visibility);
}
else {
- background->set_use_ao(false);
- background->set_ao_factor(0.0f);
- background->set_ao_distance(FLT_MAX);
+ integrator->set_ao_factor(1.0f);
+ integrator->set_ao_distance(10.0f);
}
shader->set_graph(graph);
@@ -1496,7 +1490,6 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
background->set_use_shader(view_layer.use_background_shader ||
viewport_parameters.use_custom_shader());
- background->set_use_ao(background->get_use_ao() && view_layer.use_background_ao);
background->tag_update(scene);
}
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 26d64b7bf85..d6fc7ee1723 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -53,6 +53,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
BL::Scene &b_scene,
Scene *scene,
bool preview,
+ bool use_developer_ui,
Progress &progress)
: b_engine(b_engine),
b_data(b_data),
@@ -68,6 +69,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
scene(scene),
preview(preview),
experimental(false),
+ use_developer_ui(use_developer_ui),
dicing_rate(1.0f),
max_subdivisions(12),
progress(progress),
@@ -224,7 +226,7 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
}
if (b_v3d) {
- BlenderViewportParameters new_viewport_parameters(b_v3d);
+ BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
if (viewport_parameters.shader_modified(new_viewport_parameters)) {
world_recalc = true;
@@ -251,9 +253,13 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
+ /* TODO(sergey): Passing the view layer to the integrator feels weak, and the implicit check on
+ * whether this is a background render is even weaker. What would be a nicer approach here? */
+ const bool background = !b_v3d;
+
sync_view_layer(b_view_layer);
- sync_integrator();
- sync_film(b_v3d);
+ sync_integrator(b_view_layer, background);
+ sync_film(b_view_layer, b_v3d);
sync_shaders(b_depsgraph, b_v3d);
sync_images();
@@ -280,7 +286,7 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
/* Integrator */
-void BlenderSync::sync_integrator()
+void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
{
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -328,59 +334,24 @@ void BlenderSync::sync_integrator()
integrator->set_motion_blur(view_layer.use_motion_blur);
}
- integrator->set_method((Integrator::Method)get_enum(
- cscene, "progressive", Integrator::NUM_METHODS, Integrator::PATH));
-
- integrator->set_sample_all_lights_direct(get_boolean(cscene, "sample_all_lights_direct"));
- integrator->set_sample_all_lights_indirect(get_boolean(cscene, "sample_all_lights_indirect"));
integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold"));
SamplingPattern sampling_pattern = (SamplingPattern)get_enum(
cscene, "sampling_pattern", SAMPLING_NUM_PATTERNS, SAMPLING_PATTERN_SOBOL);
-
- int adaptive_min_samples = INT_MAX;
-
- if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
- sampling_pattern = SAMPLING_PATTERN_PMJ;
- adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
- integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold"));
- }
- else {
- integrator->set_adaptive_threshold(0.0f);
- }
-
integrator->set_sampling_pattern(sampling_pattern);
- int diffuse_samples = get_int(cscene, "diffuse_samples");
- int glossy_samples = get_int(cscene, "glossy_samples");
- int transmission_samples = get_int(cscene, "transmission_samples");
- int ao_samples = get_int(cscene, "ao_samples");
- int mesh_light_samples = get_int(cscene, "mesh_light_samples");
- int subsurface_samples = get_int(cscene, "subsurface_samples");
- int volume_samples = get_int(cscene, "volume_samples");
-
- if (get_boolean(cscene, "use_square_samples")) {
- integrator->set_diffuse_samples(diffuse_samples * diffuse_samples);
- integrator->set_glossy_samples(glossy_samples * glossy_samples);
- integrator->set_transmission_samples(transmission_samples * transmission_samples);
- integrator->set_ao_samples(ao_samples * ao_samples);
- integrator->set_mesh_light_samples(mesh_light_samples * mesh_light_samples);
- integrator->set_subsurface_samples(subsurface_samples * subsurface_samples);
- integrator->set_volume_samples(volume_samples * volume_samples);
- adaptive_min_samples = min(adaptive_min_samples * adaptive_min_samples, INT_MAX);
+ if (preview) {
+ integrator->set_use_adaptive_sampling(
+ RNA_boolean_get(&cscene, "use_preview_adaptive_sampling"));
+ integrator->set_adaptive_threshold(get_float(cscene, "preview_adaptive_threshold"));
+ integrator->set_adaptive_min_samples(get_int(cscene, "preview_adaptive_min_samples"));
}
else {
- integrator->set_diffuse_samples(diffuse_samples);
- integrator->set_glossy_samples(glossy_samples);
- integrator->set_transmission_samples(transmission_samples);
- integrator->set_ao_samples(ao_samples);
- integrator->set_mesh_light_samples(mesh_light_samples);
- integrator->set_subsurface_samples(subsurface_samples);
- integrator->set_volume_samples(volume_samples);
+ integrator->set_use_adaptive_sampling(RNA_boolean_get(&cscene, "use_adaptive_sampling"));
+ integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold"));
+ integrator->set_adaptive_min_samples(get_int(cscene, "adaptive_min_samples"));
}
- integrator->set_adaptive_min_samples(adaptive_min_samples);
-
if (get_boolean(cscene, "use_fast_gi")) {
if (preview) {
integrator->set_ao_bounces(get_int(cscene, "ao_bounces"));
@@ -393,20 +364,38 @@ void BlenderSync::sync_integrator()
integrator->set_ao_bounces(0);
}
- /* UPDATE_NONE as we don't want to tag the integrator as modified, just tag dependent things */
+ const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background);
+ integrator->set_use_denoise(denoise_params.use);
+
+ /* Only update denoiser parameters if the denoiser is actually used. This allows tweaking
+ * denoiser parameters before enabling the denoiser without resetting the render on every
+ * change. The downside is that the interface and the integrator are technically out of sync. */
+ if (denoise_params.use) {
+ integrator->set_denoiser_type(denoise_params.type);
+ integrator->set_denoise_start_sample(denoise_params.start_sample);
+ integrator->set_use_denoise_pass_albedo(denoise_params.use_pass_albedo);
+ integrator->set_use_denoise_pass_normal(denoise_params.use_pass_normal);
+ integrator->set_denoiser_prefilter(denoise_params.prefilter);
+ }
+
+ /* UPDATE_NONE as we don't want to tag the integrator as modified (this was done by the
+ * set calls above), but we need to make sure that the dependent things are tagged. */
integrator->tag_update(scene, Integrator::UPDATE_NONE);
}
/* Film */
-void BlenderSync::sync_film(BL::SpaceView3D &b_v3d)
+void BlenderSync::sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d)
{
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
Film *film = scene->film;
if (b_v3d) {
- film->set_display_pass(update_viewport_display_passes(b_v3d, scene->passes));
+ const BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
+ film->set_display_pass(new_viewport_parameters.display_pass);
+ film->set_show_active_pixels(new_viewport_parameters.show_active_pixels);
}
film->set_exposure(get_float(cscene, "film_exposure"));
@@ -434,6 +423,15 @@ void BlenderSync::sync_film(BL::SpaceView3D &b_v3d)
break;
}
}
+
+ /* Blender viewport does not support proper shadow catcher compositing, so force an approximate
+ * mode to improve visual feedback. */
+ if (b_v3d) {
+ film->set_use_approximate_shadow_catcher(true);
+ }
+ else {
+ film->set_use_approximate_shadow_catcher(!get_boolean(crl, "use_pass_shadow_catcher"));
+ }
}
/* Render Layer */
@@ -444,7 +442,6 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer)
/* Filter. */
view_layer.use_background_shader = b_view_layer.use_sky();
- view_layer.use_background_ao = b_view_layer.use_ao();
/* Always enable surfaces for baking, otherwise there is nothing to bake to. */
view_layer.use_surfaces = b_view_layer.use_solid() || scene->bake_manager->get_baking();
view_layer.use_hair = b_view_layer.use_strand();
@@ -464,10 +461,7 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer)
if (use_layer_samples != 2) {
int samples = b_view_layer.samples();
- if (get_boolean(cscene, "use_square_samples"))
- view_layer.samples = samples * samples;
- else
- view_layer.samples = samples;
+ view_layer.samples = samples;
}
}
@@ -499,7 +493,8 @@ void BlenderSync::sync_images()
}
/* Passes */
-PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
+
+static PassType get_blender_pass_type(BL::RenderPass &b_pass)
{
string name = b_pass.name();
#define MAP_PASS(passname, passtype) \
@@ -507,10 +502,15 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
return passtype; \
} \
((void)0)
+
/* NOTE: Keep in sync with defined names from DNA_scene_types.h */
+
MAP_PASS("Combined", PASS_COMBINED);
+ MAP_PASS("Noisy Image", PASS_COMBINED);
+
MAP_PASS("Depth", PASS_DEPTH);
MAP_PASS("Mist", PASS_MIST);
+ MAP_PASS("Position", PASS_POSITION);
MAP_PASS("Normal", PASS_NORMAL);
MAP_PASS("IndexOB", PASS_OBJECT_ID);
MAP_PASS("UV", PASS_UV);
@@ -539,118 +539,92 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE);
MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL);
+ MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL);
+ MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO);
+
+ MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER);
+ MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER);
+
MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+
MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
+
if (string_startswith(name, cryptomatte_prefix)) {
return PASS_CRYPTOMATTE;
}
+
#undef MAP_PASS
return PASS_NONE;
}
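
    get_blender_pass_type() above uses a small MAP_PASS macro: each line is an early return, so aliases such as "Noisy Image" can share a target with "Combined". A standalone sketch (not part of this patch) of the same pattern, with a stand-in pass enum since the real values live in the Cycles headers:

    #include <cstdio>
    #include <string>

    /* Stand-in pass enum. */
    enum PassType { PASS_NONE, PASS_COMBINED, PASS_DEPTH, PASS_MIST };

    static PassType pass_type_from_name(const std::string &name)
    {
    /* Each MAP_PASS line expands to an early return, mirroring get_blender_pass_type(). */
    #define MAP_PASS(passname, passtype) \
      if (name == passname) { \
        return passtype; \
      } \
      ((void)0)

      MAP_PASS("Combined", PASS_COMBINED);
      MAP_PASS("Noisy Image", PASS_COMBINED);
      MAP_PASS("Depth", PASS_DEPTH);
      MAP_PASS("Mist", PASS_MIST);

    #undef MAP_PASS
      return PASS_NONE;
    }

    int main()
    {
      printf("%d %d\n", pass_type_from_name("Noisy Image"), pass_type_from_name("Unknown"));
      return 0;
    }
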
-int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
+static Pass *pass_add(Scene *scene,
+ PassType type,
+ const char *name,
+ PassMode mode = PassMode::DENOISED)
{
- string name = b_pass.name();
+ Pass *pass = scene->create_node<Pass>();
- if (name == "Noisy Image")
- return DENOISING_PASS_PREFILTERED_COLOR;
+ pass->set_type(type);
+ pass->set_name(ustring(name));
+ pass->set_mode(mode);
- if (name.substr(0, 10) != "Denoising ") {
- return -1;
- }
- name = name.substr(10);
-
-#define MAP_PASS(passname, offset) \
- if (name == passname) { \
- return offset; \
- } \
- ((void)0)
- MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL);
- MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO);
- MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH);
- MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING);
- MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE);
- MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY);
- MAP_PASS("Clean", DENOISING_PASS_CLEAN);
-#undef MAP_PASS
-
- return -1;
+ return pass;
}
-vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
- BL::RenderLayer &b_rlay,
- BL::ViewLayer &b_view_layer,
- bool adaptive_sampling,
- const DenoiseParams &denoising)
+void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
{
- vector<Pass> passes;
+ PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+
+ /* Delete all existing passes. */
+ set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end());
+ scene->delete_nodes(clear_passes);
- /* loop over passes */
+ /* Always add combined pass. */
+ pass_add(scene, PASS_COMBINED, "Combined");
+
+ /* Blender built-in data and light passes. */
for (BL::RenderPass &b_pass : b_rlay.passes) {
- PassType pass_type = get_pass_type(b_pass);
+ const PassType pass_type = get_blender_pass_type(b_pass);
+
+ if (pass_type == PASS_NONE) {
+ LOG(ERROR) << "Unknown pass " << b_pass.name();
+ continue;
+ }
if (pass_type == PASS_MOTION &&
(b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) {
continue;
}
- if (pass_type != PASS_NONE)
- Pass::add(pass_type, passes, b_pass.name().c_str());
- }
-
- PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
- int denoising_flags = 0;
- if (denoising.use || denoising.store_passes) {
- if (denoising.type == DENOISER_NLM) {
-#define MAP_OPTION(name, flag) \
- if (!get_boolean(crl, name)) { \
- denoising_flags |= flag; \
- } \
- ((void)0)
- MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
- MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
- MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR);
- MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
- MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
- MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
-#undef MAP_OPTION
- }
- b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
+ pass_add(scene, pass_type, b_pass.name().c_str());
}
- scene->film->set_denoising_flags(denoising_flags);
-
- if (denoising.store_passes) {
- b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
- if (denoising.type == DENOISER_NLM) {
- b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
- }
- if (scene->film->get_denoising_flags() & DENOISING_CLEAN_ALL_PASSES) {
- b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str());
- }
- }
+ PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
+ /* Debug passes. */
if (get_boolean(crl, "pass_debug_render_time")) {
b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
+ pass_add(scene, PASS_RENDER_TIME, "Debug Render Time");
}
if (get_boolean(crl, "pass_debug_sample_count")) {
b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
+ pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count");
}
+
+ /* Cycles specific passes. */
if (get_boolean(crl, "use_pass_volume_direct")) {
b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
+ pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir");
}
if (get_boolean(crl, "use_pass_volume_indirect")) {
b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd");
+ pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd");
+ }
+ if (get_boolean(crl, "use_pass_shadow_catcher")) {
+ b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher");
}
/* Cryptomatte stores two ID/weight pairs per RGBA layer.
@@ -662,7 +636,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Object%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT);
}
@@ -670,7 +644,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Material%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL);
}
@@ -678,22 +652,33 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET);
}
- if (b_view_layer.use_pass_cryptomatte_accurate() && cryptomatte_passes != CRYPT_NONE) {
- cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ACCURATE);
- }
scene->film->set_cryptomatte_passes(cryptomatte_passes);
- if (adaptive_sampling) {
- Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
- if (!get_boolean(crl, "pass_debug_sample_count")) {
- Pass::add(PASS_SAMPLE_COUNT, passes);
+ /* Denoising passes. */
+ const bool use_denoising = get_boolean(cscene, "use_denoising") &&
+ get_boolean(crl, "use_denoising");
+ const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes");
+ if (use_denoising) {
+ b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
+ pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY);
+ if (get_boolean(crl, "use_pass_shadow_catcher")) {
+ b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY);
}
}
+ if (store_denoising_passes) {
+ b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
+ pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY);
+
+ b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY);
+ }
+ /* Custom AOV passes. */
BL::ViewLayer::aovs_iterator b_aov_iter;
for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) {
BL::AOV b_aov(*b_aov_iter);
@@ -706,28 +691,15 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
if (is_color) {
b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_AOV_COLOR, passes, name.c_str());
+ pass_add(scene, PASS_AOV_COLOR, name.c_str());
}
else {
b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_AOV_VALUE, passes, name.c_str());
+ pass_add(scene, PASS_AOV_VALUE, name.c_str());
}
}
- scene->film->set_denoising_data_pass(denoising.use || denoising.store_passes);
- scene->film->set_denoising_clean_pass(scene->film->get_denoising_flags() &
- DENOISING_CLEAN_ALL_PASSES);
- scene->film->set_denoising_prefiltered_pass(denoising.store_passes &&
- denoising.type == DENOISER_NLM);
scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold());
-
- if (!Pass::equals(passes, scene->passes)) {
- scene->film->tag_passes_update(scene, passes);
- scene->film->tag_modified();
- scene->integrator->tag_update(scene, Integrator::UPDATE_ALL);
- }
-
- return passes;
}
void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph)
@@ -773,9 +745,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
params.shadingsystem = SHADINGSYSTEM_OSL;
if (background || DebugFlags().viewport_static_bvh)
- params.bvh_type = SceneParams::BVH_STATIC;
+ params.bvh_type = BVH_TYPE_STATIC;
else
- params.bvh_type = SceneParams::BVH_DYNAMIC;
+ params.bvh_type = BVH_TYPE_DYNAMIC;
params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
@@ -818,8 +790,7 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background)
SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
BL::Preferences &b_preferences,
BL::Scene &b_scene,
- bool background,
- BL::ViewLayer b_view_layer)
+ bool background)
{
SessionParams params;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -827,7 +798,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* feature set */
params.experimental = (get_enum(cscene, "feature_set") != 0);
- /* Background */
+ /* Headless and background rendering. */
+ params.headless = BlenderSession::headless;
params.background = background;
/* Device */
@@ -836,111 +808,26 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* samples */
int samples = get_int(cscene, "samples");
- int aa_samples = get_int(cscene, "aa_samples");
int preview_samples = get_int(cscene, "preview_samples");
- int preview_aa_samples = get_int(cscene, "preview_aa_samples");
- if (get_boolean(cscene, "use_square_samples")) {
- aa_samples = aa_samples * aa_samples;
- preview_aa_samples = preview_aa_samples * preview_aa_samples;
-
- samples = samples * samples;
- preview_samples = preview_samples * preview_samples;
- }
-
- if (get_enum(cscene, "progressive") == 0 && params.device.has_branched_path) {
- if (background) {
- params.samples = aa_samples;
- }
- else {
- params.samples = preview_aa_samples;
- if (params.samples == 0)
- params.samples = INT_MAX;
- }
+ if (background) {
+ params.samples = samples;
}
else {
- if (background) {
- params.samples = samples;
- }
- else {
- params.samples = preview_samples;
- if (params.samples == 0)
- params.samples = INT_MAX;
- }
+ params.samples = preview_samples;
+ if (params.samples == 0)
+ params.samples = INT_MAX;
}
/* Clamp samples. */
params.samples = min(params.samples, Integrator::MAX_SAMPLES);
- /* Adaptive sampling. */
- params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
-
- /* tiles */
- const bool is_cpu = (params.device.type == DEVICE_CPU);
- if (!is_cpu && !background) {
- /* currently GPU could be much slower than CPU when using tiles,
- * still need to be investigated, but meanwhile make it possible
- * to work in viewport smoothly
- */
- int debug_tile_size = get_int(cscene, "debug_tile_size");
-
- params.tile_size = make_int2(debug_tile_size, debug_tile_size);
- }
- else {
- int tile_x = b_engine.tile_x();
- int tile_y = b_engine.tile_y();
-
- params.tile_size = make_int2(tile_x, tile_y);
- }
-
- if ((BlenderSession::headless == false) && background) {
- params.tile_order = (TileOrder)get_enum(cscene, "tile_order");
- }
- else {
- params.tile_order = TILE_BOTTOM_TO_TOP;
- }
-
- /* Denoising */
- params.denoising = get_denoise_params(b_scene, b_view_layer, background);
-
- if (params.denoising.use) {
- /* Add additional denoising devices if we are rendering and denoising
- * with different devices. */
- params.device.add_denoising_devices(params.denoising.type);
-
- /* Check if denoiser is supported by device. */
- if (!(params.device.denoisers & params.denoising.type)) {
- params.denoising.use = false;
- }
- }
-
/* Viewport Performance */
- params.start_resolution = get_int(cscene, "preview_start_resolution");
params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
- /* other parameters */
- params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout");
- params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout");
- params.text_timeout = (double)get_float(cscene, "debug_text_timeout");
-
- /* progressive refine */
- BL::RenderSettings b_r = b_scene.render();
- params.progressive_refine = b_engine.is_preview() ||
- get_boolean(cscene, "use_progressive_refine");
- if (b_r.use_save_buffers() || params.adaptive_sampling)
- params.progressive_refine = false;
-
if (background) {
- if (params.progressive_refine)
- params.progressive = true;
- else
- params.progressive = false;
-
- params.start_resolution = INT_MAX;
params.pixel_size = 1;
}
- else
- params.progressive = true;
/* shading system - scene level needs full refresh */
const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system");
@@ -950,19 +837,30 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
else if (shadingsystem == 1)
params.shadingsystem = SHADINGSYSTEM_OSL;
- /* Color management. */
- params.display_buffer_linear = b_engine.support_display_space_shader(b_scene);
-
- if (b_engine.is_preview()) {
- /* For preview rendering we're using same timeout as
- * blender's job update.
- */
- params.progressive_update_timeout = 0.1;
+ /* Time limit. */
+ if (background) {
+ params.time_limit = get_float(cscene, "time_limit");
+ }
+ else {
+ /* For the viewport it makes more sense to think in terms of the noise floor, which is
+ * usually higher than the acceptable level for the final frame. */
+ /* TODO: It might be useful to support a time limit in the viewport as well, but this needs
+ * some extra thought and input. */
+ params.time_limit = 0.0;
}
+ /* Profiling. */
params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
BlenderSession::print_render_stats;
+ if (background) {
+ params.use_auto_tile = RNA_boolean_get(&cscene, "use_auto_tile");
+ params.tile_size = get_int(cscene, "tile_size");
+ }
+ else {
+ params.use_auto_tile = false;
+ }
+
return params;
}
@@ -970,33 +868,34 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
BL::ViewLayer &b_view_layer,
bool background)
{
+ enum DenoiserInput {
+ DENOISER_INPUT_RGB = 1,
+ DENOISER_INPUT_RGB_ALBEDO = 2,
+ DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
+
+ DENOISER_INPUT_NUM,
+ };
+
DenoiseParams denoising;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ int input_passes = -1;
+
if (background) {
/* Final Render Denoising */
denoising.use = get_boolean(cscene, "use_denoising");
denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE);
+ denoising.prefilter = (DenoiserPrefilter)get_enum(
+ cscene, "denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_NONE);
+
+ input_passes = (DenoiserInput)get_enum(
+ cscene, "denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO_NORMAL);
if (b_view_layer) {
PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles");
if (!get_boolean(clayer, "use_denoising")) {
denoising.use = false;
}
-
- denoising.radius = get_int(clayer, "denoising_radius");
- denoising.strength = get_float(clayer, "denoising_strength");
- denoising.feature_strength = get_float(clayer, "denoising_feature_strength");
- denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca");
-
- denoising.input_passes = (DenoiserInput)get_enum(
- clayer,
- (denoising.type == DENOISER_OPTIX) ? "denoising_optix_input_passes" :
- "denoising_openimagedenoise_input_passes",
- DENOISER_INPUT_NUM,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL);
-
- denoising.store_passes = get_boolean(clayer, "denoising_store_passes");
}
}
else {
@@ -1004,10 +903,12 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
denoising.use = get_boolean(cscene, "use_preview_denoising");
denoising.type = (DenoiserType)get_enum(
cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE);
+ denoising.prefilter = (DenoiserPrefilter)get_enum(
+ cscene, "preview_denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_FAST);
denoising.start_sample = get_int(cscene, "preview_denoising_start_sample");
- denoising.input_passes = (DenoiserInput)get_enum(
- cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, (int)denoising.input_passes);
+ input_passes = (DenoiserInput)get_enum(
+ cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO);
/* Auto select fastest denoiser. */
if (denoising.type == DENOISER_NONE) {
@@ -1023,6 +924,27 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
}
}
+ switch (input_passes) {
+ case DENOISER_INPUT_RGB:
+ denoising.use_pass_albedo = false;
+ denoising.use_pass_normal = false;
+ break;
+
+ case DENOISER_INPUT_RGB_ALBEDO:
+ denoising.use_pass_albedo = true;
+ denoising.use_pass_normal = false;
+ break;
+
+ case DENOISER_INPUT_RGB_ALBEDO_NORMAL:
+ denoising.use_pass_albedo = true;
+ denoising.use_pass_normal = true;
+ break;
+
+ default:
+ LOG(ERROR) << "Unhandled input passes enum " << input_passes;
+ break;
+ }
+
return denoising;
}
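
A minimal sketch, not part of the patch, of how the pass flags filled in above translate into the number of inputs a denoiser would consume; the helper name is hypothetical.

/* Hypothetical helper: count the denoiser inputs implied by DenoiseParams. */
static int denoiser_num_input_passes(const DenoiseParams &denoising)
{
  int num_passes = 1; /* Noisy color is always an input. */
  if (denoising.use_pass_albedo) {
    num_passes += 1;
  }
  if (denoising.use_pass_normal) {
    num_passes += 1;
  }
  return num_passes;
}
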
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index d25c0ce1bc3..786479ac0f8 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -60,6 +60,7 @@ class BlenderSync {
BL::Scene &b_scene,
Scene *scene,
bool preview,
+ bool use_developer_ui,
Progress &progress);
~BlenderSync();
@@ -75,12 +76,8 @@ class BlenderSync {
int height,
void **python_thread_state);
void sync_view_layer(BL::ViewLayer &b_view_layer);
- vector<Pass> sync_render_passes(BL::Scene &b_scene,
- BL::RenderLayer &b_render_layer,
- BL::ViewLayer &b_view_layer,
- bool adaptive_sampling,
- const DenoiseParams &denoising);
- void sync_integrator();
+ void sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+ void sync_integrator(BL::ViewLayer &b_view_layer, bool background);
void sync_camera(BL::RenderSettings &b_render,
BL::Object &b_override,
int width,
@@ -98,22 +95,13 @@ class BlenderSync {
/* get parameters */
static SceneParams get_scene_params(BL::Scene &b_scene, bool background);
- static SessionParams get_session_params(
- BL::RenderEngine &b_engine,
- BL::Preferences &b_userpref,
- BL::Scene &b_scene,
- bool background,
- BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL));
+ static SessionParams get_session_params(BL::RenderEngine &b_engine,
+ BL::Preferences &b_userpref,
+ BL::Scene &b_scene,
+ bool background);
static bool get_session_pause(BL::Scene &b_scene, bool background);
- static BufferParams get_buffer_params(BL::SpaceView3D &b_v3d,
- BL::RegionView3D &b_rv3d,
- Camera *cam,
- int width,
- int height,
- const bool use_denoiser);
-
- static PassType get_pass_type(BL::RenderPass &b_pass);
- static int get_denoising_pass(BL::RenderPass &b_pass);
+ static BufferParams get_buffer_params(
+ BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height);
private:
static DenoiseParams get_denoise_params(BL::Scene &b_scene,
@@ -131,7 +119,7 @@ class BlenderSync {
int width,
int height,
void **python_thread_state);
- void sync_film(BL::SpaceView3D &b_v3d);
+ void sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d);
void sync_view();
/* Shader */
@@ -245,6 +233,7 @@ class BlenderSync {
Scene *scene;
bool preview;
bool experimental;
+ bool use_developer_ui;
float dicing_rate;
int max_subdivisions;
@@ -253,7 +242,6 @@ class BlenderSync {
RenderLayerInfo()
: material_override(PointerRNA_NULL),
use_background_shader(true),
- use_background_ao(true),
use_surfaces(true),
use_hair(true),
use_volumes(true),
@@ -266,7 +254,6 @@ class BlenderSync {
string name;
BL::Material material_override;
bool use_background_shader;
- bool use_background_ao;
bool use_surfaces;
bool use_hair;
bool use_volumes;
diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp
index 18bdfc74de0..62e32240bba 100644
--- a/intern/cycles/blender/blender_viewport.cpp
+++ b/intern/cycles/blender/blender_viewport.cpp
@@ -17,6 +17,8 @@
#include "blender_viewport.h"
#include "blender_util.h"
+#include "render/pass.h"
+#include "util/util_logging.h"
CCL_NAMESPACE_BEGIN
@@ -26,11 +28,12 @@ BlenderViewportParameters::BlenderViewportParameters()
studiolight_rotate_z(0.0f),
studiolight_intensity(1.0f),
studiolight_background_alpha(1.0f),
- display_pass(PASS_COMBINED)
+ display_pass(PASS_COMBINED),
+ show_active_pixels(false)
{
}
-BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d)
+BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui)
: BlenderViewportParameters()
{
if (!b_v3d) {
@@ -55,7 +58,25 @@ BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d)
}
/* Film. */
- display_pass = (PassType)get_enum(cshading, "render_pass", -1, -1);
+
+ /* Look up the display pass based on the enum identifier, because the integer values of the
+ * Python enum are not aligned with the pass definitions in the kernel. */
+
+ display_pass = PASS_COMBINED;
+
+ const string display_pass_identifier = get_enum_identifier(cshading, "render_pass");
+ if (!display_pass_identifier.empty()) {
+ const ustring pass_type_identifier(string_to_lower(display_pass_identifier));
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
+ if (pass_type_enum->exists(pass_type_identifier)) {
+ display_pass = static_cast<PassType>((*pass_type_enum)[pass_type_identifier]);
+ }
+ }
+
+ if (use_developer_ui) {
+ show_active_pixels = get_boolean(cshading, "show_active_pixels");
+ }
}
bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters &other) const
@@ -69,7 +90,7 @@ bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters
bool BlenderViewportParameters::film_modified(const BlenderViewportParameters &other) const
{
- return display_pass != other.display_pass;
+ return display_pass != other.display_pass || show_active_pixels != other.show_active_pixels;
}
bool BlenderViewportParameters::modified(const BlenderViewportParameters &other) const
@@ -82,18 +103,4 @@ bool BlenderViewportParameters::use_custom_shader() const
return !(use_scene_world && use_scene_lights);
}
-PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes)
-{
- if (b_v3d) {
- const BlenderViewportParameters viewport_parameters(b_v3d);
- const PassType display_pass = viewport_parameters.display_pass;
-
- passes.clear();
- Pass::add(display_pass, passes);
-
- return display_pass;
- }
- return PASS_NONE;
-}
-
CCL_NAMESPACE_END
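
For reference, the identifier-based lookup in the constructor above can be read as the following standalone sketch; the helper name is hypothetical, the calls are the same ones used in the patch.

static PassType pass_type_from_identifier(const string &identifier)
{
  /* Pass identifiers in the kernel enum are lower-case. */
  const ustring key(string_to_lower(identifier));
  const NodeEnum *pass_type_enum = Pass::get_type_enum();
  if (pass_type_enum->exists(key)) {
    return static_cast<PassType>((*pass_type_enum)[key]);
  }
  return PASS_COMBINED; /* Fall back to the combined pass. */
}
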
diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h
index d6518597053..b5adafc30c9 100644
--- a/intern/cycles/blender/blender_viewport.h
+++ b/intern/cycles/blender/blender_viewport.h
@@ -39,9 +39,10 @@ class BlenderViewportParameters {
/* Film. */
PassType display_pass;
+ bool show_active_pixels;
BlenderViewportParameters();
- explicit BlenderViewportParameters(BL::SpaceView3D &b_v3d);
+ BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui);
/* Check whether any of shading related settings are different from the given parameters. */
bool shader_modified(const BlenderViewportParameters &other) const;
@@ -57,8 +58,6 @@ class BlenderViewportParameters {
bool use_custom_shader() const;
};
-PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes);
-
CCL_NAMESPACE_END
#endif
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 048c2b95e40..d3497f3a8d8 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -832,18 +832,18 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
typedef StackAllocator<256, float2> LeafTimeStackAllocator;
typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
- vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
- vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
- vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
- vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
- vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];
+ vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM];
+ vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM];
+ vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM];
+ vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM];
+ vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM];
/* TODO(sergey): In theory we should be able to store references. */
vector<BVHReference, LeafReferenceStackAllocator> object_references;
- uint visibility[PRIMITIVE_NUM_TOTAL] = {0};
+ uint visibility[PRIMITIVE_NUM] = {0};
/* NOTE: Keep initialization in sync with actual number of primitives. */
- BoundBox bounds[PRIMITIVE_NUM_TOTAL] = {
+ BoundBox bounds[PRIMITIVE_NUM] = {
BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty};
int ob_num = 0;
int num_new_prims = 0;
@@ -877,7 +877,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
* TODO(sergey): With some pointer trickery we can write directly to the
* destination buffers for the non-spatial split BVH.
*/
- BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL};
+ BVHNode *leaves[PRIMITIVE_NUM + 1] = {NULL};
int num_leaves = 0;
size_t start_index = 0;
vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object;
@@ -888,7 +888,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
if (need_prim_time) {
local_prim_time.resize(num_new_prims);
}
- for (int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
+ for (int i = 0; i < PRIMITIVE_NUM; ++i) {
int num = (int)p_type[i].size();
if (num != 0) {
assert(p_type[i].size() == p_index[i].size());
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
index 62f543941a9..96852510b63 100644
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -37,10 +37,10 @@
/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH.
*/
# include "kernel/bvh/bvh_embree.h"
-# include "kernel/kernel_compat_cpu.h"
-# include "kernel/kernel_globals.h"
+# include "kernel/bvh/bvh_util.h"
+# include "kernel/device/cpu/compat.h"
+# include "kernel/device/cpu/globals.h"
# include "kernel/kernel_random.h"
-# include "kernel/split/kernel_split_data_types.h"
# include "render/hair.h"
# include "render/mesh.h"
@@ -73,46 +73,69 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
const RTCRay *ray = (RTCRay *)args->ray;
RTCHit *hit = (RTCHit *)args->hit;
CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
- KernelGlobals *kg = ctx->kg;
+ const KernelGlobals *kg = ctx->kg;
switch (ctx->type) {
case CCLIntersectContext::RAY_SHADOW_ALL: {
- /* Append the intersection to the end of the array. */
- if (ctx->num_hits < ctx->max_hits) {
- Intersection current_isect;
- kernel_embree_convert_hit(kg, ray, hit, &current_isect);
- for (size_t i = 0; i < ctx->max_hits; ++i) {
+ Intersection current_isect;
+ kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+
+ /* If no transparent shadows, all light is blocked. */
+ const int flags = intersection_get_shader_flags(kg, &current_isect);
+ if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->max_hits == 0) {
+ ctx->opaque_hit = true;
+ return;
+ }
+
+ /* Test if we need to record this transparent intersection. */
+ if (ctx->num_hits < ctx->max_hits || ray->tfar < ctx->max_t) {
+ /* Skip already recorded intersections. */
+ int num_recorded_hits = min(ctx->num_hits, ctx->max_hits);
+
+ for (int i = 0; i < num_recorded_hits; ++i) {
if (current_isect.object == ctx->isect_s[i].object &&
current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) {
/* This intersection was already recorded, skip it. */
*args->valid = 0;
- break;
+ return;
}
}
- Intersection *isect = &ctx->isect_s[ctx->num_hits];
- ++ctx->num_hits;
- *isect = current_isect;
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- int shader = 0;
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
- shader = kernel_tex_fetch(__tri_shader, prim);
- }
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
- int flag = kernel_tex_fetch(__shaders, shader & SHADER_MASK).flags;
- /* If no transparent shadows, all light is blocked. */
- if (flag & (SD_HAS_TRANSPARENT_SHADOW)) {
- /* This tells Embree to continue tracing. */
- *args->valid = 0;
+
+ /* If the maximum number of hits was reached, replace the intersection with the
+ * highest distance. We want to find the N closest intersections. */
+ int isect_index = num_recorded_hits;
+ if (num_recorded_hits + 1 >= ctx->max_hits) {
+ float max_t = ctx->isect_s[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; ++i) {
+ if (ctx->isect_s[i].t > max_t) {
+ max_recorded_hit = i;
+ max_t = ctx->isect_s[i].t;
+ }
+ }
+
+ if (num_recorded_hits >= ctx->max_hits) {
+ isect_index = max_recorded_hit;
+ }
+
+ /* Limit the ray distance and stop counting hits beyond this.
+ * TODO: is there some way we can tell Embree to stop intersecting beyond
+ * this distance once the maximum number of hits is reached? Or maybe it will
+ * become irrelevant if we make max_hits a very high number on the CPU. */
+ ctx->max_t = max(current_isect.t, max_t);
}
+
+ ctx->isect_s[isect_index] = current_isect;
}
- else {
- /* Increase the number of hits beyond ray.max_hits
- * so that the caller can detect this as opaque. */
- ++ctx->num_hits;
- }
+
+ /* Always increase the number of hits, even beyond ray.max_hits, so that
+ * the caller can detect this and consider the ray opaque, or trace another
+ * ray. */
+ ++ctx->num_hits;
+
+ /* This tells Embree to continue tracing. */
+ *args->valid = 0;
break;
}
case CCLIntersectContext::RAY_LOCAL:
@@ -329,7 +352,7 @@ void BVHEmbree::build(Progress &progress, Stats *stats, RTCDevice rtc_device_)
scene = NULL;
}
- const bool dynamic = params.bvh_type == SceneParams::BVH_DYNAMIC;
+ const bool dynamic = params.bvh_type == BVH_TYPE_DYNAMIC;
scene = rtcNewScene(rtc_device);
const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) |
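
The transparent-shadow recording above keeps only the N closest hits once the record array is full. A self-contained sketch of that idea, with hypothetical types and only the C++ standard library assumed:

#include <algorithm>
#include <vector>

struct HitRecord {
  float t; /* Hit distance along the ray. */
};

static void record_closest_hit(std::vector<HitRecord> &hits,
                               const size_t max_hits,
                               float &max_t,
                               const HitRecord &hit)
{
  if (hits.size() < max_hits) {
    hits.push_back(hit);
  }
  else {
    /* Array is full: replace the farthest recorded hit if the new one is closer. */
    size_t farthest = 0;
    for (size_t i = 1; i < hits.size(); ++i) {
      if (hits[i].t > hits[farthest].t) {
        farthest = i;
      }
    }
    if (hit.t < hits[farthest].t) {
      hits[farthest] = hit;
    }
  }

  /* Hits beyond the farthest recorded distance can no longer replace anything. */
  max_t = 0.0f;
  for (const HitRecord &h : hits) {
    max_t = std::max(max_t, h.t);
  }
}
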
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 2dc10f30363..31b3971c110 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -31,6 +31,27 @@ CCL_NAMESPACE_BEGIN
*/
typedef KernelBVHLayout BVHLayout;
+/* Type of BVH, in terms of whether it supports dynamic updates of meshes
+ * or whether modifying geometry requires a full BVH rebuild.
+ */
+enum BVHType {
+ /* BVH supports dynamic updates of geometry.
+ *
+ * Faster for updating the BVH tree when making modifications in the viewport,
+ * but slower for rendering.
+ */
+ BVH_TYPE_DYNAMIC = 0,
+ /* BVH tree is calculated for a specific scene; updates in geometry
+ * require a full tree rebuild.
+ *
+ * Slower to update the BVH tree when modifying objects in the viewport, and also
+ * slower to build the final BVH tree, but gives the best possible render speed.
+ */
+ BVH_TYPE_STATIC = 1,
+
+ BVH_NUM_TYPES,
+};
+
/* Names bitflag type to denote which BVH layouts are supported by
* particular area.
*
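
An illustrative, hypothetical use of the new BVHType enum, reflecting the trade-off described in its comments:

/* Hypothetical helper: interactive editing favors fast rebuilds, final renders favor ray speed. */
static BVHType choose_bvh_type(const bool interactive_viewport)
{
  return interactive_viewport ? BVH_TYPE_DYNAMIC : BVH_TYPE_STATIC;
}
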
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 04ff598621a..da259171844 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -287,9 +287,6 @@ if(CYCLES_STANDALONE_REPOSITORY)
endif()
set(__boost_packages filesystem regex system thread date_time)
- if(WITH_CYCLES_NETWORK)
- list(APPEND __boost_packages serialization)
- endif()
if(WITH_CYCLES_OSL)
list(APPEND __boost_packages wave)
endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 928249931a3..d18f4360aef 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -36,49 +36,70 @@ endif()
set(SRC
device.cpp
- device_cpu.cpp
- device_cuda.cpp
- device_denoising.cpp
- device_dummy.cpp
+ device_denoise.cpp
+ device_graphics_interop.cpp
+ device_kernel.cpp
device_memory.cpp
- device_multi.cpp
- device_opencl.cpp
- device_optix.cpp
- device_split_kernel.cpp
- device_task.cpp
+ device_queue.cpp
+)
+
+set(SRC_CPU
+ cpu/device.cpp
+ cpu/device.h
+ cpu/device_impl.cpp
+ cpu/device_impl.h
+ cpu/kernel.cpp
+ cpu/kernel.h
+ cpu/kernel_function.h
+ cpu/kernel_thread_globals.cpp
+ cpu/kernel_thread_globals.h
)
set(SRC_CUDA
- cuda/device_cuda.h
- cuda/device_cuda_impl.cpp
+ cuda/device.cpp
+ cuda/device.h
+ cuda/device_impl.cpp
+ cuda/device_impl.h
+ cuda/graphics_interop.cpp
+ cuda/graphics_interop.h
+ cuda/kernel.cpp
+ cuda/kernel.h
+ cuda/queue.cpp
+ cuda/queue.h
+ cuda/util.cpp
+ cuda/util.h
)
-set(SRC_OPENCL
- opencl/device_opencl.h
- opencl/device_opencl_impl.cpp
- opencl/memory_manager.h
- opencl/memory_manager.cpp
- opencl/opencl_util.cpp
+set(SRC_DUMMY
+ dummy/device.cpp
+ dummy/device.h
)
-if(WITH_CYCLES_NETWORK)
- list(APPEND SRC
- device_network.cpp
- )
-endif()
+set(SRC_MULTI
+ multi/device.cpp
+ multi/device.h
+)
+
+set(SRC_OPTIX
+ optix/device.cpp
+ optix/device.h
+ optix/device_impl.cpp
+ optix/device_impl.h
+ optix/queue.cpp
+ optix/queue.h
+ optix/util.h
+)
set(SRC_HEADERS
device.h
- device_denoising.h
+ device_denoise.h
+ device_graphics_interop.h
device_memory.h
- device_intern.h
- device_network.h
- device_split_kernel.h
- device_task.h
+ device_kernel.h
+ device_queue.h
)
set(LIB
- cycles_render
cycles_kernel
cycles_util
${CYCLES_GL_LIBRARIES}
@@ -95,15 +116,7 @@ else()
endif()
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-if(WITH_CYCLES_DEVICE_OPENCL)
- list(APPEND LIB
- extern_clew
- )
- add_definitions(-DWITH_OPENCL)
-endif()
+
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
@@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI)
endif()
if(WITH_OPENIMAGEDENOISE)
- add_definitions(-DWITH_OPENIMAGEDENOISE)
- add_definitions(-DOIDN_STATIC_LIB)
- list(APPEND INC_SYS
- ${OPENIMAGEDENOISE_INCLUDE_DIRS}
- )
list(APPEND LIB
${OPENIMAGEDENOISE_LIBRARIES}
- ${TBB_LIBRARIES}
)
endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}"
+ ${SRC}
+ ${SRC_CPU}
+ ${SRC_CUDA}
+ ${SRC_DUMMY}
+ ${SRC_MULTI}
+ ${SRC_OPTIX}
+ ${SRC_HEADERS}
+)
+
+source_group("cpu" FILES ${SRC_CPU})
+source_group("cuda" FILES ${SRC_CUDA})
+source_group("dummy" FILES ${SRC_DUMMY})
+source_group("multi" FILES ${SRC_MULTI})
+source_group("optix" FILES ${SRC_OPTIX})
+source_group("common" FILES ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp
new file mode 100644
index 00000000000..68ca8e8bb22
--- /dev/null
+++ b/intern/cycles/device/cpu/device.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device.h"
+#include "device/cpu/device_impl.h"
+
+/* Used for `info.denoisers`. */
+/* TODO(sergey): The denoisers should probably be moved completely out of the device into their
+ * own class. But until then keep the API consistent with how it used to work before. */
+#include "util/util_openimagedenoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new CPUDevice(info, stats, profiler);
+}
+
+void device_cpu_info(vector<DeviceInfo> &devices)
+{
+ DeviceInfo info;
+
+ info.type = DEVICE_CPU;
+ info.description = system_cpu_brand_string();
+ info.id = "CPU";
+ info.num = 0;
+ info.has_osl = true;
+ info.has_half_images = true;
+ info.has_nanovdb = true;
+ info.has_profiling = true;
+ if (openimagedenoise_supported()) {
+ info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+ }
+
+ devices.insert(devices.begin(), info);
+}
+
+string device_cpu_capabilities()
+{
+ string capabilities = "";
+ capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
+ capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
+ capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+ capabilities += system_cpu_support_avx() ? "AVX " : "";
+ capabilities += system_cpu_support_avx2() ? "AVX2" : "";
+ if (capabilities[capabilities.size() - 1] == ' ')
+ capabilities.resize(capabilities.size() - 1);
+ return capabilities;
+}
+
+CCL_NAMESPACE_END
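
A hedged sketch of how the factory functions above fit together; the real call sites live in the generic device registry, and `stats` / `profiler` are assumed to exist at the call site:

vector<DeviceInfo> devices;
device_cpu_info(devices); /* Inserts the single CPU entry at the front. */

if (!devices.empty()) {
  Device *cpu_device = device_cpu_create(devices.front(), stats, profiler);
  /* ... render with the device ... */
  delete cpu_device;
}
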
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/device/cpu/device.h
index dcea2630aef..9cb2e80068d 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/device/cpu/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,22 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_buffer_update.h"
+#pragma once
-#define KERNEL_NAME buffer_update
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cpu_info(vector<DeviceInfo> &devices);
+
+string device_cpu_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
new file mode 100644
index 00000000000..3b0db6bdd0e
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device_impl.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "device/device.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "bvh/bvh_embree.h"
+
+#include "render/buffers.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
+#include "util/util_optimization.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ /* Pick any kernel, all of them are supposed to have the same level of microarchitecture
+ * optimization. */
+ VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name()
+ << " kernels.";
+
+ if (info.cpu_threads == 0) {
+ info.cpu_threads = TaskScheduler::num_threads();
+ }
+
+#ifdef WITH_OSL
+ kernel_globals.osl = &osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ embree_device = rtcNewDevice("verbose=0");
+#endif
+ need_texture_info = false;
+}
+
+CPUDevice::~CPUDevice()
+{
+#ifdef WITH_EMBREE
+ rtcReleaseDevice(embree_device);
+#endif
+
+ texture_info.free();
+}
+
+bool CPUDevice::show_samples() const
+{
+ return (info.cpu_threads == 1);
+}
+
+BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
+{
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
+#ifdef WITH_EMBREE
+ bvh_layout_mask |= BVH_LAYOUT_EMBREE;
+#endif /* WITH_EMBREE */
+ return bvh_layout_mask;
+}
+
+bool CPUDevice::load_texture_info()
+{
+ if (!need_texture_info) {
+ return false;
+ }
+
+ texture_info.copy_to_device();
+ need_texture_info = false;
+
+ return true;
+}
+
+void CPUDevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ if (mem.type == MEM_DEVICE_ONLY) {
+ assert(!mem.host_pointer);
+ size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
+ void *data = util_aligned_malloc(mem.memory_size(), alignment);
+ mem.device_pointer = (device_ptr)data;
+ }
+ else {
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ }
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
+}
+
+void CPUDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ /* copy is no-op */
+ }
+}
+
+void CPUDevice::mem_copy_from(
+ device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+{
+ /* no-op */
+}
+
+void CPUDevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.device_pointer) {
+ memset((void *)mem.device_pointer, 0, mem.memory_size());
+ }
+}
+
+void CPUDevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else if (mem.device_pointer) {
+ if (mem.type == MEM_DEVICE_ONLY) {
+ util_aligned_free((void *)mem.device_pointer);
+ }
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+#if WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Update the scene handle (it differs between devices in a multi-device setup).
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
+ kernel_const_copy(&kernel_globals, name, host, size);
+}
+
+void CPUDevice::global_alloc(device_memory &mem)
+{
+ VLOG(1) << "Global memory allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+}
+
+void CPUDevice::global_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+void CPUDevice::tex_alloc(device_texture &mem)
+{
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ texture_info[slot] = mem.info;
+ texture_info[slot].data = (uint64_t)mem.host_pointer;
+ need_texture_info = true;
+}
+
+void CPUDevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ need_texture_info = true;
+ }
+}
+
+void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+#ifdef WITH_EMBREE
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
+#endif
+ Device::build_bvh(bvh, progress, refit);
+}
+
+#if 0
+void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+{
+ const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
+
+ scoped_timer timer(&tile.buffers->render_time);
+
+ Coverage coverage(kg, tile);
+ if (use_coverage) {
+ coverage.init_path_trace();
+ }
+
+ float *render_buffer = (float *)tile.buffer;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ /* Needed for Embree. */
+ SIMD_SET_FLUSH_TO_ZERO;
+
+ for (int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+
+ if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
+ tile.stealing_state = RenderTile::WAS_STOLEN;
+ break;
+ }
+
+ if (tile.task == RenderTile::PATH_TRACE) {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ if (use_coverage) {
+ coverage.init_pixel(x, y);
+ }
+ kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ else {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ tile.sample = sample + 1;
+
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ const int num_progress_samples = end_sample - sample;
+ tile.sample = end_sample;
+ task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
+ break;
+ }
+ }
+
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+ if (use_coverage) {
+ coverage.finalize();
+ }
+
+ if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
+ adaptive_sampling_post(tile, kg);
+ }
+}
+
+void CPUDevice::thread_render(DeviceTask &task)
+{
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ return;
+ }
+
+ /* allocate buffer for kernel globals */
+ CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
+
+ profiler.add_state(&kg.profiler);
+
+ /* NLM denoiser. */
+ DenoisingTask *denoising = NULL;
+
+ /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+ * avoid waiting with mutex locks in the denoiser, we let only a single
+ * thread acquire denoising tiles. */
+ uint tile_types = task.tile_types;
+ bool hold_denoise_lock = false;
+ if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ if (!oidn_task_lock.try_lock()) {
+ tile_types &= ~RenderTile::DENOISE;
+ hold_denoise_lock = true;
+ }
+ }
+
+ RenderTile tile;
+ while (task.acquire_tile(this, tile, tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ denoise_openimagedenoise(task, tile);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+
+ task.release_tile(tile);
+
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ if (hold_denoise_lock) {
+ oidn_task_lock.unlock();
+ }
+
+ profiler.remove_state(&kg.profiler);
+
+ delete denoising;
+}
+
+void CPUDevice::thread_denoise(DeviceTask &task)
+{
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ denoise_openimagedenoise(task, tile);
+
+ task.update_progress(&tile, tile.w * tile.h);
+}
+#endif
+
+const CPUKernels *CPUDevice::get_cpu_kernels() const
+{
+ return &kernels;
+}
+
+void CPUDevice::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ /* Ensure latest texture info is loaded into kernel globals before returning. */
+ load_texture_info();
+
+ kernel_thread_globals.clear();
+ void *osl_memory = get_cpu_osl_memory();
+ for (int i = 0; i < info.cpu_threads; i++) {
+ kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler);
+ }
+}
+
+void *CPUDevice::get_cpu_osl_memory()
+{
+#ifdef WITH_OSL
+ return &osl_globals;
+#else
+ return NULL;
+#endif
+}
+
+bool CPUDevice::load_kernels(const uint /*kernel_features*/)
+{
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
new file mode 100644
index 00000000000..7d222808652
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+#include "device/device_memory.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+class CPUDevice : public Device {
+ public:
+ KernelGlobals kernel_globals;
+
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+#ifdef WITH_OSL
+ OSLGlobals osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
+ RTCDevice embree_device;
+#endif
+
+ CPUKernels kernels;
+
+ CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
+ ~CPUDevice();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ /* Returns true if the texture info was copied to the device (meaning, some more
+ * re-initialization might be needed). */
+ bool load_texture_info();
+
+ virtual void mem_alloc(device_memory &mem) override;
+ virtual void mem_copy_to(device_memory &mem) override;
+ virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+ virtual void mem_zero(device_memory &mem) override;
+ virtual void mem_free(device_memory &mem) override;
+ virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+ void tex_free(device_texture &mem);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ virtual const CPUKernels *get_cpu_kernels() const override;
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
+ virtual void *get_cpu_osl_memory() override;
+
+ protected:
+ virtual bool load_kernels(uint /*kernel_features*/) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..0ab58ff8600
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel.h"
+
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
+
+#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+
+CPUKernels::CPUKernels()
+ : /* Integrator. */
+ REGISTER_KERNEL(integrator_init_from_camera),
+ REGISTER_KERNEL(integrator_init_from_bake),
+ REGISTER_KERNEL(integrator_intersect_closest),
+ REGISTER_KERNEL(integrator_intersect_shadow),
+ REGISTER_KERNEL(integrator_intersect_subsurface),
+ REGISTER_KERNEL(integrator_intersect_volume_stack),
+ REGISTER_KERNEL(integrator_shade_background),
+ REGISTER_KERNEL(integrator_shade_light),
+ REGISTER_KERNEL(integrator_shade_shadow),
+ REGISTER_KERNEL(integrator_shade_surface),
+ REGISTER_KERNEL(integrator_shade_volume),
+ REGISTER_KERNEL(integrator_megakernel),
+ /* Shader evaluation. */
+ REGISTER_KERNEL(shader_eval_displace),
+ REGISTER_KERNEL(shader_eval_background),
+ /* Adaptive sampling. */
+ REGISTER_KERNEL(adaptive_sampling_convergence_check),
+ REGISTER_KERNEL(adaptive_sampling_filter_x),
+ REGISTER_KERNEL(adaptive_sampling_filter_y),
+ /* Cryptomatte. */
+ REGISTER_KERNEL(cryptomatte_postprocess),
+ /* Bake. */
+ REGISTER_KERNEL(bake)
+{
+}
+
+#undef REGISTER_KERNEL
+#undef KERNEL_FUNCTIONS
+
+CCL_NAMESPACE_END
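
For readability, REGISTER_KERNEL(name) expands to name(KERNEL_FUNCTIONS(name)), so each member initializer receives six entry points, one per microarchitecture:

/* Expansion of REGISTER_KERNEL(bake); the concrete symbol names come from
 * KERNEL_NAME_EVAL in the kernel headers and are not shown in this diff. */
bake(KERNEL_NAME_EVAL(cpu, bake),
     KERNEL_NAME_EVAL(cpu_sse2, bake),
     KERNEL_NAME_EVAL(cpu_sse3, bake),
     KERNEL_NAME_EVAL(cpu_sse41, bake),
     KERNEL_NAME_EVAL(cpu_avx, bake),
     KERNEL_NAME_EVAL(cpu_avx2, bake))
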
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
new file mode 100644
index 00000000000..54b18308544
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/cpu/kernel_function.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelGlobals;
+struct IntegratorStateCPU;
+struct TileInfo;
+
+class CPUKernels {
+ public:
+ /* Integrator. */
+
+ using IntegratorFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>;
+ using IntegratorShadeFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>;
+ using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ IntegratorStateCPU *state,
+ KernelWorkTile *tile,
+ ccl_global float *render_buffer)>;
+
+ IntegratorInitFunction integrator_init_from_camera;
+ IntegratorInitFunction integrator_init_from_bake;
+ IntegratorFunction integrator_intersect_closest;
+ IntegratorFunction integrator_intersect_shadow;
+ IntegratorFunction integrator_intersect_subsurface;
+ IntegratorFunction integrator_intersect_volume_stack;
+ IntegratorShadeFunction integrator_shade_background;
+ IntegratorShadeFunction integrator_shade_light;
+ IntegratorShadeFunction integrator_shade_shadow;
+ IntegratorShadeFunction integrator_shade_surface;
+ IntegratorShadeFunction integrator_shade_volume;
+ IntegratorShadeFunction integrator_megakernel;
+
+ /* Shader evaluation. */
+
+ using ShaderEvalFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>;
+
+ ShaderEvalFunction shader_eval_displace;
+ ShaderEvalFunction shader_eval_background;
+
+ /* Adaptive stopping. */
+
+ using AdaptiveSamplingConvergenceCheckFunction =
+ CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterXFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterYFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)>;
+
+ AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check;
+
+ AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x;
+ AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y;
+
+ /* Cryptomatte. */
+
+ using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>;
+
+ CryptomattePostprocessFunction cryptomatte_postprocess;
+
+ /* Bake. */
+
+ CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake;
+
+ CPUKernels();
+};
+
+CCL_NAMESPACE_END
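
A minimal usage sketch, assuming a caller that already has kernel globals, integrator state and a render buffer; the scheduling code around it is not part of this file:

void run_megakernel(const CPUKernels &kernels,
                    const KernelGlobals *kg,
                    IntegratorStateCPU *state,
                    float *render_buffer)
{
  /* CPUKernelFunction routes the call to the best compiled variant (SSE2/SSE3/SSE4.1/AVX/AVX2). */
  kernels.integrator_megakernel(kg, state, render_buffer);
}
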
diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h
new file mode 100644
index 00000000000..aa18720cc24
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_function.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_debug.h"
+#include "util/util_system.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* A wrapper around the per-microarchitecture variants of a kernel function.
+ *
+ * Provides a function-call-like API which gets routed to the most suitable implementation.
+ *
+ * For example, on a computer which only has SSE4.1, the kernel_sse41 variant will be used. */
+template<typename FunctionType> class CPUKernelFunction {
+ public:
+ CPUKernelFunction(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ kernel_info_ = get_best_kernel_info(
+ kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2);
+ }
+
+ template<typename... Args> inline auto operator()(Args... args) const
+ {
+ assert(kernel_info_.kernel);
+
+ return kernel_info_.kernel(args...);
+ }
+
+ const char *get_uarch_name() const
+ {
+ return kernel_info_.uarch_name;
+ }
+
+ protected:
+ /* Helper class which allows passing a human-readable microarchitecture name together with the
+ * function pointer. */
+ class KernelInfo {
+ public:
+ KernelInfo() : KernelInfo("", nullptr)
+ {
+ }
+
+ /* TODO(sergey): Use string view, to have higher-level functionality (i.e. comparison) without
+ * memory allocation. */
+ KernelInfo(const char *uarch_name, FunctionType kernel)
+ : uarch_name(uarch_name), kernel(kernel)
+ {
+ }
+
+ const char *uarch_name;
+ FunctionType kernel;
+ };
+
+ KernelInfo get_best_kernel_info(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ /* Silence warnings about unused variables when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+ return KernelInfo("AVX2", kernel_avx2);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
+ return KernelInfo("AVX", kernel_avx);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
+ return KernelInfo("SSE4.1", kernel_sse41);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
+ return KernelInfo("SSE3", kernel_sse3);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+ return KernelInfo("SSE2", kernel_sse2);
+ }
+#endif
+
+ return KernelInfo("default", kernel_default);
+ }
+
+ KernelInfo kernel_info_;
+};
+
+CCL_NAMESPACE_END
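
A toy illustration of the wrapper, with hypothetical function pointers standing in for the real kernel entry points:

static int add_default(int a, int b) { return a + b; }
static int add_simd(int a, int b) { return a + b; } /* Stand-in for all SIMD variants. */

CPUKernelFunction<int (*)(int, int)> add_kernel(
    add_default, add_simd, add_simd, add_simd, add_simd, add_simd);

const int sum = add_kernel(2, 3);                /* Routed to the best supported variant. */
const char *uarch = add_kernel.get_uarch_name(); /* E.g. "AVX2", "SSE2" or "default". */
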
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
new file mode 100644
index 00000000000..988b00cd1f0
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel_thread_globals.h"
+
+// clang-format off
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "util/util_profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler)
+ : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler)
+{
+ reset_runtime_memory();
+
+#ifdef WITH_OSL
+ OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory));
+#else
+ (void)osl_globals_memory;
+#endif
+}
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept
+ : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_)
+{
+ other.reset_runtime_memory();
+}
+
+CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
+{
+#ifdef WITH_OSL
+ OSLShader::thread_free(this);
+#endif
+}
+
+CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other)
+{
+ if (this == &other) {
+ return *this;
+ }
+
+ *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other);
+
+ other.reset_runtime_memory();
+
+ return *this;
+}
+
+void CPUKernelThreadGlobals::reset_runtime_memory()
+{
+#ifdef WITH_OSL
+ osl = nullptr;
+#endif
+}
+
+void CPUKernelThreadGlobals::start_profiling()
+{
+ cpu_profiler_.add_state(&profiler);
+}
+
+void CPUKernelThreadGlobals::stop_profiling()
+{
+ cpu_profiler_.remove_state(&profiler);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h
new file mode 100644
index 00000000000..d005c3bb56c
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Profiler;
+
+/* A special class which extends memory ownership of the `KernelGlobals`, decoupling any resource
+ * which is not thread-safe for access. Every worker thread which needs to operate on
+ * `KernelGlobals` needs to initialize its own copy of this object.
+ *
+ * NOTE: Only a minimal subset of objects is copied: `KernelData` is never copied. This means that
+ * there is no unnecessary data duplication happening when using this object. */
+class CPUKernelThreadGlobals : public KernelGlobals {
+ public:
+ /* TODO(sergey): Would be nice to have a properly typed OSLGlobals even when building
+ * without OSL support. It would avoid the need for those unnamed pointers and casts. */
+ CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler);
+
+ ~CPUKernelThreadGlobals();
+
+ CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept;
+
+ CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other);
+
+ void start_profiling();
+ void stop_profiling();
+
+ protected:
+ void reset_runtime_memory();
+
+ Profiler &cpu_profiler_;
+};
+
+CCL_NAMESPACE_END
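
A hedged sketch of the intended per-thread usage; `parallel_for` stands in for whatever scheduler drives the worker threads and is an assumption, not part of this header:

vector<CPUKernelThreadGlobals> thread_kernel_globals;
device->get_cpu_kernel_thread_globals(thread_kernel_globals); /* One entry per CPU thread. */

parallel_for(0, (int)thread_kernel_globals.size(), [&](const int thread_index) {
  CPUKernelThreadGlobals *kg = &thread_kernel_globals[thread_index];
  kg->start_profiling();
  /* ... run CPU kernels with this thread's globals ... */
  kg->stop_profiling();
});
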
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp
index 2e225ecfaf8..84becd6d081 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -14,21 +14,25 @@
* limitations under the License.
*/
-#ifdef WITH_CUDA
+#include "device/cuda/device.h"
+
+#include "util/util_logging.h"
-# include "device/cuda/device_cuda.h"
+#ifdef WITH_CUDA
+# include "device/cuda/device_impl.h"
# include "device/device.h"
-# include "device/device_intern.h"
-# include "util/util_logging.h"
# include "util/util_string.h"
# include "util/util_windows.h"
+#endif /* WITH_CUDA */
CCL_NAMESPACE_BEGIN
bool device_cuda_init()
{
-# ifdef WITH_CUDA_DYNLOAD
+#if !defined(WITH_CUDA)
+ return false;
+#elif defined(WITH_CUDA_DYNLOAD)
static bool initialized = false;
static bool result = false;
@@ -59,16 +63,27 @@ bool device_cuda_init()
}
return result;
-# else /* WITH_CUDA_DYNLOAD */
+#else /* WITH_CUDA_DYNLOAD */
return true;
-# endif /* WITH_CUDA_DYNLOAD */
+#endif /* WITH_CUDA_DYNLOAD */
}
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new CUDADevice(info, stats, profiler, background);
+#ifdef WITH_CUDA
+ return new CUDADevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
}
+#ifdef WITH_CUDA
static CUresult device_cuda_safe_init()
{
# ifdef _WIN32
@@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init()
return cuInit(0);
# endif
}
+#endif /* WITH_CUDA */
void device_cuda_info(vector<DeviceInfo> &devices)
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE)
@@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_half_images = (major >= 3);
info.has_nanovdb = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
+ info.denoisers = 0;
+
+ info.has_gpu_queue = true;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
@@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices)
if (!display_devices.empty())
devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else /* WITH_CUDA */
+ (void)devices;
+#endif /* WITH_CUDA */
}
string device_cuda_capabilities()
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE) {
@@ -310,8 +331,10 @@ string device_cuda_capabilities()
}
return capabilities;
+
+#else /* WITH_CUDA */
+ return "";
+#endif /* WITH_CUDA */
}
CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/device/cuda/device.h
index e68d4104a91..b0484904d1a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
+++ b/intern/cycles/device/cuda/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,24 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
+#pragma once
-#define KERNEL_NAME enqueue_inactive
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_cuda_init();
+
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cuda_info(vector<DeviceInfo> &devices);
+
+string device_cuda_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
deleted file mode 100644
index c3271c3cfcf..00000000000
--- a/intern/cycles/device/cuda/device_cuda.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_task.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include "cuew.h"
-# else
-# include "util/util_opengl.h"
-# include <cuda.h>
-# include <cudaGL.h>
-# endif
-
-CCL_NAMESPACE_BEGIN
-
-class CUDASplitKernel;
-
-class CUDADevice : public Device {
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
-
- public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int pitch_alignment;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem() : texobject(0), array(0), use_mapped_host(false)
- {
- }
-
- CUtexObject texobject;
- CUarray array;
-
- /* If true, a mapped host memory in shared_pointer is being used. */
- bool use_mapped_host;
- };
- typedef map<device_memory *, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
- thread_mutex cuda_mem_map_mutex;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- /* Kernels */
- struct {
- bool loaded;
-
- CUfunction adaptive_stopping;
- CUfunction adaptive_filter_x;
- CUfunction adaptive_filter_y;
- CUfunction adaptive_scale_samples;
- int adaptive_num_threads_per_block;
- } functions;
-
- static bool have_precompiled_kernels();
-
- virtual bool show_samples() const override;
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override;
-
- void set_error(const string &error) override;
-
- CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
-
- virtual ~CUDADevice();
-
- bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
-
- bool check_peer_access(Device *peer_device) override;
-
- bool use_adaptive_compilation();
-
- bool use_split_kernel();
-
- virtual string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false);
-
- string compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base = "cuda",
- bool force_ptx = false);
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
-
- void load_functions();
-
- void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
-
- void init_host_memory();
-
- void load_texture_info();
-
- void move_textures_to_host(size_t size, bool for_texture);
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
- void generic_copy_to(device_memory &mem);
-
- void generic_free(device_memory &mem);
-
- void mem_alloc(device_memory &mem) override;
-
- void mem_copy_to(device_memory &mem) override;
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
-
- void mem_zero(device_memory &mem) override;
-
- void mem_free(device_memory &mem) override;
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override;
-
- void global_alloc(device_memory &mem);
-
- void global_free(device_memory &mem);
-
- void tex_alloc(device_texture &mem);
-
- void tex_free(device_texture &mem);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
-
- bool denoising_construct_transform(DenoisingTask *task);
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- void denoise(RenderTile &rtile, DenoisingTask &denoising);
-
- void adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
- void adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
-
- void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
-
- void shader(DeviceTask &task);
-
- CUdeviceptr map_pixels(device_ptr mem);
-
- void unmap_pixels(device_ptr mem);
-
- void pixels_alloc(device_memory &mem);
-
- void pixels_copy_from(device_memory &mem, int y, int w, int h);
-
- void pixels_free(device_memory &mem);
-
- void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override;
-
- void thread_run(DeviceTask &task);
-
- virtual void task_add(DeviceTask &task) override;
-
- virtual void task_wait() override;
-
- virtual void task_cancel() override;
-};
-
-CCL_NAMESPACE_END
-
-#endif
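The deleted header above tracked per-allocation CUDA state (texture object, array handle, mapped-host flag) in cuda_mem_map, a mutex-guarded map keyed by the device_memory pointer. The sketch below illustrates only that bookkeeping idea; DeviceMemory and PerAllocState are hypothetical stand-ins for the real types.

// Sketch of a mutex-guarded per-allocation metadata map, in the spirit of
// CUDAMem / cuda_mem_map above. Types here are illustrative stand-ins.
#include <cstdint>
#include <cstdio>
#include <map>
#include <mutex>

struct DeviceMemory { /* stand-in for device_memory */
  const char *name;
};

struct PerAllocState { /* stand-in for CUDAMem */
  uint64_t texobject = 0;
  bool use_mapped_host = false;
};

static std::map<DeviceMemory *, PerAllocState> alloc_map;
static std::mutex alloc_map_mutex;

static void register_alloc(DeviceMemory *mem, bool mapped_host)
{
  std::lock_guard<std::mutex> lock(alloc_map_mutex);
  alloc_map[mem].use_mapped_host = mapped_host;
}

static void free_alloc(DeviceMemory *mem)
{
  std::lock_guard<std::mutex> lock(alloc_map_mutex);
  alloc_map.erase(mem);
}

int main()
{
  DeviceMemory buffer = {"render_buffer"};
  register_alloc(&buffer, /*mapped_host=*/false);
  std::printf("tracked allocations: %zu\n", alloc_map.size());
  free_alloc(&buffer);
  std::printf("tracked allocations: %zu\n", alloc_map.size());
  return 0;
}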
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
deleted file mode 100644
index 2d2fcb38705..00000000000
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ /dev/null
@@ -1,2714 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include <climits>
-# include <limits.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <string.h>
-
-# include "device/cuda/device_cuda.h"
-# include "device/device_intern.h"
-# include "device/device_split_kernel.h"
-
-# include "render/buffers.h"
-
-# include "kernel/filter/filter_defines.h"
-
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_map.h"
-# include "util/util_md5.h"
-# include "util/util_opengl.h"
-# include "util/util_path.h"
-# include "util/util_string.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-# include "util/util_types.h"
-# include "util/util_windows.h"
-
-# include "kernel/split/kernel_split_data_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-# ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so the majority of the file does not
- * need to worry about the difference between dynamically loaded and linked CUDA
- * at all.
- */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
- /* We can only give the error code here without major code duplication; that
- * should be enough, since dynamic loading is only disabled by folks who know
- * what they're doing anyway.
- *
- * NOTE: Avoid calling from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-# endif /* WITH_CUDA_DYNLOAD */
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-
- public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
-
- private:
- CUDADevice *device;
-};
-
-bool CUDADevice::have_precompiled_kernels()
-{
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
-}
-
-bool CUDADevice::show_samples() const
-{
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
-}
-
-BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
-{
- return BVH_LAYOUT_BVH2;
-}
-
-void CUDADevice::set_error(const string &error)
-{
- Device::set_error(error);
-
- if (first_error) {
- fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr,
- "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
-}
-
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
- pitch_alignment = 0;
-
- functions.loaded = false;
-
- /* Initialize CUDA. */
- CUresult result = cuInit(0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
- return;
- }
-
- /* Setup device and context. */
- result = cuDeviceGet(&cuDevice, cuDevId);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
- cuewErrorString(result)));
- return;
- }
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- cuda_assert(cuDeviceGetAttribute(
- &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if (can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- if (background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if (result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
- return;
- }
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major * 100 + minor * 10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
-}
-
-CUDADevice::~CUDADevice()
-{
- task_pool.cancel();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
-}
-
-bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
-{
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* We only support sm_30 and above */
- if (major < 3) {
- set_error(string_printf(
- "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
- return false;
- }
-
- return true;
-}
-
-bool CUDADevice::check_peer_access(Device *peer_device)
-{
- if (peer_device == this) {
- return false;
- }
- if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
- return false;
- }
-
- CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
-
- int can_access = 0;
- cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- // Ensure array access over the link is possible as well (for 3D textures)
- cuda_assert(cuDeviceGetP2PAttribute(&can_access,
- CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
- cuDevice,
- peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- // Enable peer access in both directions
- {
- const CUDAContextScope scope(this);
- CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
- {
- const CUDAContextScope scope(peer_device_cuda);
- CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
-
- return true;
-}
-
-bool CUDADevice::use_adaptive_compilation()
-{
- return DebugFlags().cuda.adaptive_compile;
-}
-
-bool CUDADevice::use_split_kernel()
-{
- return DebugFlags().cuda.split_kernel;
-}
-
-/* Common NVCC flags which stay the same regardless of shading model or
- * kernel sources md5, and only depend on compiler or compilation settings.
- */
-string CUDADevice::compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter, bool split)
-{
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf(
- "-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if (!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if (extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
-
- if (split) {
- cflags += " -D__SPLIT__";
- }
-
-# ifdef WITH_NANOVDB
- cflags += " -DWITH_NANOVDB";
-# endif
-
- return cflags;
-}
-
-string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base,
- bool force_ptx)
-{
- /* Compute kernel name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if (!use_adaptive_compilation()) {
- if (!force_ptx) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- }
-
- /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
- int ptx_major = major, ptx_minor = minor;
- while (ptx_major >= 3) {
- const string ptx = path_get(
- string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
- VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
- if (path_exists(ptx)) {
- VLOG(1) << "Using precompiled kernel.";
- return ptx;
- }
-
- if (ptx_minor > 0) {
- ptx_minor--;
- }
- else {
- ptx_major--;
- ptx_minor = 9;
- }
- }
- }
-
- /* Try to use locally compiled kernel. */
- string source_path = path_get("source");
- const string source_md5 = path_files_md5_hash(source_path);
-
- /* We include the cflags in the md5, so that changing the CUDA toolkit or other
- * compiler command line arguments makes sure the cubin gets re-built.
- */
- string common_cflags = compile_kernel_get_common_cflags(
- requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
- const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
-
- const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
- const char *const kernel_arch = force_ptx ? "compute" : "sm";
- const string cubin_file = string_printf(
- "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
-
-# ifdef _WIN32
- if (!use_adaptive_compilation() && have_precompiled_kernels()) {
- if (major < 3) {
- set_error(
- string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
- "Your GPU is not supported.",
- major,
- minor));
- }
- else {
- set_error(
- string_printf("CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major,
- minor));
- }
- return string();
- }
-# endif
-
- /* Compile. */
- const char *const nvcc = cuewCompilerPath();
- if (nvcc == NULL) {
- set_error(
- "CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return string();
- }
-
- const int nvcc_cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
- if (nvcc_cuda_version < 101) {
- printf(
- "Unsupported CUDA version %d.%d detected, "
- "you need CUDA 10.1 or newer.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- return string();
- }
- else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
- nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
- printf(
- "CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 to 11.4 are officially supported.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- }
-
- double starttime = time_dt();
-
- path_create_directories(cubin);
-
- source_path = path_join(path_join(source_path, "kernel"),
- path_join("kernels", path_join(base, string_printf("%s.cu", name))));
-
- string command = string_printf(
- "\"%s\" "
- "-arch=%s_%d%d "
- "--%s \"%s\" "
- "-o \"%s\" "
- "%s",
- nvcc,
- kernel_arch,
- major,
- minor,
- kernel_ext,
- source_path.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
-
-# ifdef _WIN32
- command = "call " + command;
-# endif
- if (system(command.c_str()) != 0) {
- set_error(
- "Failed to execute compilation command, "
- "see console for details.");
- return string();
- }
-
- /* Verify if compilation succeeded */
- if (!path_exists(cubin)) {
- set_error(
- "CUDA kernel compilation failed, "
- "see console for details.");
- return string();
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
-}
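The lookup order implemented by compile_kernel() above can be summarized as: try an exact sm_XY cubin shipped with Blender first, then fall back to compute_XY PTX, walking the version downward because the driver can JIT-compile PTX built for older architectures. A minimal sketch of that candidate enumeration (file names illustrative only):

// Sketch of the pre-compiled kernel lookup order: exact sm_XY cubin first,
// then compute_XY PTX for this or any older compute capability, newest first.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> kernel_candidates(const char *name, int major, int minor)
{
  std::vector<std::string> candidates;
  char buf[128];

  /* 1. Exact binary for this compute capability. */
  std::snprintf(buf, sizeof(buf), "lib/%s_sm_%d%d.cubin", name, major, minor);
  candidates.push_back(buf);

  /* 2. PTX, walking the version downward as in compile_kernel(). */
  int ptx_major = major, ptx_minor = minor;
  while (ptx_major >= 3) {
    std::snprintf(buf, sizeof(buf), "lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor);
    candidates.push_back(buf);
    if (ptx_minor > 0) {
      ptx_minor--;
    }
    else {
      ptx_major--;
      ptx_minor = 9;
    }
  }
  return candidates;
}

int main()
{
  for (const std::string &path : kernel_candidates("kernel", 7, 5)) {
    std::printf("%s\n", path.c_str());
  }
  return 0;
}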
-
-bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- /* TODO(sergey): Support kernel re-loading for CUDA devices.
- *
- * Currently re-loading the kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if (cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* check if cuda init succeeded */
- if (cuContext == 0)
- return false;
-
- /* check if GPU is supported */
- if (!support_device(requested_features))
- return false;
-
- /* get kernel */
- const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
- string cubin = compile_kernel(requested_features, kernel_name);
- if (cubin.empty())
- return false;
-
- const char *filter_name = "filter";
- string filter_cubin = compile_kernel(requested_features, filter_name);
- if (filter_cubin.empty())
- return false;
-
- /* open module */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if (path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf(
- "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
-
- if (path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
- filter_cubin.c_str(),
- cuewErrorString(result)));
-
- if (result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- load_functions();
-
- return (result == CUDA_SUCCESS);
-}
-
-void CUDADevice::load_functions()
-{
- /* TODO: load all functions here. */
- if (functions.loaded) {
- return;
- }
- functions.loaded = true;
-
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
-
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
-
- int unused_min_blocks;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
- &functions.adaptive_num_threads_per_block,
- functions.adaptive_scale_samples,
- NULL,
- 0,
- 0));
-}
-
-void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
-{
- if (use_split_kernel()) {
- /* The split kernel mostly uses global memory and adaptive compilation, so it
- * is currently difficult to predict how much is needed. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuRender;
-
- if (requested_features.use_baking) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-
- /* Launch the kernel; using just 1 block appears sufficient to reserve
- * memory for all multiprocessors. It would still be good to do this in
- * parallel for the multi-GPU case to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-# if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while (free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
-# endif
-}
-
-void CUDADevice::init_host_memory()
-{
- /* Limit amount of host mapped memory, because allocating too much can
- * cause system instability. Leave at least half or 4 GB of system
- * memory free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep free after texture memory
- * and working memory allocations respectively. We set the working
- * memory limit headroom lower so that some space is left after all
- * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
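The arithmetic in init_host_memory() above amounts to: leave at least half of system RAM or 4 GB free, whichever is smaller. For example, 32 GB of RAM gives a 28 GB mapped-host limit, while 6 GB gives 3 GB. A small sketch of that calculation, assuming only the logic shown above:

// Sketch of the mapped-host memory limit: keep at least half of system RAM
// or 4 GB free, whichever is smaller. Values are illustrative.
#include <cstdint>
#include <cstdio>

static uint64_t map_host_limit(uint64_t system_ram)
{
  const uint64_t default_limit = 4ull * 1024 * 1024 * 1024; /* 4 GB */
  if (system_ram == 0) {
    return 0; /* Unknown RAM size: disable mapped host memory. */
  }
  if (system_ram / 2 > default_limit) {
    return system_ram - default_limit; /* Keep 4 GB free. */
  }
  return system_ram / 2; /* Keep half of RAM free. */
}

int main()
{
  const uint64_t GB = 1024ull * 1024 * 1024;
  std::printf("32 GB system -> limit %llu GB\n",
              (unsigned long long)(map_host_limit(32 * GB) / GB));
  std::printf(" 6 GB system -> limit %llu GB\n",
              (unsigned long long)(map_host_limit(6 * GB) / GB));
  return 0;
}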
-
-void CUDADevice::load_texture_info()
-{
- if (need_texture_info) {
- /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
- * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
- need_texture_info = false;
- texture_info.copy_to_device();
- }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
- /* Break out of recursive call, which can happen when moving memory on a multi device. */
- static bool any_device_moving_textures_to_host = false;
- if (any_device_moving_textures_to_host) {
- return;
- }
-
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- /* Can only move textures allocated on this device (and not those from peer devices).
- * And need to ignore memory that is already on the host. */
- if (!mem.is_resident(this) || cmem->use_mapped_host) {
- continue;
- }
-
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
- (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
- lock.unlock();
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- any_device_moving_textures_to_host = true;
-
- /* Potentially need to call back into the multi device, so pointer mapping
- * and peer devices are updated. This is also necessary since the device
- * pointer may just be a key here, so it cannot be accessed and freed directly.
- * Unfortunately it does mean that memory is reallocated on all other
- * devices as well, which is potentially dangerous when still in use (since
- * a thread rendering on another device would only be caught in this mutex
- * if it happens to do an allocation at the same time as well). */
- max_mem->device_copy_to();
- size = (max_size >= size) ? 0 : size - max_size;
-
- any_device_moving_textures_to_host = false;
- }
- else {
- break;
- }
- }
-
- /* Unset flag before texture info is reloaded, since it should stay in device memory. */
- move_texture_to_host = false;
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-}
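The selection loop in move_textures_to_host() above prefers image textures over other texture memory and, among candidates of the same kind, moves the largest allocation first. A standalone sketch of that heuristic with illustrative data:

// Sketch of the victim-selection heuristic: prefer images, then largest size.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Allocation { /* illustrative stand-in for entries in cuda_mem_map */
  const char *name;
  bool is_image;
  size_t size;
};

static const Allocation *pick_victim(const std::vector<Allocation> &allocs)
{
  const Allocation *best = nullptr;
  bool best_is_image = false;
  size_t best_size = 0;
  for (const Allocation &a : allocs) {
    /* Prefer moving images; among equals, move the largest allocation. */
    if (a.is_image > best_is_image || (a.is_image == best_is_image && a.size > best_size)) {
      best_is_image = a.is_image;
      best_size = a.size;
      best = &a;
    }
  }
  return best;
}

int main()
{
  std::vector<Allocation> allocs = {
      {"lookup_table", false, 512u << 20},
      {"env_image", true, 128u << 20},
      {"hdr_image", true, 256u << 20},
  };
  std::printf("move first: %s\n", pick_victim(allocs)->name); /* hdr_image */
  return 0;
}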
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
-{
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
-
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
- }
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- if (mem.type == MEM_DEVICE_ONLY) {
- status = " failed, out of device memory";
- set_error("System is out of GPU memory");
- }
- else {
- status = " failed, out of device and host memory";
- set_error("System is out of GPU and shared host memory");
- }
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- CUDAMem *cmem = &cuda_mem_map[&mem];
- if (shared_pointer != 0) {
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
-
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != shared_pointer) {
- memcpy(shared_pointer, mem.host_pointer, size);
-
- /* A call to device_memory::host_free() should be preceded by
- * a call to device_memory::device_free() for host memory
- * allocated by a device to be handled properly. Two exceptions
- * are here and a call in OptiXDevice::generic_alloc(), where
- * the current host memory can be assumed to be allocated by
- * device_memory::host_alloc(), not by a device. */
-
- mem.host_free();
- mem.host_pointer = shared_pointer;
- }
- mem.shared_pointer = shared_pointer;
- mem.shared_counter++;
- cmem->use_mapped_host = true;
- }
- else {
- cmem->use_mapped_host = false;
- }
-
- return cmem;
-}
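generic_alloc() above first tries device memory while respecting a headroom below the reported free amount, and only then falls back to mapped host memory when the device can map host allocations. A simplified sketch of that placement decision (sizes illustrative, error handling omitted):

// Sketch of the headroom-based allocation policy with a mapped-host fallback.
#include <cstdint>
#include <cstdio>

enum class Placement { Device, MappedHost, Failed };

static Placement choose_placement(uint64_t size,
                                  uint64_t free_device,
                                  uint64_t headroom,
                                  bool can_map_host)
{
  if (size + headroom < free_device) {
    return Placement::Device; /* Fits below the headroom threshold. */
  }
  if (can_map_host) {
    return Placement::MappedHost; /* Zero-copy host memory as a fallback. */
  }
  return Placement::Failed; /* Out of device memory and no fallback. */
}

int main()
{
  const uint64_t MB = 1024ull * 1024;
  Placement p = choose_placement(900 * MB, 1024 * MB, 32 * MB, true);
  std::printf("placement: %d\n", (int)p); /* 0 = Device */
  p = choose_placement(1200 * MB, 1024 * MB, 32 * MB, true);
  std::printf("placement: %d\n", (int)p); /* 1 = MappedHost */
  return 0;
}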
-
-void CUDADevice::generic_copy_to(device_memory &mem)
-{
- if (!mem.host_pointer || !mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
- * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
- * mem.host_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
- }
-}
-
-void CUDADevice::generic_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- /* If cmem.use_mapped_host is true, reference counting is used
- * to safely free a mapped host memory. */
-
- if (cmem.use_mapped_host) {
- assert(mem.shared_pointer);
- if (mem.shared_pointer) {
- assert(mem.shared_counter > 0);
- if (--mem.shared_counter == 0) {
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- cuMemFreeHost(mem.shared_pointer);
- mem.shared_pointer = 0;
- }
- }
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuda_assert(cuMemFree(mem.device_pointer));
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
-}
-
-void CUDADevice::mem_alloc(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- generic_alloc(mem);
- }
-}
-
-void CUDADevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
- generic_copy_to(mem);
- }
-}
-
-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else if (mem.host_pointer) {
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
-
- if (mem.device_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
- }
- else {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
-}
-
-void CUDADevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
- if (!mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
- * regardless of mem.host_pointer and mem.shared_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
- }
- else if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-}
-
-void CUDADevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- generic_free(mem);
- }
-}
-
-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
-{
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
-}
-
-void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
-{
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- // assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
-}
-
-void CUDADevice::global_alloc(device_memory &mem)
-{
- if (mem.is_resident(this)) {
- generic_alloc(mem);
- generic_copy_to(mem);
- }
-
- const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
-}
-
-void CUDADevice::global_free(device_memory &mem)
-{
- if (mem.is_resident(this) && mem.device_pointer) {
- generic_free(mem);
- }
-}
-
-void CUDADevice::tex_alloc(device_texture &mem)
-{
- CUDAContextScope scope(this);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.info.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (!mem.is_resident(this)) {
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
-
- if (mem.data_depth > 1) {
- array_3d = (CUarray)mem.device_pointer;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- dst_pitch = align_up(src_pitch, pitch_alignment);
- }
- }
- else if (mem.data_depth > 1) {
- /* 3D texture using array, there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- dst_pitch = align_up(src_pitch, pitch_alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Resize once */
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount
- * of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- /* Set Mapping and tag that we need to (re-)upload to device */
- texture_info[slot] = mem.info;
- need_texture_info = true;
-
- if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- /* Kepler+, bindless textures. */
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- texture_info[slot].data = (uint64_t)cmem->texobject;
- }
- else {
- texture_info[slot].data = (uint64_t)mem.device_pointer;
- }
-}
-
-void CUDADevice::tex_free(device_texture &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (!mem.is_resident(this)) {
- /* Do not free memory here, since it was allocated on a different device. */
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- lock.unlock();
- generic_free(mem);
- }
- }
-}
-
-# define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads;
-
-# define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Same as above, but for 1-dimensional blocks. */
-# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
- int yblocks = h;
-
-# define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
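The macros above derive launch dimensions from the kernel's maximum threads per block: the 2D variant uses a square block of sqrt(max_threads) per side, the 1D variant one row of max_threads. A small sketch of the arithmetic, assuming a 1024-thread limit:

// Sketch of the grid-size arithmetic behind the CUDA_GET_BLOCKSIZE macros.
#include <cmath>
#include <cstdio>

static void blocksize_2d(int max_threads_per_block, int w, int h)
{
  int threads = (int)std::sqrt((float)max_threads_per_block);
  int xblocks = (w + threads - 1) / threads; /* round up */
  int yblocks = (h + threads - 1) / threads;
  std::printf("2D: %dx%d threads, %dx%d blocks\n", threads, threads, xblocks, yblocks);
}

static void blocksize_1d(int max_threads_per_block, int w, int h)
{
  int xblocks = (w + max_threads_per_block - 1) / max_threads_per_block;
  int yblocks = h;
  std::printf("1D: %d threads, %dx%d blocks\n", max_threads_per_block, xblocks, yblocks);
}

int main()
{
  blocksize_2d(1024, 1920, 1080); /* 32x32 threads, 60x34 blocks */
  blocksize_1d(1024, 1920 * 1080, 49);
  return 0;
}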
-
-bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(
- cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(
- &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(
- cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(
- &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(
- cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {
- &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void CUDADevice::adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
-
-  /* These are a series of tiny kernels because there is no grid synchronization
-   * from within a kernel, so we issue multiple kernel launches instead. */
- uint total_work_size = wtile->h * wtile->w;
- void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->h;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->w;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
-}
-
-void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
- uint total_work_size = wtile->h * wtile->w;
-
- void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args,
- 0));
-}
-
-void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuRender;
-
- /* Get kernel function. */
- if (rtile.task == RenderTile::BAKE) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
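-  /* Illustrative numbers (hypothetical, not from any specific GPU): with min_blocks = 40
-   * and 128 threads per block, a 256x256 tile gives step_samples = divide_up(5120, 65536) = 1,
-   * so a display device advances one sample per launch, while a non-display device
-   * multiplies min_blocks by 8 and can take several samples per launch on smaller tiles. */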
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample;) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile->num_samples = min(wtile->num_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(
- cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
- uint filter_sample = sample + wtile->num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
- }
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- sample += wtile->num_samples;
- rtile.sample = sample;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- /* Finalize adaptive sampling. */
- if (task.adaptive_sampling.use) {
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- adaptive_sampling_post(rtile, wtile, d_work_tiles);
- cuda_assert(cuCtxSynchronize());
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
- }
-}
-
-void CUDADevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
- CUdeviceptr d_buffer = (CUdeviceptr)buffer;
-
- /* get kernel function */
- if (rgba_half) {
- cuda_assert(
- cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
- float sample_scale = 1.0f / (task.sample + 1);
-
- /* pass in parameters */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1) / xthreads;
- int yblocks = (task.h + ythreads - 1) / ythreads;
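-  /* For illustration (hypothetical limits): with a 1024-thread block limit this gives
-   * 32x32 thread blocks, so a 1920x1080 frame launches 60x34 blocks. */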
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks,
- yblocks,
- 1, /* blocks */
- xthreads,
- ythreads,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
- cuda_assert(cuCtxSynchronize());
-}
-
-void CUDADevice::shader(DeviceTask &task)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
- CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
-
- /* get kernel function */
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
- /* do tasks in smaller chunks, so we can cancel it */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* pass in parameters */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if (task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
-}
-
-CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return (CUdeviceptr)mem;
-}
-
-void CUDADevice::unmap_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
-}
-
-void CUDADevice::pixels_alloc(device_memory &mem)
-{
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if (mem.data_type == TYPE_HALF)
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(
- &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if (result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
- /* failed to register buffer, fallback to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
-}
-
-void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
-{
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar) * 4 * y * w;
- memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-}
-
-void CUDADevice::pixels_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
-}
-
-void CUDADevice::draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- assert(mem.type == MEM_PIXELS);
-
- if (!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
-    /* for multi devices, this assumes the inefficient approach of allocating
-     * all pixels on the device even though we only render to a subset */
- size_t offset = 4 * y * w;
-
- if (mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents -
- * avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w / (float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w / (float)pmem.w;
- vpointer[9] = (float)h / (float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h / (float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- if (task.type == DeviceTask::RENDER) {
- DeviceRequestedFeatures requested_features;
- if (use_split_kernel()) {
- if (split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- render(task, tile, work_tiles);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
-
- cuda_assert(cuCtxSynchronize());
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void CUDADevice::task_add(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in main thread due to opengl access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-}
-
-void CUDADevice::task_wait()
-{
- task_pool.wait();
-}
-
-void CUDADevice::task_cancel()
-{
- task_pool.cancel();
-}
-
-/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
- * now that the definition of that class is complete
- */
-# undef cuda_assert
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- device->set_error( \
- string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
- cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
- CUDADevice *device;
- CUfunction func;
-
- public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if (device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
- threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- return !device->have_error();
- }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- size_t num_threads)
-{
- CUDAContextScope scope(device);
-
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
-
- struct args_t {
- uint *num_threads;
- CUdeviceptr *size;
- };
-
- args_t args = {&threads, &d_size};
-
- CUfunction state_buffer_size;
- cuda_assert(
- cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
- cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory & /*kernel_globals*/,
- device_memory & /*kernel_data*/,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
-{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
- CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
- CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
- CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
- CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
-
- CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr *split_data_buffer;
- int *num_elements;
- CUdeviceptr *ray_state;
- int *start_sample;
- int *end_sample;
- int *sx;
- int *sy;
- int *sw;
- int *sh;
- int *offset;
- int *stride;
- CUdeviceptr *queue_index;
- int *queuesize;
- CUdeviceptr *use_queues_flag;
- CUdeviceptr *work_pool_wgs;
- int *num_samples;
- CUdeviceptr *buffer;
- };
-
- args_t args = {&d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer};
-
- CUfunction data_init;
- cuda_assert(
- cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if (device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
- return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- const CUDAContextScope scope(device);
-
- CUfunction func;
- const CUresult result = cuModuleGetFunction(
- &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
- if (result != CUDA_SUCCESS) {
- device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
- kernel_name.data(),
- cuewErrorString(result)));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
- return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
-{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
-
- cuda_assert(cuMemGetInfo(&free, &total));
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
- << " bytes. (" << string_human_readable_size(free) << ").";
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
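-  /* Illustrative numbers (hypothetical): if half the free memory fits 1,000,000 elements,
-   * side = round_down(sqrt(1000000), 32) = 992 and the global size becomes 992 x 1008. */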
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
new file mode 100644
index 00000000000..37fab8f8293
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -0,0 +1,1370 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include <climits>
+# include <limits.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/cuda/device_impl.h"
+
+# include "render/buffers.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_time.h"
+# include "util/util_types.h"
+# include "util/util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+bool CUDADevice::have_precompiled_kernels()
+{
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+}
+
+bool CUDADevice::show_samples() const
+{
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::set_error(const string &error)
+{
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ first_error = true;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+ pitch_alignment = 0;
+
+ /* Initialize CUDA. */
+ CUresult result = cuInit(0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ /* Setup device and context. */
+ result = cuDeviceGet(&cuDevice, cuDevId);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
+ cuewErrorString(result)));
+ return;
+ }
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ cuda_assert(cuDeviceGetAttribute(
+ &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+}
+
+CUDADevice::~CUDADevice()
+{
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const uint /*kernel_features*/)
+{
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ set_error(string_printf(
+ "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
+ return false;
+ }
+
+ return true;
+}
+
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+ if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+ return false;
+ }
+
+ CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+ int can_access = 0;
+ cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Ensure array access over the link is possible as well (for 3D textures)
+ cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
+ cuDevice,
+ peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Enable peer access in both directions
+ {
+ const CUDAContextScope scope(this);
+ CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+ {
+ const CUDAContextScope scope(peer_device_cuda);
+ CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+ return DebugFlags().cuda.adaptive_compile;
+}
+
+/* Common NVCC flags which stay the same regardless of shading model or
+ * kernel sources md5, and depend only on the compiler or compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (use_adaptive_compilation()) {
+ cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
+
+# ifdef WITH_NANOVDB
+ cflags += " -DWITH_NANOVDB";
+# endif
+
+ return cflags;
+}
+
+string CUDADevice::compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base,
+ bool force_ptx)
+{
+ /* Compute kernel name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ if (!force_ptx) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ }
+
+ /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
+ int ptx_major = major, ptx_minor = minor;
+ while (ptx_major >= 3) {
+ const string ptx = path_get(
+ string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+
+ if (ptx_minor > 0) {
+ ptx_minor--;
+ }
+ else {
+ ptx_major--;
+ ptx_minor = 9;
+ }
+ }
+ }
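+  /* Illustrative walk-through (hypothetical device): for an sm_86 card the PTX search
+   * above probes lib/kernel_compute_86.ptx, then _85 down to _80, then _79 and so on
+   * down to _30, so the driver can JIT-compile the closest older PTX for the new GPU. */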
+
+ /* Try to use locally compiled kernel. */
+ string source_path = path_get("source");
+ const string source_md5 = path_files_md5_hash(source_path);
+
+  /* We include cflags in the md5 so that changing the CUDA toolkit or other
+   * compiler command line arguments makes sure the cubin gets re-built.
+   */
+ string common_cflags = compile_kernel_get_common_cflags(kernel_features);
+ const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+ const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
+ const char *const kernel_arch = force_ptx ? "compute" : "sm";
+ const string cubin_file = string_printf(
+ "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
+
+# ifdef _WIN32
+ if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+ if (major < 3) {
+ set_error(
+ string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
+ "Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ set_error(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return string();
+ }
+# endif
+
+ /* Compile. */
+ const char *const nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ set_error(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return string();
+ }
+
+ const int nvcc_cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+ if (nvcc_cuda_version < 101) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 10.1 or newer.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ return string();
+ }
+ else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
+ nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 to 11.4 are officially supported.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ }
+
+ double starttime = time_dt();
+
+ path_create_directories(cubin);
+
+ source_path = path_join(path_join(source_path, "kernel"),
+ path_join("device", path_join(base, string_printf("%s.cu", name))));
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=%s_%d%d "
+ "--%s \"%s\" "
+ "-o \"%s\" "
+ "%s",
+ nvcc,
+ kernel_arch,
+ major,
+ minor,
+ kernel_ext,
+ source_path.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
+
+ printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
+
+# ifdef _WIN32
+ command = "call " + command;
+# endif
+ if (system(command.c_str()) != 0) {
+ set_error(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return string();
+ }
+
+ /* Verify if compilation succeeded */
+ if (!path_exists(cubin)) {
+ set_error(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return string();
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+}
+
+bool CUDADevice::load_kernels(const uint kernel_features)
+{
+  /* TODO(sergey): Support kernel re-loading for CUDA devices.
+   *
+   * Currently re-loading the kernel will invalidate memory pointers,
+   * causing problems in cuCtxSynchronize.
+   */
+ if (cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(kernel_features))
+ return false;
+
+ /* get kernel */
+ const char *kernel_name = "kernel";
+ string cubin = compile_kernel(kernel_features, kernel_name);
+ if (cubin.empty())
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (result != CUDA_SUCCESS)
+ set_error(string_printf(
+ "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
+
+ if (result == CUDA_SUCCESS) {
+ kernels.load(this);
+ reserve_local_memory(kernel_features);
+ }
+
+ return (result == CUDA_SUCCESS);
+}
+
+void CUDADevice::reserve_local_memory(const uint /* kernel_features */)
+{
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ size_t total = 0, free_before = 0, free_after = 0;
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_before, &total);
+ }
+
+ {
+ /* Use the biggest kernel for estimation. */
+ const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+
+    /* Launch the kernel; using just 1 block appears sufficient to reserve memory for all
+     * multiprocessors. It would still be good to do this in parallel for the multi GPU
+     * case to make it faster. */
+ CUDADeviceQueue queue(this);
+
+ void *d_path_index = nullptr;
+ void *d_render_buffer = nullptr;
+ int d_work_size = 0;
+ void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+
+ queue.init_execution();
+ queue.enqueue(test_kernel, 1, args);
+ queue.synchronize();
+ }
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_after, &total);
+ }
+
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void CUDADevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
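+  /* Worked example (hypothetical sizes): with 16 GB of system RAM, half (8 GB) exceeds
+   * the 4 GB default limit, so map_host_limit = 16 GB - 4 GB = 12 GB; with 6 GB of RAM,
+   * half (3 GB) is below 4 GB, so map_host_limit = 3 GB. */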
+
+  /* Amount of device memory to keep free after texture memory
+   * and working memory allocations, respectively. We set the working
+   * memory headroom lower so that some space is left after all
+   * texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void CUDADevice::load_texture_info()
+{
+ if (need_texture_info) {
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+ need_texture_info = false;
+ texture_info.copy_to_device();
+ }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+ /* Can only move textures allocated on this device (and not those from peer devices).
+ * And need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+ (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+ lock.unlock();
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it happens to do an allocation at the same time as well). */
+ max_mem->device_copy_to();
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ any_device_moving_textures_to_host = false;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+}
+
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
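+  /* Illustrative example (hypothetical sizes): with 2 GB free and the 32 MB working
+   * headroom, a 1 GB render buffer allocates directly in device memory, while a 2.5 GB
+   * buffer first triggers move_textures_to_host() and, if device memory is still short,
+   * falls back to the mapped host memory path below. */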
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+ assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+ (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ set_error("System is out of GPU and shared host memory");
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device. */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
+
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+ * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
+ * mem.host_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+}
+
+void CUDADevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free the mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ cuMemFreeHost(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuda_assert(cuMemFree(mem.device_pointer));
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+}
+
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+}
+
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+ if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void CUDADevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void CUDADevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ // assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+void CUDADevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+void CUDADevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void CUDADevice::tex_alloc(device_texture &mem)
+{
+ CUDAContextScope scope(this);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.info.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (!mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (CUarray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
+ /* 3D texture using array, there is no API for linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
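+ /* Illustrative example (numbers are hypothetical, not from the source): a texture assigned
+ * slot 200 while texture_info holds 150 entries grows the array to 200 + 128 = 328 entries,
+ * leaving headroom for roughly the next 128 slots without another resize. */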
+
+ /* Set mapping and tag that we need to (re-)upload to the device. */
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+
+ if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
+ mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ /* Kepler+, bindless textures. */
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ texture_info[slot].data = (uint64_t)cmem->texobject;
+ }
+ else {
+ texture_info[slot].data = (uint64_t)mem.device_pointer;
+ }
+}
+
+void CUDADevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ lock.unlock();
+ generic_free(mem);
+ }
+ }
+}
+
+# if 0
+void CUDADevice::render(DeviceTask &task,
+ RenderTile &rtile,
+ device_vector<KernelWorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuRender;
+
+ /* Get kernel function. */
+ if (rtile.task == RenderTile::BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ KernelWorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(
+ cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+ /* Render all samples. */
+ uint start_sample = rtile.start_sample;
+ uint end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample;) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = step_samples;
+ if (task.adaptive_sampling.use) {
+ wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+ }
+ wtile->num_samples = min(wtile->num_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(
+ cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ sample += wtile->num_samples;
+ rtile.sample = sample;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
+}
+
+void CUDADevice::thread_run(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ if (task.type == DeviceTask::RENDER) {
+ device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, work_tiles);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, work_tiles);
+ }
+
+ task.release_tile(tile);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+}
+# endif
+
+unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
+{
+ return make_unique<CUDADeviceQueue>(this);
+}
+
+bool CUDADevice::should_use_graphics_interop()
+{
+ /* Check whether this device is part of the OpenGL context.
+ *
+ * Using a CUDA device which is not part of the OpenGL context for graphics interoperability
+ * is possible, but empirical measurements show it can be considerably slower than a naive
+ * pixel copy. */
+
+ CUDAContextScope scope(this);
+
+ int num_all_devices = 0;
+ cuda_assert(cuDeviceGetCount(&num_all_devices));
+
+ if (num_all_devices == 0) {
+ return false;
+ }
+
+ vector<CUdevice> gl_devices(num_all_devices);
+ uint num_gl_devices;
+ cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
+
+ for (CUdevice gl_device : gl_devices) {
+ if (gl_device == cuDevice) {
+ return true;
+ }
+ }
+
+ return false;
+}
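+
+/* Minimal usage sketch (hypothetical caller, not part of this patch): when interop is not
+ * advisable, fall back to a plain host-side pixel copy.
+ *
+ *   if (device->should_use_graphics_interop()) {
+ *     unique_ptr<DeviceGraphicsInterop> interop = queue->graphics_interop_create();
+ *     interop->set_destination(destination);
+ *     device_ptr d_pixels = interop->map();  // Render directly into the OpenGL PBO.
+ *     ...
+ *     interop->unmap();
+ *   }
+ *   else {
+ *     queue->copy_from_device(pixels);  // Naive copy through host memory.
+ *   }
+ */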
+
+int CUDADevice::get_num_multiprocessors()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+}
+
+int CUDADevice::get_max_num_threads_per_multiprocessor()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
+}
+
+bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
+{
+ CUDAContextScope scope(this);
+
+ return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
+}
+
+int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
+{
+ int value = 0;
+ if (!get_device_attribute(attribute, &value)) {
+ return default_value;
+ }
+ return value;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
new file mode 100644
index 00000000000..6b27db54ab4
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/queue.h"
+# include "device/cuda/util.h"
+# include "device/device.h"
+
+# include "util/util_map.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include "util/util_opengl.h"
+# include <cuda.h>
+# include <cudaGL.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class CUDADevice : public Device {
+
+ friend class CUDAContextScope;
+
+ public:
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int pitch_alignment;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+
+ /* If true, mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ CUDADeviceKernels kernels;
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+ virtual ~CUDADevice();
+
+ bool support_device(const uint /*kernel_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+ string compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base = "cuda",
+ bool force_ptx = false);
+
+ virtual bool load_kernels(const uint kernel_features) override;
+
+ void reserve_local_memory(const uint kernel_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
+ protected:
+ bool get_device_attribute(CUdevice_attribute attribute, int *value);
+ int get_device_default_attribute(CUdevice_attribute attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp
new file mode 100644
index 00000000000..e8ca8b90eae
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/graphics_interop.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue)
+ : queue_(queue), device_(static_cast<CUDADevice *>(queue->device))
+{
+}
+
+CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop()
+{
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+}
+
+void CUDADeviceGraphicsInterop::set_destination(
+ const DeviceGraphicsInteropDestination &destination)
+{
+ const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+
+ need_clear_ = destination.need_clear;
+
+ if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+ return;
+ }
+
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+
+ const CUresult result = cuGraphicsGLRegisterBuffer(
+ &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+ if (result != CUDA_SUCCESS) {
+ LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result);
+ }
+
+ opengl_pbo_id_ = destination.opengl_pbo_id;
+ buffer_area_ = new_buffer_area;
+}
+
+device_ptr CUDADeviceGraphicsInterop::map()
+{
+ if (!cu_graphics_resource_) {
+ return 0;
+ }
+
+ CUDAContextScope scope(device_);
+
+ CUdeviceptr cu_buffer;
+ size_t bytes;
+
+ cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream()));
+ cuda_device_assert(
+ device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_));
+
+ if (need_clear_) {
+ cuda_device_assert(
+ device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream()));
+
+ need_clear_ = false;
+ }
+
+ return static_cast<device_ptr>(cu_buffer);
+}
+
+void CUDADeviceGraphicsInterop::unmap()
+{
+ CUDAContextScope scope(device_);
+
+ cuda_device_assert(device_,
+ cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream()));
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h
new file mode 100644
index 00000000000..8a70c8aa71d
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/device_graphics_interop.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class CUDADeviceQueue;
+
+class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+ explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue);
+
+ CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete;
+
+ ~CUDADeviceGraphicsInterop();
+
+ CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete;
+
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+
+ virtual device_ptr map() override;
+ virtual void unmap() override;
+
+ protected:
+ CUDADeviceQueue *queue_ = nullptr;
+ CUDADevice *device_ = nullptr;
+
+ /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+ uint opengl_pbo_id_ = 0;
+ /* Buffer area in pixels of the corresponding PBO. */
+ int64_t buffer_area_ = 0;
+
+ /* The destination was requested to be cleared. */
+ bool need_clear_ = false;
+
+ CUgraphicsResource cu_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
new file mode 100644
index 00000000000..0ed20ddf8e6
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void CUDADeviceKernels::load(CUDADevice *device)
+{
+ CUmodule cuModule = device->cuModule;
+
+ for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+ CUDADeviceKernel &kernel = kernels_[i];
+
+ /* The megakernel is not used on the GPU. */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ continue;
+ }
+
+ const std::string function_name = std::string("kernel_gpu_") +
+ device_kernel_as_string((DeviceKernel)i);
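+ /* For example, DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY presumably resolves to
+ * "kernel_gpu_integrator_queued_paths_array", assuming device_kernel_as_string() returns
+ * the lower-cased enum name without the DEVICE_KERNEL_ prefix. */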
+ cuda_device_assert(device,
+ cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str()));
+
+ if (kernel.function) {
+ cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_device_assert(
+ device,
+ cuOccupancyMaxPotentialBlockSize(
+ &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0));
+ }
+ else {
+ LOG(ERROR) << "Unable to load kernel " << function_name;
+ }
+ }
+
+ loaded = true;
+}
+
+const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel];
+}
+
+bool CUDADeviceKernels::available(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel].function != nullptr;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h
new file mode 100644
index 00000000000..b489547a350
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* CUDA kernel and associated occupancy information. */
+class CUDADeviceKernel {
+ public:
+ CUfunction function = nullptr;
+
+ int num_threads_per_block = 0;
+ int min_blocks = 0;
+};
+
+/* Cache of CUDA kernels for each DeviceKernel. */
+class CUDADeviceKernels {
+ public:
+ void load(CUDADevice *device);
+ const CUDADeviceKernel &get(DeviceKernel kernel) const;
+ bool available(DeviceKernel kernel) const;
+
+ protected:
+ CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM];
+ bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
new file mode 100644
index 00000000000..b7f86c10553
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/queue.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/graphics_interop.h"
+# include "device/cuda/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* CUDADeviceQueue */
+
+CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
+ : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
+}
+
+CUDADeviceQueue::~CUDADeviceQueue()
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuStreamDestroy(cuda_stream_);
+}
+
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
+{
+ int num_states = max(cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+ 1048576);
+
+ const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+ if (factor_str) {
+ num_states = max((int)(num_states * atof(factor_str)), 1024);
+ }
+
+ VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
+}
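+
+/* Worked example (illustrative hardware numbers, not from the source): a GPU with 68
+ * multiprocessors and 1536 threads per multiprocessor yields max(68 * 1536 * 16, 1048576) =
+ * 1671168 states; setting CYCLES_CONCURRENT_STATES_FACTOR=0.5 would roughly halve that,
+ * clamped to at least 1024. */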
+
+int CUDADeviceQueue::num_concurrent_busy_states() const
+{
+ const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor();
+
+ if (max_num_threads == 0) {
+ return 65536;
+ }
+
+ return 4 * max_num_threads;
+}
+
+void CUDADeviceQueue::init_execution()
+{
+ /* Synchronize all textures and memory copies before executing the task. */
+ CUDAContextScope scope(cuda_device_);
+ cuda_device_->load_texture_info();
+ cuda_device_assert(cuda_device_, cuCtxSynchronize());
+
+ debug_init_execution();
+}
+
+bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+ return cuda_device_->kernels.available(kernel);
+}
+
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+ const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
+
+ /* Compute kernel launch parameters. */
+ const int num_threads_per_block = cuda_kernel.num_threads_per_block;
+ const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+ int shared_mem_bytes = 0;
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ /* See parallel_active_index.h for why this amount of shared memory is needed. */
+ shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+ break;
+
+ default:
+ break;
+ }
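+
+ /* Worked example: with a hypothetical num_threads_per_block of 1024, the *_PATHS_ARRAY
+ * kernels above request (1024 + 1) * sizeof(int) = 4100 bytes of shared memory per block. */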
+
+ /* Launch kernel. */
+ cuda_device_assert(cuda_device_,
+ cuLaunchKernel(cuda_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ cuda_stream_,
+ args,
+ 0));
+
+ return !(cuda_device_->have_error());
+}
+
+bool CUDADeviceQueue::synchronize()
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+ debug_synchronize();
+
+ return !(cuda_device_->have_error());
+}
+
+void CUDADeviceQueue::zero_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ /* Zero memory on device. */
+ assert(mem.device_pointer != 0);
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory to device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(
+ (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_from_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory from device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyDtoHAsync(
+ mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+}
+
+unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
+{
+ return make_unique<CUDADeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
new file mode 100644
index 00000000000..62e3aa3d6c2
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+# include "device/device_memory.h"
+# include "device/device_queue.h"
+
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class device_memory;
+
+/* Base class for CUDA queues. */
+class CUDADeviceQueue : public DeviceQueue {
+ public:
+ CUDADeviceQueue(CUDADevice *device);
+ ~CUDADeviceQueue();
+
+ virtual int num_concurrent_states(const size_t state_size) const override;
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ virtual CUstream stream()
+ {
+ return cuda_stream_;
+ }
+
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+ CUDADevice *cuda_device_;
+ CUstream cuda_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp
new file mode 100644
index 00000000000..8f657cc10fe
--- /dev/null
+++ b/intern/cycles/device/cuda/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/util.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+ cuda_device_assert(device, cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_device_assert(device, cuCtxPopCurrent(NULL));
+}
+
+# ifndef WITH_CUDA_DYNLOAD
+const char *cuewErrorString(CUresult result)
+{
+ /* We can only give the error code here without major code duplication. That
+ * should be enough, since dynamic loading is only disabled by folks who know
+ * what they're doing anyway.
+ *
+ * NOTE: Avoid calling this from several threads. */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *cuewCompilerPath()
+{
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+}
+# endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h
new file mode 100644
index 00000000000..a0898094c08
--- /dev/null
+++ b/intern/cycles/device/cuda/util.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* Utility to push/pop the CUDA context. */
+class CUDAContextScope {
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+ private:
+ CUDADevice *device;
+};
+
+/* Utility for checking return values of CUDA function calls. */
+# define cuda_device_assert(cuda_device, stmt) \
+ { \
+ CUresult result = stmt; \
+ if (result != CUDA_SUCCESS) { \
+ const char *name = cuewErrorString(result); \
+ cuda_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define cuda_assert(stmt) cuda_device_assert(this, stmt)
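+
+/* Usage sketch: inside CUDADevice methods the short form is used, for example
+ *
+ *   cuda_assert(cuCtxSynchronize());
+ *
+ * On failure this calls set_error() with a message built from the error string, the statement
+ * text, and the file/line, roughly:
+ *   "<error> in cuCtxSynchronize() (device/cuda/device_impl.cpp:<line>)". */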
+
+# ifndef WITH_CUDA_DYNLOAD
+/* Transparently implement some functions, so the majority of the file does not need to
+ * worry about the difference between dynamically loaded and statically linked CUDA at all. */
+const char *cuewErrorString(CUresult result);
+const char *cuewCompilerPath();
+int cuewCompilerVersion();
+# endif /* WITH_CUDA_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index ed53fbb54ae..6ccedcf54ef 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -20,7 +20,13 @@
#include "bvh/bvh2.h"
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/device.h"
+#include "device/cuda/device.h"
+#include "device/dummy/device.h"
+#include "device/multi/device.h"
+#include "device/optix/device.h"
#include "util/util_foreach.h"
#include "util/util_half.h"
@@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN
bool Device::need_types_update = true;
bool Device::need_devices_update = true;
thread_mutex Device::device_mutex;
-vector<DeviceInfo> Device::opencl_devices;
vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
-vector<DeviceInfo> Device::network_devices;
uint Device::devices_initialized_mask = 0;
-/* Device Requested Features */
-
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features)
-{
- os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl;
- os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
- /* TODO(sergey): Decode bitflag into list of names. */
- os << "Nodes features: " << requested_features.nodes_features << std::endl;
- os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl;
- os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion)
- << std::endl;
- os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion)
- << std::endl;
- os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl;
- os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl;
- os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl;
- os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched)
- << std::endl;
- os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation)
- << std::endl;
- os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent)
- << std::endl;
- os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled)
- << std::endl;
- os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl;
- os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement)
- << std::endl;
- os << "Use Background Light: " << string_from_bool(requested_features.use_background_light)
- << std::endl;
- return os;
-}
-
/* Device */
Device::~Device() noexcept(false)
{
- if (!background) {
- if (vertex_buffer != 0) {
- glDeleteBuffers(1, &vertex_buffer);
- }
- if (fallback_shader_program != 0) {
- glDeleteProgram(fallback_shader_program);
- }
- }
-}
-
-/* TODO move shaders to standalone .glsl file. */
-const char *FALLBACK_VERTEX_SHADER =
- "#version 330\n"
- "uniform vec2 fullscreen;\n"
- "in vec2 texCoord;\n"
- "in vec2 pos;\n"
- "out vec2 texCoord_interp;\n"
- "\n"
- "vec2 normalize_coordinates()\n"
- "{\n"
- " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
- "}\n"
- "\n"
- "void main()\n"
- "{\n"
- " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
- " texCoord_interp = texCoord;\n"
- "}\n\0";
-
-const char *FALLBACK_FRAGMENT_SHADER =
- "#version 330\n"
- "uniform sampler2D image_texture;\n"
- "in vec2 texCoord_interp;\n"
- "out vec4 fragColor;\n"
- "\n"
- "void main()\n"
- "{\n"
- " fragColor = texture(image_texture, texCoord_interp);\n"
- "}\n\0";
-
-static void shader_print_errors(const char *task, const char *log, const char *code)
-{
- LOG(ERROR) << "Shader: " << task << " error:";
- LOG(ERROR) << "===== shader string ====";
-
- stringstream stream(code);
- string partial;
-
- int line = 1;
- while (getline(stream, partial, '\n')) {
- if (line < 10) {
- LOG(ERROR) << " " << line << " " << partial;
- }
- else {
- LOG(ERROR) << line << " " << partial;
- }
- line++;
- }
- LOG(ERROR) << log;
-}
-
-static int bind_fallback_shader(void)
-{
- GLint status;
- GLchar log[5000];
- GLsizei length = 0;
- GLuint program = 0;
-
- struct Shader {
- const char *source;
- GLenum type;
- } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
- {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
-
- program = glCreateProgram();
-
- for (int i = 0; i < 2; i++) {
- GLuint shader = glCreateShader(shaders[i].type);
-
- string source_str = shaders[i].source;
- const char *c_str = source_str.c_str();
-
- glShaderSource(shader, 1, &c_str, NULL);
- glCompileShader(shader);
-
- glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
-
- if (!status) {
- glGetShaderInfoLog(shader, sizeof(log), &length, log);
- shader_print_errors("compile", log, c_str);
- return 0;
- }
-
- glAttachShader(program, shader);
- }
-
- /* Link output. */
- glBindFragDataLocation(program, 0, "fragColor");
-
- /* Link and error check. */
- glLinkProgram(program);
-
- glGetProgramiv(program, GL_LINK_STATUS, &status);
- if (!status) {
- glGetShaderInfoLog(program, sizeof(log), &length, log);
- shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
- shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
- return 0;
- }
-
- return program;
-}
-
-bool Device::bind_fallback_display_space_shader(const float width, const float height)
-{
- if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) {
- return false;
- }
-
- if (fallback_status == FALLBACK_SHADER_STATUS_NONE) {
- fallback_shader_program = bind_fallback_shader();
- fallback_status = FALLBACK_SHADER_STATUS_ERROR;
-
- if (fallback_shader_program == 0) {
- return false;
- }
-
- glUseProgram(fallback_shader_program);
- image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
- if (image_texture_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
- return false;
- }
-
- fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
- if (fullscreen_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
- return false;
- }
-
- fallback_status = FALLBACK_SHADER_STATUS_SUCCESS;
- }
-
- /* Run this every time. */
- glUseProgram(fallback_shader_program);
- glUniform1i(image_texture_location, 0);
- glUniform2f(fullscreen_location, width, height);
- return true;
-}
-
-void Device::draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-
- assert(rgba.type == MEM_PIXELS);
- mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
-
- GLuint texid;
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &texid);
- glBindTexture(GL_TEXTURE_2D, texid);
-
- if (rgba.data_type == TYPE_HALF) {
- GLhalf *data_pointer = (GLhalf *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
- }
- else {
- uint8_t *data_pointer = (uint8_t *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer);
- }
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered
- */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = 1.0f;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = 1.0f;
- vpointer[9] = 1.0f;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = 1.0f;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- if (vertex_buffer) {
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (vertex_buffer) {
- glBindBuffer(GL_ARRAY_BUFFER, 0);
- }
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- glDeleteVertexArrays(1, &vertex_array_object);
- glBindTexture(GL_TEXTURE_2D, 0);
- glDeleteTextures(1, &texid);
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
}
void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
@@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
}
-Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_MULTI
if (!info.multi_devices.empty()) {
/* Always create a multi device when info contains multiple devices.
* This is done so that the type can still be e.g. DEVICE_CPU to indicate
* that it is a homogeneous collection of devices, which simplifies checks. */
- return device_multi_create(info, stats, profiler, background);
+ return device_multi_create(info, stats, profiler);
}
#endif
@@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
switch (info.type) {
case DEVICE_CPU:
- device = device_cpu_create(info, stats, profiler, background);
+ device = device_cpu_create(info, stats, profiler);
break;
#ifdef WITH_CUDA
case DEVICE_CUDA:
if (device_cuda_init())
- device = device_cuda_create(info, stats, profiler, background);
+ device = device_cuda_create(info, stats, profiler);
break;
#endif
#ifdef WITH_OPTIX
case DEVICE_OPTIX:
if (device_optix_init())
- device = device_optix_create(info, stats, profiler, background);
- break;
-#endif
-#ifdef WITH_NETWORK
- case DEVICE_NETWORK:
- device = device_network_create(info, stats, profiler, "127.0.0.1");
- break;
-#endif
-#ifdef WITH_OPENCL
- case DEVICE_OPENCL:
- if (device_opencl_init())
- device = device_opencl_create(info, stats, profiler, background);
+ device = device_optix_create(info, stats, profiler);
break;
#endif
default:
@@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
}
if (device == NULL) {
- device = device_dummy_create(info, stats, profiler, background);
+ device = device_dummy_create(info, stats, profiler);
}
return device;
@@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_CUDA;
else if (strcmp(name, "OPTIX") == 0)
return DEVICE_OPTIX;
- else if (strcmp(name, "OPENCL") == 0)
- return DEVICE_OPENCL;
- else if (strcmp(name, "NETWORK") == 0)
- return DEVICE_NETWORK;
else if (strcmp(name, "MULTI") == 0)
return DEVICE_MULTI;
@@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type)
return "CUDA";
else if (type == DEVICE_OPTIX)
return "OPTIX";
- else if (type == DEVICE_OPENCL)
- return "OPENCL";
- else if (type == DEVICE_NETWORK)
- return "NETWORK";
else if (type == DEVICE_MULTI)
return "MULTI";
@@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_OPTIX
types.push_back(DEVICE_OPTIX);
#endif
-#ifdef WITH_OPENCL
- types.push_back(DEVICE_OPENCL);
-#endif
-#ifdef WITH_NETWORK
- types.push_back(DEVICE_NETWORK);
-#endif
return types;
}
@@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
thread_scoped_lock lock(device_mutex);
vector<DeviceInfo> devices;
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) {
- if (device_opencl_init()) {
- device_opencl_info(opencl_devices);
- }
- devices_initialized_mask |= DEVICE_MASK_OPENCL;
- }
- foreach (DeviceInfo &info, opencl_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
#if defined(WITH_CUDA) || defined(WITH_OPTIX)
if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) {
if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
@@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
}
-#ifdef WITH_NETWORK
- if (mask & DEVICE_MASK_NETWORK) {
- if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) {
- device_network_info(network_devices);
- devices_initialized_mask |= DEVICE_MASK_NETWORK;
- }
- foreach (DeviceInfo &info, network_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
return devices;
}
@@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask)
capabilities += device_cpu_capabilities() + "\n";
}
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (device_opencl_init()) {
- capabilities += "\nOpenCL device capabilities:\n";
- capabilities += device_opencl_capabilities();
- }
- }
-#endif
-
#ifdef WITH_CUDA
if (mask & DEVICE_MASK_CUDA) {
if (device_cuda_init()) {
@@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}
DeviceInfo info;
- info.type = subdevices.front().type;
+ info.type = DEVICE_NONE;
info.id = "MULTI";
info.description = "Multi Device";
info.num = 0;
info.has_half_images = true;
info.has_nanovdb = true;
- info.has_volume_decoupled = true;
- info.has_branched_path = true;
- info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
info.has_peer_memory = false;
@@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.id += device.id;
/* Set device type to MULTI if subdevices are not of a common type. */
- if (device.type != info.type) {
+ if (info.type == DEVICE_NONE) {
+ info.type = device.type;
+ }
+ else if (device.type != info.type) {
info.type = DEVICE_MULTI;
}
/* Accumulate device info. */
info.has_half_images &= device.has_half_images;
info.has_nanovdb &= device.has_nanovdb;
- info.has_volume_decoupled &= device.has_volume_decoupled;
- info.has_branched_path &= device.has_branched_path;
- info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
@@ -689,60 +315,32 @@ void Device::free_memory()
devices_initialized_mask = 0;
cuda_devices.free_memory();
optix_devices.free_memory();
- opencl_devices.free_memory();
cpu_devices.free_memory();
- network_devices.free_memory();
}
-/* DeviceInfo */
-
-void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+unique_ptr<DeviceQueue> Device::gpu_queue_create()
{
- assert(denoising_devices.empty());
-
- if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
- vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
- if (!optix_devices.empty()) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
-
- /* Try to use the same physical devices for denoising. */
- for (const DeviceInfo &cuda_device : multi_devices) {
- if (cuda_device.type == DEVICE_CUDA) {
- for (const DeviceInfo &optix_device : optix_devices) {
- if (cuda_device.num == optix_device.num) {
- id += optix_device.id;
- denoising_devices.push_back(optix_device);
- break;
- }
- }
- }
- }
-
- if (denoising_devices.empty()) {
- /* Simply use the first available OptiX device. */
- const DeviceInfo optix_device = optix_devices.front();
- id += optix_device.id; /* Uniquely identify this special multi device. */
- denoising_devices.push_back(optix_device);
- }
+ LOG(FATAL) << "Device does not support queues.";
+ return nullptr;
+}
- denoisers = denoiser_type;
- }
- }
- else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
+const CPUKernels *Device::get_cpu_kernels() const
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+ return nullptr;
+}
- /* Add CPU denoising devices. */
- DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
- denoising_devices.push_back(cpu_device);
+void Device::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/)
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+}
- denoisers = denoiser_type;
- }
+void *Device::get_cpu_osl_memory()
+{
+ return nullptr;
}
+/* DeviceInfo */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ecf79bcdfa6..02b6edb56d0 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -21,31 +21,34 @@
#include "bvh/bvh_params.h"
+#include "device/device_denoise.h"
#include "device/device_memory.h"
-#include "device/device_task.h"
+#include "util/util_function.h"
#include "util/util_list.h"
+#include "util/util_logging.h"
#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_texture.h"
#include "util/util_thread.h"
#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class BVH;
+class DeviceQueue;
class Progress;
-class RenderTile;
+class CPUKernels;
+class CPUKernelThreadGlobals;
/* Device Types */
enum DeviceType {
DEVICE_NONE = 0,
DEVICE_CPU,
- DEVICE_OPENCL,
DEVICE_CUDA,
- DEVICE_NETWORK,
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_DUMMY,
@@ -53,20 +56,11 @@ enum DeviceType {
enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
- DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
- DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
DEVICE_MASK_ALL = ~0
};
-enum DeviceKernelStatus {
- DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
- DEVICE_KERNEL_USING_FEATURE_KERNEL,
- DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
- DEVICE_KERNEL_UNKNOWN,
-};
-
#define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
class DeviceInfo {
@@ -75,20 +69,16 @@ class DeviceInfo {
string description;
string id; /* used for user preferences, should stay fixed with changing hardware config */
int num;
- bool display_device; /* GPU is used as a display device. */
- bool has_half_images; /* Support half-float textures. */
- bool has_nanovdb; /* Support NanoVDB volumes. */
- bool has_volume_decoupled; /* Decoupled volume shading. */
- bool has_branched_path; /* Supports branched path tracing. */
- bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */
- bool has_osl; /* Support Open Shading Language. */
- bool use_split_kernel; /* Use split or mega kernel. */
- bool has_profiling; /* Supports runtime collection of profiling info. */
- bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
- DenoiserTypeMask denoisers; /* Supported denoiser types. */
+ bool display_device; /* GPU is used as a display device. */
+ bool has_nanovdb; /* Support NanoVDB volumes. */
+ bool has_half_images; /* Support half-float textures. */
+ bool has_osl; /* Support Open Shading Language. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+ bool has_gpu_queue; /* Device supports GPU queue. */
+ DenoiserTypeMask denoisers; /* Supported denoiser types. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
- vector<DeviceInfo> denoising_devices;
string error_msg;
DeviceInfo()
@@ -100,227 +90,35 @@ class DeviceInfo {
display_device = false;
has_half_images = false;
has_nanovdb = false;
- has_volume_decoupled = false;
- has_branched_path = true;
- has_adaptive_stop_per_sample = false;
has_osl = false;
- use_split_kernel = false;
has_profiling = false;
has_peer_memory = false;
+ has_gpu_queue = false;
denoisers = DENOISER_NONE;
}
- bool operator==(const DeviceInfo &info)
+ bool operator==(const DeviceInfo &info) const
{
/* Multiple Devices with the same ID would be very bad. */
assert(id != info.id ||
(type == info.type && num == info.num && description == info.description));
return id == info.id;
}
-
- /* Add additional devices needed for the specified denoiser. */
- void add_denoising_devices(DenoiserType denoiser_type);
-};
-
-class DeviceRequestedFeatures {
- public:
- /* Use experimental feature set. */
- bool experimental;
-
- /* Selective nodes compilation. */
-
- /* Identifier of a node group up to which all the nodes needs to be
- * compiled in. Nodes from higher group indices will be ignores.
- */
- int max_nodes_group;
-
- /* Features bitfield indicating which features from the requested group
- * will be compiled in. Nodes which corresponds to features which are not
- * in this bitfield will be ignored even if they're in the requested group.
- */
- int nodes_features;
-
- /* BVH/sampling kernel features. */
- bool use_hair;
- bool use_hair_thick;
- bool use_object_motion;
- bool use_camera_motion;
-
- /* Denotes whether baking functionality is needed. */
- bool use_baking;
-
- /* Use subsurface scattering materials. */
- bool use_subsurface;
-
- /* Use volume materials. */
- bool use_volume;
-
- /* Use branched integrator. */
- bool use_integrator_branched;
-
- /* Use OpenSubdiv patch evaluation */
- bool use_patch_evaluation;
-
- /* Use Transparent shadows */
- bool use_transparent;
-
- /* Use various shadow tricks, such as shadow catcher. */
- bool use_shadow_tricks;
-
- /* Per-uber shader usage flags. */
- bool use_principled;
-
- /* Denoising features. */
- bool use_denoising;
-
- /* Use raytracing in shaders. */
- bool use_shader_raytrace;
-
- /* Use true displacement */
- bool use_true_displacement;
-
- /* Use background lights */
- bool use_background_light;
-
- DeviceRequestedFeatures()
- {
- /* TODO(sergey): Find more meaningful defaults. */
- max_nodes_group = 0;
- nodes_features = 0;
- use_hair = false;
- use_hair_thick = false;
- use_object_motion = false;
- use_camera_motion = false;
- use_baking = false;
- use_subsurface = false;
- use_volume = false;
- use_integrator_branched = false;
- use_patch_evaluation = false;
- use_transparent = false;
- use_shadow_tricks = false;
- use_principled = false;
- use_denoising = false;
- use_shader_raytrace = false;
- use_true_displacement = false;
- use_background_light = false;
- }
-
- bool modified(const DeviceRequestedFeatures &requested_features)
- {
- return !(max_nodes_group == requested_features.max_nodes_group &&
- nodes_features == requested_features.nodes_features &&
- use_hair == requested_features.use_hair &&
- use_hair_thick == requested_features.use_hair_thick &&
- use_object_motion == requested_features.use_object_motion &&
- use_camera_motion == requested_features.use_camera_motion &&
- use_baking == requested_features.use_baking &&
- use_subsurface == requested_features.use_subsurface &&
- use_volume == requested_features.use_volume &&
- use_integrator_branched == requested_features.use_integrator_branched &&
- use_patch_evaluation == requested_features.use_patch_evaluation &&
- use_transparent == requested_features.use_transparent &&
- use_shadow_tricks == requested_features.use_shadow_tricks &&
- use_principled == requested_features.use_principled &&
- use_denoising == requested_features.use_denoising &&
- use_shader_raytrace == requested_features.use_shader_raytrace &&
- use_true_displacement == requested_features.use_true_displacement &&
- use_background_light == requested_features.use_background_light);
- }
-
- /* Convert the requested features structure to a build options,
- * which could then be passed to compilers.
- */
- string get_build_options() const
- {
- string build_options = "";
- if (experimental) {
- build_options += "-D__KERNEL_EXPERIMENTAL__ ";
- }
- build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group);
- build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features);
- if (!use_hair) {
- build_options += " -D__NO_HAIR__";
- }
- if (!use_object_motion) {
- build_options += " -D__NO_OBJECT_MOTION__";
- }
- if (!use_camera_motion) {
- build_options += " -D__NO_CAMERA_MOTION__";
- }
- if (!use_baking) {
- build_options += " -D__NO_BAKING__";
- }
- if (!use_volume) {
- build_options += " -D__NO_VOLUME__";
- }
- if (!use_subsurface) {
- build_options += " -D__NO_SUBSURFACE__";
- }
- if (!use_integrator_branched) {
- build_options += " -D__NO_BRANCHED_PATH__";
- }
- if (!use_patch_evaluation) {
- build_options += " -D__NO_PATCH_EVAL__";
- }
- if (!use_transparent && !use_volume) {
- build_options += " -D__NO_TRANSPARENT__";
- }
- if (!use_shadow_tricks) {
- build_options += " -D__NO_SHADOW_TRICKS__";
- }
- if (!use_principled) {
- build_options += " -D__NO_PRINCIPLED__";
- }
- if (!use_denoising) {
- build_options += " -D__NO_DENOISING__";
- }
- if (!use_shader_raytrace) {
- build_options += " -D__NO_SHADER_RAYTRACE__";
- }
- return build_options;
- }
};
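For illustration, a minimal sketch of filling in the slimmed-down DeviceInfo; the helper function is hypothetical, while the field names come from the struct above:

    /* Hypothetical helper: describe a CPU device with the capability flags that remain. */
    static DeviceInfo make_cpu_info_example()
    {
      DeviceInfo info;
      info.type = DEVICE_CPU;
      info.description = "Example CPU";
      info.id = "CPU";
      info.has_osl = true;
      info.has_half_images = true;
      info.has_nanovdb = true;
      info.has_profiling = true;
      return info;
    }
    /* Note that operator==() compares by id only, so two infos describing the same
     * device (same id) compare equal regardless of the other fields. */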
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features);
-
/* Device */
-struct DeviceDrawParams {
- function<void()> bind_display_space_shader_cb;
- function<void()> unbind_display_space_shader_cb;
-};
-
class Device {
friend class device_sub_ptr;
protected:
- enum {
- FALLBACK_SHADER_STATUS_NONE = 0,
- FALLBACK_SHADER_STATUS_ERROR,
- FALLBACK_SHADER_STATUS_SUCCESS,
- };
-
- Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background)
- : background(background),
- vertex_buffer(0),
- fallback_status(FALLBACK_SHADER_STATUS_NONE),
- fallback_shader_program(0),
- info(info_),
- stats(stats_),
- profiler(profiler_)
+ Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : info(info_), stats(stats_), profiler(profiler_)
{
}
- bool background;
string error_msg;
- /* used for real time display */
- unsigned int vertex_buffer;
- int fallback_status, fallback_shader_program;
- int image_texture_location, fullscreen_location;
-
- bool bind_fallback_display_space_shader(const float width, const float height);
-
virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
{
/* Only required for devices that implement denoising. */
@@ -361,67 +159,31 @@ class Device {
Stats &stats;
Profiler &profiler;
- /* memory alignment */
- virtual int mem_sub_ptr_alignment()
- {
- return MIN_ALIGNMENT_CPU_DATA_TYPES;
- }
-
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
- /* open shading language, only for CPU device */
- virtual void *osl_memory()
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
- virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
+ virtual bool load_kernels(uint /*kernel_features*/)
{
return true;
}
- /* Wait for device to become available to upload data and receive tasks
- * This method is used by the OpenCL device to load the
- * optimized kernels or when not (yet) available load the
- * generic kernels (only during foreground rendering) */
- virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/)
- {
- return true;
- }
- /* Check if there are 'better' kernels available to be used
- * We can switch over to these kernels
- * This method is used to determine if we can switch the preview kernels
- * to regular kernels */
- virtual DeviceKernelStatus get_active_kernel_switch_state()
- {
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
- }
+ /* GPU device only functions.
+ * These may not be used on CPU or multi-devices. */
- /* tasks */
- virtual int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
+ /* Create new queue for executing kernels in. */
+ virtual unique_ptr<DeviceQueue> gpu_queue_create();
+
+ /* CPU device only functions.
+ * These may not be used on GPU or multi-devices. */
- virtual void task_add(DeviceTask &task) = 0;
- virtual void task_wait() = 0;
- virtual void task_cancel() = 0;
-
- /* opengl drawing */
- virtual void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params);
+ /* Get CPU kernel functions for native instruction set. */
+ virtual const CPUKernels *get_cpu_kernels() const;
+ /* Get kernel globals to pass to kernels. */
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
+ /* Get OpenShadingLanguage memory buffer. */
+ virtual void *get_cpu_osl_memory();
/* acceleration structure building */
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
@@ -429,25 +191,11 @@ class Device {
/* OptiX specific destructor. */
virtual void release_optix_bvh(BVH * /*bvh*/){};
-#ifdef WITH_NETWORK
- /* networking */
- void server_run();
-#endif
-
/* multi device */
- virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/)
- {
- }
virtual int device_number(Device * /*sub_device*/)
{
return 0;
}
- virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
- virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
@@ -460,11 +208,47 @@ class Device {
return false;
}
+ /* Graphics resources interoperability.
+ *
+ * Interoperability here means that the device is capable of computing the result
+ * directly into an OpenGL (or other graphics library) buffer. */
+
+ * Check whether the display is to be updated using graphics interoperability.
+ * The interoperability cannot be used if it is not supported by the device. But the device
+ * might also force-disable the interoperability if it detects that it will be slower than
+ * copying pixels from the render buffer. */
+ virtual bool should_use_graphics_interop()
+ {
+ return false;
+ }
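A hypothetical caller could branch on this query to decide how to feed the display; only should_use_graphics_interop() comes from the class above, the helper names are assumptions:

    /* Sketch: prefer zero-copy graphics interop, otherwise copy pixels from the render buffer. */
    void update_display_example(Device *device)
    {
      if (device->should_use_graphics_interop()) {
        /* e.g. let the device write directly into a mapped OpenGL buffer (assumed helper). */
        // update_via_graphics_interop(device);
      }
      else {
        /* e.g. read pixels back and upload them through the graphics API (assumed helper). */
        // update_via_pixel_copy(device);
      }
    }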
+
+ /* Buffer denoising. */
+
+ /* Returns true if task is fully handled. */
+ virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/)
+ {
+ LOG(ERROR) << "Request buffer denoising from a device which does not support it.";
+ return false;
+ }
+
+ virtual DeviceQueue *get_denoise_queue()
+ {
+ LOG(ERROR) << "Request denoising queue from a device which does not support it.";
+ return nullptr;
+ }
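As a usage sketch, a caller can treat the base-class return value as "not handled" and fall back; everything here except denoise_buffer() itself is hypothetical:

    /* Sketch: try device-side denoising, otherwise fall back to a host path. */
    bool denoise_or_fallback_example(Device *device, const DeviceDenoiseTask &task)
    {
      if (device->denoise_buffer(task)) {
        return true; /* Fully handled by the device. */
      }
      /* The base implementation logs an error and returns false; a host-side
       * fallback (assumed to exist elsewhere) would run here. */
      return false;
    }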
+
+ /* Sub-devices */
+
+ * Run the given callback for every individual device which will be handling rendering.
+ * For a single device the callback is called for the device itself. For a multi-device the
+ * callback is only called for the sub-devices. */
+ virtual void foreach_device(const function<void(Device *)> &callback)
+ {
+ callback(this);
+ }
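A small sketch of the iteration contract; only foreach_device() and load_kernels() come from the class, the wrapper is hypothetical:

    /* Sketch: load kernels on every device that will participate in rendering.
     * For a multi-device this visits each sub-device, for a single device itself. */
    bool load_all_kernels_example(Device *device, uint kernel_features)
    {
      bool ok = true;
      device->foreach_device([&](Device *sub_device) {
        ok = ok && sub_device->load_kernels(kernel_features);
      });
      return ok;
    }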
+
/* static */
- static Device *create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background = true);
+ static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
static DeviceType type_from_string(const char *name);
static string string_from_type(DeviceType type);
@@ -499,9 +283,7 @@ class Device {
static thread_mutex device_mutex;
static vector<DeviceInfo> cuda_devices;
static vector<DeviceInfo> optix_devices;
- static vector<DeviceInfo> opencl_devices;
static vector<DeviceInfo> cpu_devices;
- static vector<DeviceInfo> network_devices;
static uint devices_initialized_mask;
};
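Putting the changed factory signature together, a hedged end-to-end sketch: the function is hypothetical, Device::create() and load_kernels() are declared above, and passing 0 for the feature bitfield is an assumption.

    /* Sketch: create a device with the new three-argument factory (no background flag). */
    void create_device_example(const DeviceInfo &info, Stats &stats, Profiler &profiler)
    {
      Device *device = Device::create(info, stats, profiler);
      if (device && device->load_kernels(0 /* assumed: no optional kernel features */)) {
        /* ... build scene, render ... */
      }
      delete device;
    }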
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
deleted file mode 100644
index 4a6e77d6eaa..00000000000
--- a/intern/cycles/device/device_cpu.cpp
+++ /dev/null
@@ -1,1680 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-/* So ImathMath is included before our kernel_cpu_compat. */
-#ifdef WITH_OSL
-/* So no context pollution happens from indirectly included windows.h */
-# include "util/util_windows.h"
-# include <OSL/oslexec.h>
-#endif
-
-#ifdef WITH_EMBREE
-# include <embree3/rtcore.h>
-#endif
-
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
-
-// clang-format off
-#include "kernel/kernel.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-#include "kernel/filter/filter.h"
-
-#include "kernel/osl/osl_shader.h"
-#include "kernel/osl/osl_globals.h"
-// clang-format on
-
-#include "bvh/bvh_embree.h"
-
-#include "render/buffers.h"
-#include "render/coverage.h"
-
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_function.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_opengl.h"
-#include "util/util_openimagedenoise.h"
-#include "util/util_optimization.h"
-#include "util/util_progress.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_thread.h"
-
-CCL_NAMESPACE_BEGIN
-
-class CPUDevice;
-
-/* Has to be outside of the class to be shared across template instantiations. */
-static const char *logged_architecture = "";
-
-template<typename F> class KernelFunctions {
- public:
- KernelFunctions()
- {
- kernel = (F)NULL;
- }
-
- KernelFunctions(
- F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
- {
- const char *architecture_name = "default";
- kernel = kernel_default;
-
- /* Silence potential warnings about unused variables
- * when compiling without some architectures. */
- (void)kernel_sse2;
- (void)kernel_sse3;
- (void)kernel_sse41;
- (void)kernel_avx;
- (void)kernel_avx2;
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- architecture_name = "AVX2";
- kernel = kernel_avx2;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
- architecture_name = "AVX";
- kernel = kernel_avx;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
- architecture_name = "SSE4.1";
- kernel = kernel_sse41;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
- architecture_name = "SSE3";
- kernel = kernel_sse3;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- architecture_name = "SSE2";
- kernel = kernel_sse2;
- }
-#else
- {
- /* Dummy to prevent the architecture if below become
- * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- * is not defined. */
- }
-#endif
-
- if (strcmp(architecture_name, logged_architecture) != 0) {
- VLOG(1) << "Will be using " << architecture_name << " kernels.";
- logged_architecture = architecture_name;
- }
- }
-
- inline F operator()() const
- {
- assert(kernel);
- return kernel;
- }
-
- protected:
- F kernel;
-};
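The template just removed implemented run-time ISA dispatch: the constructor picked the widest variant the CPU supports once, and operator() returned that function pointer. The same pattern in a standalone, purely illustrative form (the names are not Cycles API):

    /* Illustrative runtime dispatch between compiled variants of one kernel. */
    using ExampleKernelFn = void (*)(float *buffer, int sample);

    ExampleKernelFn pick_kernel_example(ExampleKernelFn fn_default,
                                        ExampleKernelFn fn_avx2,
                                        bool cpu_has_avx2)
    {
      /* Prefer the widest variant available, otherwise fall back to the generic build. */
      return (cpu_has_avx2 && fn_avx2 != nullptr) ? fn_avx2 : fn_default;
    }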
-
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-
- public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-};
-
-class CPUDevice : public Device {
- public:
- TaskPool task_pool;
- KernelGlobals kernel_globals;
-
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
-#ifdef WITH_OSL
- OSLGlobals osl_globals;
-#endif
-#ifdef WITH_OPENIMAGEDENOISE
- oidn::DeviceRef oidn_device;
- oidn::FilterRef oidn_filter;
-#endif
- thread_spin_lock oidn_task_lock;
-#ifdef WITH_EMBREE
- RTCScene embree_scene = NULL;
- RTCDevice embree_device;
-#endif
-
- bool use_split_kernel;
-
- DeviceRequestedFeatures requested_features;
-
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_half_float_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_byte_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
- shader_kernel;
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel;
-
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
- filter_divide_shadow_kernel;
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
- filter_get_feature_kernel;
- KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
- filter_write_feature_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_detect_outliers_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_combine_halves_kernel;
-
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
- filter_nlm_calc_difference_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
- filter_nlm_update_output_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;
-
- KernelFunctions<void (*)(
- float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
- filter_construct_transform_kernel;
- KernelFunctions<void (*)(int,
- int,
- int,
- float *,
- float *,
- float *,
- int *,
- float *,
- float3 *,
- int *,
- int *,
- int,
- int,
- int,
- int,
- bool)>
- filter_nlm_construct_gramian_kernel;
- KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
- filter_finalize_kernel;
-
- KernelFunctions<void (*)(KernelGlobals *,
- ccl_constant KernelData *,
- ccl_global void *,
- int,
- ccl_global char *,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- ccl_global int *,
- int,
- ccl_global char *,
- ccl_global unsigned int *,
- unsigned int,
- ccl_global float *)>
- data_init_kernel;
- unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;
-
-#define KERNEL_FUNCTIONS(name) \
- KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
- KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
- KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
-
- CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_),
- texture_info(this, "__texture_info", MEM_GLOBAL),
-#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
- REGISTER_KERNEL(path_trace),
- REGISTER_KERNEL(convert_to_half_float),
- REGISTER_KERNEL(convert_to_byte),
- REGISTER_KERNEL(shader),
- REGISTER_KERNEL(bake),
- REGISTER_KERNEL(filter_divide_shadow),
- REGISTER_KERNEL(filter_get_feature),
- REGISTER_KERNEL(filter_write_feature),
- REGISTER_KERNEL(filter_detect_outliers),
- REGISTER_KERNEL(filter_combine_halves),
- REGISTER_KERNEL(filter_nlm_calc_difference),
- REGISTER_KERNEL(filter_nlm_blur),
- REGISTER_KERNEL(filter_nlm_calc_weight),
- REGISTER_KERNEL(filter_nlm_update_output),
- REGISTER_KERNEL(filter_nlm_normalize),
- REGISTER_KERNEL(filter_construct_transform),
- REGISTER_KERNEL(filter_nlm_construct_gramian),
- REGISTER_KERNEL(filter_finalize),
- REGISTER_KERNEL(data_init)
-#undef REGISTER_KERNEL
- {
- if (info.cpu_threads == 0) {
- info.cpu_threads = TaskScheduler::num_threads();
- }
-
-#ifdef WITH_OSL
- kernel_globals.osl = &osl_globals;
-#endif
-#ifdef WITH_EMBREE
- embree_device = rtcNewDevice("verbose=0");
-#endif
- use_split_kernel = DebugFlags().cpu.split_kernel;
- if (use_split_kernel) {
- VLOG(1) << "Will be using split kernel.";
- }
- need_texture_info = false;
-
-#define REGISTER_SPLIT_KERNEL(name) \
- split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
- KERNEL_FUNCTIONS(name))
- REGISTER_SPLIT_KERNEL(path_init);
- REGISTER_SPLIT_KERNEL(scene_intersect);
- REGISTER_SPLIT_KERNEL(lamp_emission);
- REGISTER_SPLIT_KERNEL(do_volume);
- REGISTER_SPLIT_KERNEL(queue_enqueue);
- REGISTER_SPLIT_KERNEL(indirect_background);
- REGISTER_SPLIT_KERNEL(shader_setup);
- REGISTER_SPLIT_KERNEL(shader_sort);
- REGISTER_SPLIT_KERNEL(shader_eval);
- REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
- REGISTER_SPLIT_KERNEL(subsurface_scatter);
- REGISTER_SPLIT_KERNEL(direct_lighting);
- REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
- REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
- REGISTER_SPLIT_KERNEL(enqueue_inactive);
- REGISTER_SPLIT_KERNEL(next_iteration_setup);
- REGISTER_SPLIT_KERNEL(indirect_subsurface);
- REGISTER_SPLIT_KERNEL(buffer_update);
- REGISTER_SPLIT_KERNEL(adaptive_stopping);
- REGISTER_SPLIT_KERNEL(adaptive_filter_x);
- REGISTER_SPLIT_KERNEL(adaptive_filter_y);
- REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
-#undef REGISTER_SPLIT_KERNEL
-#undef KERNEL_FUNCTIONS
- }
-
- ~CPUDevice()
- {
-#ifdef WITH_EMBREE
- rtcReleaseDevice(embree_device);
-#endif
- task_pool.cancel();
- texture_info.free();
- }
-
- virtual bool show_samples() const override
- {
- return (info.cpu_threads == 1);
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
-#ifdef WITH_EMBREE
- bvh_layout_mask |= BVH_LAYOUT_EMBREE;
-#endif /* WITH_EMBREE */
- return bvh_layout_mask;
- }
-
- void load_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- virtual void mem_alloc(device_memory &mem) override
- {
- if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
- void *data = util_aligned_malloc(mem.memory_size(), alignment);
- mem.device_pointer = (device_ptr)data;
- }
- else {
- mem.device_pointer = (device_ptr)mem.host_pointer;
- }
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
- }
-
- virtual void mem_copy_to(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* copy is no-op */
- }
- }
-
- virtual void mem_copy_from(
- device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override
- {
- /* no-op */
- }
-
- virtual void mem_zero(device_memory &mem) override
- {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- memset((void *)mem.device_pointer, 0, mem.memory_size());
- }
- }
-
- virtual void mem_free(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else if (mem.device_pointer) {
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- util_aligned_free((void *)mem.device_pointer);
- }
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
- {
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override
- {
-#if WITH_EMBREE
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update scene handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- data->bvh.scene = embree_scene;
- }
-#endif
- kernel_const_copy(&kernel_globals, name, host, size);
- }
-
- void global_alloc(device_memory &mem)
- {
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
-
- void global_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void tex_alloc(device_texture &mem)
- {
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- texture_info[slot] = mem.info;
- texture_info[slot].data = (uint64_t)mem.host_pointer;
- need_texture_info = true;
- }
-
- void tex_free(device_texture &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- need_texture_info = true;
- }
- }
-
- virtual void *osl_memory() override
- {
-#ifdef WITH_OSL
- return &osl_globals;
-#else
- return NULL;
-#endif
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
-#ifdef WITH_EMBREE
- if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
- BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
- if (refit) {
- bvh_embree->refit(progress);
- }
- else {
- bvh_embree->build(progress, &stats, embree_device);
- }
-
- if (bvh->params.top_level) {
- embree_scene = bvh_embree->scene;
- }
- }
- else
-#endif
- Device::build_bvh(bvh, progress, refit);
- }
-
- void thread_run(DeviceTask &task)
- {
- if (task.type == DeviceTask::RENDER)
- thread_render(task);
- else if (task.type == DeviceTask::SHADER)
- thread_shader(task);
- else if (task.type == DeviceTask::FILM_CONVERT)
- thread_film_convert(task);
- else if (task.type == DeviceTask::DENOISE_BUFFER)
- thread_denoise(task);
- }
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
-
- int4 rect = task->rect;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int w = align_up(rect.z - rect.x, 4);
- int h = rect.w - rect.y;
- int stride = task->buffer.stride;
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *blurDifference = temporary_mem;
- float *difference = temporary_mem + task->buffer.pass_stride;
- float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;
-
- memset(weightAccum, 0, sizeof(float) * w * h);
- memset((float *)out_ptr, 0, sizeof(float) * w * h);
-
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {
- max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)guide_ptr,
- (float *)variance_ptr,
- NULL,
- difference,
- local_rect,
- w,
- channel_offset,
- 0,
- a,
- k_2);
-
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
-
- filter_nlm_update_output_kernel()(dx,
- dy,
- blurDifference,
- (float *)image_ptr,
- difference,
- (float *)out_ptr,
- weightAccum,
- local_rect,
- channel_offset,
- stride,
- f);
- }
-
- int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
- filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);
-
- return true;
- }
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
-
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
- task->tile_info,
- x + task->filter_area.x,
- y + task->filter_area.y,
- y * task->filter_area.z + x,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- &task->rect.x,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- task->buffer.use_time,
- task->radius,
- task->pca_threshold);
- }
- }
- return true;
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *difference = temporary_mem;
- float *blurDifference = temporary_mem + task->buffer.pass_stride;
-
- int r = task->radius;
- int frame_offset = frame * task->buffer.frame_stride;
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {max(0, -dx),
- max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)color_ptr,
- (float *)color_variance_ptr,
- (float *)scale_ptr,
- difference,
- local_rect,
- task->buffer.stride,
- task->buffer.pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_calc_weight_kernel()(
- blurDifference, difference, local_rect, task->buffer.stride, 4);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_construct_gramian_kernel()(dx,
- dy,
- task->tile_info->frames[frame],
- blurDifference,
- (float *)task->buffer.mem.device_pointer,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- local_rect,
- &task->reconstruction_state.filter_window.x,
- task->buffer.stride,
- 4,
- task->buffer.pass_stride,
- frame_offset,
- task->buffer.use_time);
- }
-
- return true;
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_finalize_kernel()(x,
- y,
- y * task->filter_area.z + x,
- (float *)output_ptr,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- &task->reconstruction_state.buffer_params.x,
- task->render_buffer.samples);
- }
- }
- return true;
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
-
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- filter_combine_halves_kernel()(x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- (float *)a_ptr,
- (float *)b_ptr,
- &rect.x,
- r);
- }
- }
- return true;
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_divide_shadow_kernel()(task->render_buffer.samples,
- task->tile_info,
- x,
- y,
- (float *)a_ptr,
- (float *)b_ptr,
- (float *)sample_variance_ptr,
- (float *)sv_variance_ptr,
- (float *)buffer_variance_ptr,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_get_feature_kernel()(task->render_buffer.samples,
- task->tile_info,
- mean_offset,
- variance_offset,
- x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- scale,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_write_feature_kernel()(task->render_buffer.samples,
- x + task->filter_area.x,
- y + task->filter_area.y,
- &task->reconstruction_state.buffer_params.x,
- (float *)from_ptr,
- (float *)buffer_ptr,
- out_offset,
- &task->rect.x);
- }
- }
- return true;
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_detect_outliers_kernel()(x,
- y,
- (float *)image_ptr,
- (float *)variance_ptr,
- (float *)depth_ptr,
- (float *)output_ptr,
- &task->rect.x,
- task->buffer.pass_stride);
- }
- }
- return true;
- }
-
- bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
- {
- WorkTile wtile;
- wtile.x = tile.x;
- wtile.y = tile.y;
- wtile.w = tile.w;
- wtile.h = tile.h;
- wtile.offset = tile.offset;
- wtile.stride = tile.stride;
- wtile.buffer = (float *)tile.buffer;
-
- /* For CPU we do adaptive stopping per sample so we can stop earlier, but
- * for combined CPU + GPU rendering we match the GPU and do it per tile
- * after a given number of sample steps. */
- if (!kernel_data.integrator.adaptive_stop_per_sample) {
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- const int index = wtile.offset + x + y * wtile.stride;
- float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
- }
-
- bool any = false;
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
- }
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
- }
- return (!any);
- }
-
- void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
- {
- float *render_buffer = (float *)tile.buffer;
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- int index = tile.offset + x + y * tile.stride;
- ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
- }
- }
- }
- }
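To make the rescaling in adaptive_sampling_post() concrete (a reading of the code above, hedged): a negative value in the sample-count pass appears to mark a pixel whose adaptive sampling stopped early, with its absolute value holding the samples actually accumulated. If such a pixel stopped at 64 samples while the tile ran to 256, sample_multiplier = 256 / 64 = 4, presumably so that its accumulated values match pixels that received all 256 samples and the whole tile can later be divided by one sample count; pixels that did not stop early instead get the incremental tile.sample / (tile.sample - 1) adjustment.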
-
- void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
- {
- const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
- scoped_timer timer(&tile.buffers->render_time);
-
- Coverage coverage(kg, tile);
- if (use_coverage) {
- coverage.init_path_trace();
- }
-
- float *render_buffer = (float *)tile.buffer;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- /* Needed for Embree. */
- SIMD_SET_FLUSH_TO_ZERO;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel() || TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
- tile.stealing_state = RenderTile::WAS_STOLEN;
- break;
- }
-
- if (tile.task == RenderTile::PATH_TRACE) {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- if (use_coverage) {
- coverage.init_pixel(x, y);
- }
- path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- else {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- tile.sample = sample + 1;
-
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
- const bool stop = adaptive_sampling_filter(kg, tile, sample);
- if (stop) {
- const int num_progress_samples = end_sample - sample;
- tile.sample = end_sample;
- task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
- break;
- }
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
- if (use_coverage) {
- coverage.finalize();
- }
-
- if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
- adaptive_sampling_post(tile, kg);
- }
- }
-
- void denoise_openimagedenoise_buffer(DeviceTask &task,
- float *buffer,
- const size_t offset,
- const size_t stride,
- const size_t x,
- const size_t y,
- const size_t w,
- const size_t h,
- const float scale)
- {
-#ifdef WITH_OPENIMAGEDENOISE
- assert(openimagedenoise_supported());
-
- /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
- * buffers, and for tiled rendering because creating multiple devices and filters
- * is slow and memory hungry as well.
- *
- * TODO: optimize tiled rendering case, by batching together denoising of many
- * tiles somehow? */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- /* Create device and filter, cached for reuse. */
- if (!oidn_device) {
- oidn_device = oidn::newDevice();
- oidn_device.commit();
- }
- if (!oidn_filter) {
- oidn_filter = oidn_device.newFilter("RT");
- oidn_filter.set("hdr", true);
- oidn_filter.set("srgb", false);
- }
-
- /* Set images with appropriate stride for our interleaved pass storage. */
- struct {
- const char *name;
- const int offset;
- const bool scale;
- const bool use;
- array<float> scaled_buffer;
- } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
- {"albedo",
- task.pass_denoising_data + DENOISING_PASS_ALBEDO,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
- {"normal",
- task.pass_denoising_data + DENOISING_PASS_NORMAL,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
- {"output", 0, false, true},
- { NULL,
- 0 }};
-
- for (int i = 0; passes[i].name; i++) {
- if (!passes[i].use) {
- continue;
- }
-
- const int64_t pixel_offset = offset + x + y * stride;
- const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
- const int64_t pixel_stride = task.pass_stride;
- const int64_t row_stride = stride * pixel_stride;
-
- if (passes[i].scale && scale != 1.0f) {
- /* Normalize albedo and normal passes as they are scaled by the number of samples.
- * For the color passes OIDN will perform auto-exposure making it unnecessary. */
- array<float> &scaled_buffer = passes[i].scaled_buffer;
- scaled_buffer.resize(w * h * 3);
-
- for (int y = 0; y < h; y++) {
- const float *pass_row = buffer + buffer_offset + y * row_stride;
- float *scaled_row = scaled_buffer.data() + y * w * 3;
-
- for (int x = 0; x < w; x++) {
- scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale;
- scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale;
- scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale;
- }
- }
-
- oidn_filter.setImage(
- passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
- }
- else {
- oidn_filter.setImage(passes[i].name,
- buffer + buffer_offset,
- oidn::Format::Float3,
- w,
- h,
- 0,
- pixel_stride * sizeof(float),
- row_stride * sizeof(float));
- }
- }
-
- /* Execute filter. */
- oidn_filter.commit();
- oidn_filter.execute();
-#else
- (void)task;
- (void)buffer;
- (void)offset;
- (void)stride;
- (void)x;
- (void)y;
- (void)w;
- (void)h;
- (void)scale;
-#endif
- }
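For reference, the OpenImageDenoise calls used in the function above reduce to the following minimal pattern; this is a sketch assuming a single packed RGB float image, ignoring the interleaved Cycles pass layout handled above:

    #ifdef WITH_OPENIMAGEDENOISE
    /* Minimal OIDN sketch: denoise one HDR RGB float image in place. */
    static void oidn_denoise_rgb_example(float *rgb, int width, int height)
    {
      oidn::DeviceRef device = oidn::newDevice();
      device.commit();

      oidn::FilterRef filter = device.newFilter("RT");
      filter.set("hdr", true);
      filter.setImage("color", rgb, oidn::Format::Float3, width, height);
      filter.setImage("output", rgb, oidn::Format::Float3, width, height);
      filter.commit();
      filter.execute();
    }
    #endif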
-
- void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
- {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Copy pixels from compute device to CPU (no-op for CPU device). */
- rtile.buffers->buffer.copy_from_device();
-
- denoise_openimagedenoise_buffer(task,
- (float *)rtile.buffer,
- rtile.offset,
- rtile.stride,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- 1.0f / rtile.sample);
-
- /* todo: it may be possible to avoid this copy, but we have to ensure that
- * when other code copies data from the device it doesn't overwrite the
- * denoiser buffers. */
- rtile.buffers->buffer.copy_to_device();
- }
- else {
- /* Per-tile denoising. */
- rtile.sample = rtile.start_sample + rtile.num_samples;
- const float scale = 1.0f / rtile.sample;
- const float invscale = rtile.sample;
- const size_t pass_stride = task.pass_stride;
-
- /* Map neighboring tiles into one buffer for denoising. */
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- rtile = center_tile;
-
- /* Calculate size of the tile to denoise (including overlap). The overlap
- * size was chosen empirically. OpenImageDenoise specifies an overlap size
- * of 128 but this is significantly bigger than typical tile size. */
- const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
- const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
-
- /* Adjacent tiles are in separate memory regions, copy into single buffer. */
- array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &ntile = neighbors.tiles[i];
- if (!ntile.buffer) {
- continue;
- }
-
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
- merged_buffer[x] = tile_buffer[x] * scale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- /* Denoise */
- denoise_openimagedenoise_buffer(
- task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
-
- /* Copy back result from merged buffer. */
- RenderTile &ntile = neighbors.target;
- if (ntile.buffer) {
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- const float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
- tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
- tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
- tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- }
-
- void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
- {
- ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
-
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoising.functions.construct_transform = function_bind(
- &CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
- denoising.render_buffer.samples = tile.sample;
- denoising.buffer.gpu_temporary_mem = false;
-
- denoising.run_denoising(tile);
- }
-
- void thread_render(DeviceTask &task)
- {
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- return;
- }
-
- /* allocate buffer for kernel globals */
- device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
- KernelGlobals(thread_kernel_globals_init());
-
- profiler.add_state(&kg->profiler);
-
- CPUSplitKernel *split_kernel = NULL;
- if (use_split_kernel) {
- split_kernel = new CPUSplitKernel(this);
- if (!split_kernel->load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kgbuffer.free();
- delete split_kernel;
- return;
- }
- }
-
- /* NLM denoiser. */
- DenoisingTask *denoising = NULL;
-
- /* OpenImageDenoise: we can only denoise with one thread at a time, so to
- * avoid waiting with mutex locks in the denoiser, we let only a single
- * thread acquire denoising tiles. */
- uint tile_types = task.tile_types;
- bool hold_denoise_lock = false;
- if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- if (!oidn_task_lock.try_lock()) {
- tile_types &= ~RenderTile::DENOISE;
- hold_denoise_lock = true;
- }
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
- }
- else {
- render(task, tile, kg);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, kg);
- }
- else if (tile.task == RenderTile::DENOISE) {
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else if (task.denoising.type == DENOISER_NLM) {
- if (denoising == NULL) {
- denoising = new DenoisingTask(this, task);
- denoising->profiler = &kg->profiler;
- }
- denoise_nlm(*denoising, tile);
- }
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- if (hold_denoise_lock) {
- oidn_task_lock.unlock();
- }
-
- profiler.remove_state(&kg->profiler);
-
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kg->~KernelGlobals();
- kgbuffer.free();
- delete split_kernel;
- delete denoising;
- }
-
- void thread_denoise(DeviceTask &task)
- {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else {
- DenoisingTask denoising(this, task);
-
- ProfilingState denoising_profiler_state;
- profiler.add_state(&denoising_profiler_state);
- denoising.profiler = &denoising_profiler_state;
-
- denoise_nlm(denoising, tile);
-
- profiler.remove_state(&denoising_profiler_state);
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- void thread_film_convert(DeviceTask &task)
- {
- float sample_scale = 1.0f / (task.sample + 1);
-
- if (task.rgba_half) {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_half,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- else {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_byte,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- }
-
- void thread_shader(DeviceTask &task)
- {
- KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
-
- for (int sample = 0; sample < task.num_samples; sample++) {
- for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel()(kg,
- (uint4 *)task.shader_input,
- (float4 *)task.shader_output,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
-
- if (task.get_cancel() || TaskPool::canceled())
- break;
-
- task.update_progress(NULL);
- }
-
- thread_kernel_globals_free(kg);
- delete kg;
- }
-
- virtual int get_split_task_count(DeviceTask &task) override
- {
- if (task.type == DeviceTask::SHADER)
- return task.get_subtask_count(info.cpu_threads, 256);
- else
- return task.get_subtask_count(info.cpu_threads);
- }
-
- virtual void task_add(DeviceTask &task) override
- {
- /* Load texture info. */
- load_texture_info();
-
- /* split task into smaller ones */
- list<DeviceTask> tasks;
-
- if (task.type == DeviceTask::DENOISE_BUFFER &&
- task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- /* Denoise entire buffer at once with OIDN, it has own threading. */
- tasks.push_back(task);
- }
- else if (task.type == DeviceTask::SHADER) {
- task.split(tasks, info.cpu_threads, 256);
- }
- else {
- task.split(tasks, info.cpu_threads);
- }
-
- foreach (DeviceTask &task, tasks) {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
- }
-
- virtual void task_wait() override
- {
- task_pool.wait_work();
- }
-
- virtual void task_cancel() override
- {
- task_pool.cancel();
- }
-
- protected:
- inline KernelGlobals thread_kernel_globals_init()
- {
- KernelGlobals kg = kernel_globals;
- kg.transparent_shadow_intersections = NULL;
- const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
- sizeof(*kg.decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- kg.decoupled_volume_steps[i] = NULL;
- }
- kg.decoupled_volume_steps_index = 0;
- kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
-#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
- return kg;
- }
-
- inline void thread_kernel_globals_free(KernelGlobals *kg)
- {
- if (kg == NULL) {
- return;
- }
-
- if (kg->transparent_shadow_intersections != NULL) {
- free(kg->transparent_shadow_intersections);
- }
- const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
- sizeof(*kg->decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- if (kg->decoupled_volume_steps[i] != NULL) {
- free(kg->decoupled_volume_steps[i]);
- }
- }
-#ifdef WITH_OSL
- OSLShader::thread_free(kg);
-#endif
- }
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override
- {
- requested_features = requested_features_;
-
- return true;
- }
-};
-
-/* split kernel */
-
-class CPUSplitKernelFunction : public SplitKernelFunction {
- public:
- CPUDevice *device;
- void (*func)(KernelGlobals *kg, KernelData *data);
-
- CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL)
- {
- }
- ~CPUSplitKernelFunction()
- {
- }
-
- virtual bool enqueue(const KernelDimensions &dim,
- device_memory &kernel_globals,
- device_memory &data)
- {
- if (!func) {
- return false;
- }
-
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- func(kg, (KernelData *)data.device_pointer);
- }
- }
-
- return true;
- }
-};
-
-CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flags,
- device_memory &work_pool_wgs)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
- (KernelData *)data.device_pointer,
- (void *)split_data.device_pointer,
- num_global_elements,
- (char *)ray_state.device_pointer,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int *)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char *)use_queues_flags.device_pointer,
- (uint *)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float *)rtile.buffer);
- }
- }
-
- return true;
-}
-
-SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
-
- kernel->func = device->split_kernels[kernel_name]();
- if (!kernel->func) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
-}
-
-int2 CPUSplitKernel::split_kernel_local_size()
-{
- return make_int2(1, 1);
-}
-
-int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- DeviceTask & /*task*/)
-{
- return make_int2(1, 1);
-}
-
-uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals,
- device_memory & /*data*/,
- size_t num_threads)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
-
- return split_data_buffer_size(kg, num_threads);
-}
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new CPUDevice(info, stats, profiler, background);
-}
-
-void device_cpu_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_CPU;
- info.description = system_cpu_brand_string();
- info.id = "CPU";
- info.num = 0;
- info.has_volume_decoupled = true;
- info.has_adaptive_stop_per_sample = true;
- info.has_osl = true;
- info.has_half_images = true;
- info.has_nanovdb = true;
- info.has_profiling = true;
- info.denoisers = DENOISER_NLM;
- if (openimagedenoise_supported()) {
- info.denoisers |= DENOISER_OPENIMAGEDENOISE;
- }
-
- devices.insert(devices.begin(), info);
-}
-
-string device_cpu_capabilities()
-{
- string capabilities = "";
- capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
- capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
- capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
- capabilities += system_cpu_support_avx() ? "AVX " : "";
- capabilities += system_cpu_support_avx2() ? "AVX2" : "";
- if (capabilities[capabilities.size() - 1] == ' ')
- capabilities.resize(capabilities.size() - 1);
- return capabilities;
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp
new file mode 100644
index 00000000000..aea7868f65d
--- /dev/null
+++ b/intern/cycles/device/device_denoise.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *denoiserTypeToHumanReadable(DenoiserType type)
+{
+ switch (type) {
+ case DENOISER_OPTIX:
+ return "OptiX";
+ case DENOISER_OPENIMAGEDENOISE:
+ return "OpenImageDenoise";
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ return "UNKNOWN";
+ }
+
+ return "UNKNOWN";
+}
+
+const NodeEnum *DenoiseParams::get_type_enum()
+{
+ static NodeEnum type_enum;
+
+ if (type_enum.empty()) {
+ type_enum.insert("optix", DENOISER_OPTIX);
+ type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE);
+ }
+
+ return &type_enum;
+}
+
+const NodeEnum *DenoiseParams::get_prefilter_enum()
+{
+ static NodeEnum prefilter_enum;
+
+ if (prefilter_enum.empty()) {
+ prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+ }
+
+ return &prefilter_enum;
+}
+
+NODE_DEFINE(DenoiseParams)
+{
+ NodeType *type = NodeType::add("denoise_params", create);
+
+ const NodeEnum *type_enum = get_type_enum();
+ const NodeEnum *prefilter_enum = get_prefilter_enum();
+
+ SOCKET_BOOLEAN(use, "Use", false);
+
+ SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE);
+
+ SOCKET_INT(start_sample, "Start Sample", 0);
+
+ SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true);
+ SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false);
+
+ SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST);
+
+ return type;
+}
+
+DenoiseParams::DenoiseParams() : Node(get_node_type())
+{
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h
new file mode 100644
index 00000000000..02ee63fb0ad
--- /dev/null
+++ b/intern/cycles/device/device_denoise.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+#include "graph/node.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum DenoiserType {
+ DENOISER_OPTIX = 2,
+ DENOISER_OPENIMAGEDENOISE = 4,
+ DENOISER_NUM,
+
+ DENOISER_NONE = 0,
+ DENOISER_ALL = ~0,
+};
+
+/* Construct a human-readable string which denotes the denoiser type. */
+const char *denoiserTypeToHumanReadable(DenoiserType type);
+
+typedef int DenoiserTypeMask;
+
+enum DenoiserPrefilter {
+ /* Best quality of the result without extra processing time, but requires guiding passes to be
+ * noise-free. */
+ DENOISER_PREFILTER_NONE = 1,
+
+ /* Denoise color and guiding passes together. Improves quality when the guiding passes are
+ * noisy, while using the least amount of extra processing time. */
+ DENOISER_PREFILTER_FAST = 2,
+
+ /* Prefilter noisy guiding passes before denoising color.
+ * Improves quality when guiding passes are noisy using extra processing time. */
+ DENOISER_PREFILTER_ACCURATE = 3,
+
+ DENOISER_PREFILTER_NUM,
+};
+
+/* NOTE: This is not a real scene node; the Node API is only used for ease of (de)serialization.
+ * The default values here do not really matter, as they are always initialized from the
+ * Integrator node. */
+class DenoiseParams : public Node {
+ public:
+ NODE_DECLARE
+
+ /* Apply denoiser to image. */
+ bool use = false;
+
+ /* Denoiser type. */
+ DenoiserType type = DENOISER_OPENIMAGEDENOISE;
+
+ /* Viewport start sample. */
+ int start_sample = 0;
+
+ /* Auxiliary passes. */
+ bool use_pass_albedo = true;
+ bool use_pass_normal = true;
+
+ DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST;
+
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_prefilter_enum();
+
+ DenoiseParams();
+
+ bool modified(const DenoiseParams &other) const
+ {
+ return !(use == other.use && type == other.type && start_sample == other.start_sample &&
+ use_pass_albedo == other.use_pass_albedo &&
+ use_pass_normal == other.use_pass_normal && prefilter == other.prefilter);
+ }
+};
+
+/* All the parameters needed to perform buffer denoising on a device.
+ * This is not really a task in the canonical sense (it is not an asynchronously running task);
+ * it is more like a wrapper for all the arguments and parameters needed to perform denoising.
+ * Having them in a single place means device methods do not all need to be modified whenever
+ * these parameters change. */
+class DeviceDenoiseTask {
+ public:
+ DenoiseParams params;
+
+ int num_samples;
+
+ RenderBuffers *render_buffers;
+ BufferParams buffer_params;
+
+ /* Allow in-place modification of the input passes (e.g. scaling them down). This lowers the
+ * memory footprint of the denoiser, but makes the input passes "invalid" from the path
+ * tracer's point of view. */
+ bool allow_inplace_modification;
+};
+
+CCL_NAMESPACE_END
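
The modified() helper above is what lets callers avoid resetting denoiser state when nothing actually changed. A minimal sketch of such a check, assuming a hypothetical caller that receives freshly synced parameters:

  DenoiseParams current, incoming;
  incoming.type = DENOISER_OPTIX;
  incoming.prefilter = DENOISER_PREFILTER_ACCURATE;

  /* modified() compares all user-visible parameters, so this only triggers on a real change. */
  if (current.modified(incoming)) {
    /* Re-create or re-configure the denoiser with the incoming parameters. */
  }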
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
deleted file mode 100644
index 38c42d15cab..00000000000
--- a/intern/cycles/device/device_denoising.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_denoising.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
- : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
- profiler(NULL),
- storage(device),
- buffer(device),
- device(device)
-{
- radius = task.denoising.radius;
- nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
- if (task.denoising.relative_pca) {
- pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
- }
- else {
- pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
- }
-
- render_buffer.frame_stride = task.frame_stride;
- render_buffer.pass_stride = task.pass_stride;
- render_buffer.offset = task.pass_denoising_data;
-
- target_buffer.pass_stride = task.target_pass_stride;
- target_buffer.denoising_clean_offset = task.pass_denoising_clean;
- target_buffer.offset = 0;
-
- functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
- functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
-
- tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int));
- tile_info->from_render = task.denoising_from_render ? 1 : 0;
-
- tile_info->frames[0] = 0;
- tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
- for (int i = 1; i < tile_info->num_frames; i++) {
- tile_info->frames[i] = task.denoising_frames[i - 1];
- }
-
- do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
- do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
-}
-
-DenoisingTask::~DenoisingTask()
-{
- storage.XtWX.free();
- storage.XtWY.free();
- storage.transform.free();
- storage.rank.free();
- buffer.mem.free();
- buffer.temporary_mem.free();
- tile_info_mem.free();
-}
-
-void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors)
-{
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &rtile = neighbors.tiles[i];
- tile_info->offsets[i] = rtile.offset;
- tile_info->strides[i] = rtile.stride;
- tile_info->buffers[i] = rtile.buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
-
- target_buffer.offset = neighbors.target.offset;
- target_buffer.stride = neighbors.target.stride;
- target_buffer.ptr = neighbors.target.buffer;
-
- if (do_prefilter && neighbors.target.buffers) {
- target_buffer.denoising_output_offset =
- neighbors.target.buffers->params.get_denoising_prefiltered_offset();
- }
- else {
- target_buffer.denoising_output_offset = 0;
- }
-
- tile_info_mem.copy_to_device();
-}
-
-void DenoisingTask::setup_denoising_buffer()
-{
- /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring
- * tiles */
- rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
- rect = rect_expand(rect, radius);
- rect = rect_clip(rect,
- make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
-
- buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
- buffer.passes = buffer.use_intensity ? 15 : 14;
- buffer.width = rect.z - rect.x;
- buffer.stride = align_up(buffer.width, 4);
- buffer.h = rect.w - rect.y;
- int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
- buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
- buffer.frame_stride = buffer.pass_stride * buffer.passes;
- /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
- buffer.mem.alloc_to_device(mem_size, false);
- buffer.use_time = (tile_info->num_frames > 1);
-
- /* CPUs process shifts sequentially while GPUs process them in parallel. */
- int num_layers;
- if (buffer.gpu_temporary_mem) {
- /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
- int max_radius = max(radius, 6);
- int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
- num_layers = 2 * num_shifts + 1;
- }
- else {
- num_layers = 3;
- }
- /* Allocate two layers per shift as well as one for the weight accumulation. */
- buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
-}
-
-void DenoisingTask::prefilter_shadowing()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride);
-
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the
- * sample variance and the buffer variance. */
- functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the
- * sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
- functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
-
- /* Reuse memory, the previous data isn't needed anymore. */
- device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
- /* Use the smoothed variance to filter the two shadow half images using each other for weight
- * calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
- functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
-
- device_ptr residual_var = *sample_var_var;
- /* Estimate the residual variance between the two filtered halves. */
- functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
-
- device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b;
- /* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
- functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
- functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
-
- /* Combine the two double-filtered halves to a final shadow feature. */
- device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride);
- functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
-}
-
-void DenoisingTask::prefilter_features()
-{
- device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride);
-
- int mean_from[] = {0, 1, 2, 12, 6, 7, 8};
- int variance_from[] = {3, 4, 5, 13, 9, 10, 11};
- int pass_to[] = {1, 2, 3, 0, 5, 6, 7};
- for (int pass = 0; pass < 7; pass++) {
- device_sub_ptr feature_pass(
- buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride);
- /* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *unfiltered,
- *variance,
- 1.0f / render_buffer.samples);
- /* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
- }
-}
-
-void DenoisingTask::prefilter_color()
-{
- int mean_from[] = {20, 21, 22};
- int variance_from[] = {23, 24, 25};
- int mean_to[] = {8, 9, 10};
- int variance_to[] = {11, 12, 13};
- int num_color_passes = 3;
-
- device_only_memory<float> temporary_color(device, "denoising temporary color");
- temporary_color.alloc_to_device(6 * buffer.pass_stride, false);
-
- for (int pass = 0; pass < num_color_passes; pass++) {
- device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *color_pass,
- *color_var_pass,
- 1.0f / render_buffer.samples);
- }
-
- device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- functions.detect_outliers(
- temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
-
- if (buffer.use_intensity) {
- device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true);
- functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
- }
-}
-
-void DenoisingTask::load_buffer()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- int original_offset = render_buffer.offset;
-
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int i = 0; i < tile_info->num_frames; i++) {
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr to_pass(
- buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride);
- bool is_variance = (pass >= 11) && (pass <= 13);
- functions.get_feature(
- pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f);
- }
- render_buffer.offset += render_buffer.frame_stride;
- }
-
- render_buffer.offset = original_offset;
-}
-
-void DenoisingTask::write_buffer()
-{
- reconstruction_state.buffer_params = make_int4(target_buffer.offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride);
- int out_offset = pass + target_buffer.denoising_output_offset;
- functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
- }
-}
-
-void DenoisingTask::construct_transform()
-{
- storage.w = filter_area.z;
- storage.h = filter_area.w;
-
- storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false);
- storage.rank.alloc_to_device(storage.w * storage.h, false);
-
- functions.construct_transform();
-}
-
-void DenoisingTask::reconstruct()
-{
- storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false);
- storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false);
- storage.XtWX.zero_to_device();
- storage.XtWY.zero_to_device();
-
- reconstruction_state.filter_window = rect_from_shape(
- filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h);
- int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x;
- reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- reconstruction_state.source_w = rect.z - rect.x;
- reconstruction_state.source_h = rect.w - rect.y;
-
- device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride);
- for (int f = 0; f < tile_info->num_frames; f++) {
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
- scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- scale_ptr = **scale_sub_ptr;
- }
-
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
- delete scale_sub_ptr;
- }
- functions.solve(target_buffer.ptr);
-}
-
-void DenoisingTask::run_denoising(RenderTile &tile)
-{
- RenderTileNeighbors neighbors(tile);
- functions.map_neighbor_tiles(neighbors);
- set_render_buffer(neighbors);
-
- setup_denoising_buffer();
-
- if (tile_info->from_render) {
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
- }
- else {
- load_buffer();
- }
-
- if (do_filter) {
- construct_transform();
- reconstruct();
- }
-
- if (do_prefilter) {
- write_buffer();
- }
-
- functions.unmap_neighbor_tiles(neighbors);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
deleted file mode 100644
index bb8bdfdd225..00000000000
--- a/intern/cycles/device/device_denoising.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_DENOISING_H__
-#define __DEVICE_DENOISING_H__
-
-#include "device/device.h"
-
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "util/util_profiling.h"
-
-CCL_NAMESPACE_BEGIN
-
-class DenoisingTask {
- public:
- /* Parameters of the denoising algorithm. */
- int radius;
- float nlm_k_2;
- float pca_threshold;
-
- /* Parameters of the RenderBuffers. */
- struct RenderBuffers {
- int offset;
- int pass_stride;
- int frame_stride;
- int samples;
- } render_buffer;
-
- /* Pointer and parameters of the target buffer. */
- struct TargetBuffer {
- int offset;
- int stride;
- int pass_stride;
- int denoising_clean_offset;
- int denoising_output_offset;
- device_ptr ptr;
- } target_buffer;
-
- TileInfo *tile_info;
- device_vector<int> tile_info_mem;
-
- ProfilingState *profiler;
-
- int4 rect;
- int4 filter_area;
-
- bool do_prefilter;
- bool do_filter;
-
- struct DeviceFunctions {
- function<bool(
- device_ptr image_ptr, /* Contains the values that are smoothed. */
- device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
- device_ptr variance_ptr, /* Contains the variance of the guide image. */
- device_ptr out_ptr /* The filtered output is written into this image. */
- )>
- non_local_means;
- function<bool(
- device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)>
- accumulate;
- function<bool(device_ptr output_ptr)> solve;
- function<bool()> construct_transform;
-
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect)>
- combine_halves;
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr)>
- divide_shadow;
- function<bool(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale)>
- get_feature;
- function<bool(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr)>
- detect_outliers;
- function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature;
- function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
- } functions;
-
- /* Stores state of the current Reconstruction operation,
- * which is accessed by the device in order to perform the operation. */
- struct ReconstructionState {
- int4 filter_window;
- int4 buffer_params;
-
- int source_w;
- int source_h;
- } reconstruction_state;
-
- /* Stores state of the current NLM operation,
- * which is accessed by the device in order to perform the operation. */
- struct NLMState {
- int r; /* Search radius of the filter. */
- int f; /* Patch size of the filter. */
- float a; /* Variance compensation factor in the MSE estimation. */
- float k_2; /* Squared value of the k parameter of the filter. */
- bool is_color;
-
- void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
- {
- r = r_;
- f = f_;
- a = a_, k_2 = k_2_;
- is_color = is_color_;
- }
- } nlm_state;
-
- struct Storage {
- device_only_memory<float> transform;
- device_only_memory<int> rank;
- device_only_memory<float> XtWX;
- device_only_memory<float3> XtWY;
- int w;
- int h;
-
- Storage(Device *device)
- : transform(device, "denoising transform"),
- rank(device, "denoising rank"),
- XtWX(device, "denoising XtWX"),
- XtWY(device, "denoising XtWY")
- {
- }
- } storage;
-
- DenoisingTask(Device *device, const DeviceTask &task);
- ~DenoisingTask();
-
- void run_denoising(RenderTile &tile);
-
- struct DenoiseBuffers {
- int pass_stride;
- int passes;
- int stride;
- int h;
- int width;
- int frame_stride;
- device_only_memory<float> mem;
- device_only_memory<float> temporary_mem;
- bool use_time;
- bool use_intensity;
-
- bool gpu_temporary_mem;
-
- DenoiseBuffers(Device *device)
- : mem(device, "denoising pixel buffer"),
- temporary_mem(device, "denoising temporary mem", true)
- {
- }
- } buffer;
-
- protected:
- Device *device;
-
- void set_render_buffer(RenderTileNeighbors &neighbors);
- void setup_denoising_buffer();
- void prefilter_shadowing();
- void prefilter_features();
- void prefilter_color();
- void construct_transform();
- void reconstruct();
-
- void load_buffer();
- void write_buffer();
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_DENOISING_H__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/device/device_graphics_interop.cpp
index fa210e747c0..a80a236759f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ b/intern/cycles/device/device_graphics_interop.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,8 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_path_init.h"
+#include "device/device_graphics_interop.h"
-#define KERNEL_NAME path_init
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+CCL_NAMESPACE_BEGIN
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h
new file mode 100644
index 00000000000..671b1c189d7
--- /dev/null
+++ b/intern/cycles/device/device_graphics_interop.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Information about the interoperability destination.
+ * It is provided by the GPUDisplay. */
+class DeviceGraphicsInteropDestination {
+ public:
+ /* Dimensions of the buffer, in pixels. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+
+ /* OpenGL pixel buffer object. */
+ int opengl_pbo_id = 0;
+
+ /* Clear the entire destination before doing partial write to it. */
+ bool need_clear = false;
+};
+
+/* Device-side graphics interoperability support.
+ *
+ * Takes care of holding all the handlers needed by the device to implement interoperability with
+ * the graphics library. */
+class DeviceGraphicsInterop {
+ public:
+ DeviceGraphicsInterop() = default;
+ virtual ~DeviceGraphicsInterop() = default;
+
+ /* Update this device-side graphics interoperability object with the given destination resource
+ * information. */
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0;
+
+ virtual device_ptr map() = 0;
+ virtual void unmap() = 0;
+};
+
+CCL_NAMESPACE_END
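
The interface above only requires a backend to remember the destination and to expose a mappable device pointer. A minimal sketch of a conforming implementation, where the class name and its no-op behaviour are purely illustrative:

  class NullGraphicsInterop : public DeviceGraphicsInterop {
   public:
    void set_destination(const DeviceGraphicsInteropDestination &destination) override
    {
      destination_ = destination;
    }

    device_ptr map() override
    {
      /* A real backend would map destination_.opengl_pbo_id into device memory here. */
      return 0;
    }

    void unmap() override
    {
      /* Nothing was mapped, so there is nothing to release. */
    }

   protected:
    DeviceGraphicsInteropDestination destination_;
  };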
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
deleted file mode 100644
index ecc79c5d7ee..00000000000
--- a/intern/cycles/device/device_intern.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_INTERN_H__
-#define __DEVICE_INTERN_H__
-
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Device;
-class DeviceInfo;
-class Profiler;
-class Stats;
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_init();
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_compile_kernel(const vector<string> &parameters);
-bool device_cuda_init();
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_optix_init();
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address);
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-void device_cpu_info(vector<DeviceInfo> &devices);
-void device_opencl_info(vector<DeviceInfo> &devices);
-void device_cuda_info(vector<DeviceInfo> &devices);
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
-void device_network_info(vector<DeviceInfo> &devices);
-
-string device_cpu_capabilities();
-string device_opencl_capabilities();
-string device_cuda_capabilities();
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_INTERN_H__ */
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
new file mode 100644
index 00000000000..ceaddee4756
--- /dev/null
+++ b/intern/cycles/device/device_kernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_kernel.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel)
+{
+ switch (kernel) {
+ /* Integrator. */
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+ return "integrator_init_from_camera";
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+ return "integrator_init_from_bake";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ return "integrator_intersect_closest";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ return "integrator_intersect_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ return "integrator_intersect_subsurface";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ return "integrator_intersect_volume_stack";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ return "integrator_shade_background";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ return "integrator_shade_light";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ return "integrator_shade_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ return "integrator_shade_surface";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ return "integrator_shade_surface_raytrace";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ return "integrator_shade_volume";
+ case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+ return "integrator_megakernel";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ return "integrator_queued_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ return "integrator_queued_shadow_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ return "integrator_active_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ return "integrator_terminated_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ return "integrator_sorted_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ return "integrator_compact_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+ return "integrator_compact_states";
+ case DEVICE_KERNEL_INTEGRATOR_RESET:
+ return "integrator_reset";
+ case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+ return "integrator_shadow_catcher_count_possible_splits";
+
+ /* Shader evaluation. */
+ case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+ return "shader_eval_displace";
+ case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+ return "shader_eval_background";
+
+ /* Film. */
+
+#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant: \
+ return "film_convert_" #variant_lowercase; \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \
+ return "film_convert_" #variant_lowercase "_half_rgba";
+
+ FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth)
+ FILM_CONVERT_KERNEL_AS_STRING(MIST, mist)
+ FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
+ FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
+ FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
+ FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW,
+ shadow_catcher_matte_with_shadow)
+ FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4)
+
+#undef FILM_CONVERT_KERNEL_AS_STRING
+
+ /* Adaptive sampling. */
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
+ return "adaptive_sampling_convergence_check";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X:
+ return "adaptive_sampling_filter_x";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y:
+ return "adaptive_sampling_filter_y";
+
+ /* Denoising. */
+ case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS:
+ return "filter_guiding_preprocess";
+ case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO:
+ return "filter_guiding_set_fake_albedo";
+ case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS:
+ return "filter_color_preprocess";
+ case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
+ return "filter_color_postprocess";
+
+ /* Cryptomatte. */
+ case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
+ return "cryptomatte_postprocess";
+
+ /* Generic */
+ case DEVICE_KERNEL_PREFIX_SUM:
+ return "prefix_sum";
+
+ case DEVICE_KERNEL_NUM:
+ break;
+ };
+ LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
+{
+ os << device_kernel_as_string(kernel);
+ return os;
+}
+
+string device_kernel_mask_as_string(DeviceKernelMask mask)
+{
+ string str;
+
+ for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) {
+ if (mask & (uint64_t(1) << i)) {
+ if (!str.empty()) {
+ str += " ";
+ }
+ str += device_kernel_as_string((DeviceKernel)i);
+ }
+ }
+
+ return str;
+}
+
+CCL_NAMESPACE_END
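
As the loop above shows, DeviceKernelMask is a plain bitfield indexed by DeviceKernel. Assuming the usual one-bit-per-kernel convention, building a mask and converting it back to readable names would look roughly like:

  DeviceKernelMask mask = 0;
  mask |= uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
  mask |= uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME;

  /* Expected to yield "integrator_shade_surface integrator_shade_volume". */
  string names = device_kernel_mask_as_string(mask);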
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/device/device_kernel.h
index 9e1e57beba6..83d959ca87b 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/device/device_kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,20 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#pragma once
-#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
-#define LOCALS_TYPE BackgroundAOLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "kernel/kernel_types.h"
+#include "util/util_string.h"
+
+#include <ostream> // NOLINT
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel);
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
+
+typedef uint64_t DeviceKernelMask;
+string device_kernel_mask_as_string(DeviceKernelMask mask);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 80a05fc32fe..c4d45829b83 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
device_memory::device_memory(Device *device, const char *name, MemoryType type)
: data_type(device_type_traits<uchar>::data_type),
- data_elements(device_type_traits<uchar>::num_elements),
+ data_elements(device_type_traits<uchar>::num_elements_cpu),
data_size(0),
device_size(0),
data_width(0),
@@ -149,6 +149,11 @@ void device_memory::device_zero()
}
}
+bool device_memory::device_is_cpu()
+{
+ return (device->info.type == DEVICE_CPU);
+}
+
void device_memory::swap_device(Device *new_device,
size_t new_device_size,
device_ptr new_device_ptr)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 80f4d7b0468..c51594b8580 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -38,7 +38,6 @@ enum MemoryType {
MEM_DEVICE_ONLY,
MEM_GLOBAL,
MEM_TEXTURE,
- MEM_PIXELS
};
/* Supported Data Types */
@@ -54,7 +53,7 @@ enum DataType {
TYPE_UINT64,
};
-static inline size_t datatype_size(DataType datatype)
+static constexpr size_t datatype_size(DataType datatype)
{
switch (datatype) {
case TYPE_UNKNOWN:
@@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype)
template<typename T> struct device_type_traits {
static const DataType data_type = TYPE_UNKNOWN;
- static const int num_elements = sizeof(T);
+ static const int num_elements_cpu = sizeof(T);
+ static const int num_elements_gpu = sizeof(T);
};
template<> struct device_type_traits<uchar> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar2> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar3> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar4> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint2> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint3> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint4> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int2> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int3> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int4> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float2> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float4> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<ushort4> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint16_t> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half4> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint64_t> {
static const DataType data_type = TYPE_UINT64;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
};
/* Device Memory
@@ -257,6 +299,8 @@ class device_memory {
void device_copy_from(int y, int w, int h, int elem);
void device_zero();
+ bool device_is_cpu();
+
device_ptr original_device_ptr;
size_t original_device_size;
Device *original_device;
@@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory {
: device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
{
data_type = device_type_traits<T>::data_type;
- data_elements = max(device_type_traits<T>::num_elements, 1);
+ data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
+ device_type_traits<T>::num_elements_gpu,
+ 1);
}
device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory {
template<typename T> class device_vector : public device_memory {
public:
+ /* Can only use this for types that have the same size on CPU and GPU. */
+ static_assert(device_type_traits<T>::num_elements_cpu ==
+ device_type_traits<T>::num_elements_gpu);
+
device_vector(Device *device, const char *name, MemoryType type)
: device_memory(device, name, type)
{
data_type = device_type_traits<T>::data_type;
- data_elements = device_type_traits<T>::num_elements;
+ data_elements = device_type_traits<T>::num_elements_cpu;
modified = true;
need_realloc_ = true;
@@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory {
return (T *)host_pointer;
}
+ const T *data() const
+ {
+ return (T *)host_pointer;
+ }
+
T &operator[](size_t i)
{
assert(i < data_size);
@@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory {
void copy_from_device()
{
- device_copy_from(0, data_width, data_height, sizeof(T));
+ device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
}
void copy_from_device(int y, int w, int h)
@@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory {
}
};
-/* Pixel Memory
- *
- * Device memory to efficiently draw as pixels to the screen in interactive
- * rendering. Only copying pixels from the device is supported, not copying to. */
-
-template<typename T> class device_pixels : public device_vector<T> {
- public:
- device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS)
- {
- }
-
- void alloc_to_device(size_t width, size_t height, size_t depth = 0)
- {
- device_vector<T>::alloc(width, height, depth);
-
- if (!device_memory::device_pointer) {
- device_memory::device_alloc();
- }
- }
-
- T *copy_from_device(int y, int w, int h)
- {
- device_memory::device_copy_from(y, w, h, sizeof(T));
- return device_vector<T>::data();
- }
-};
-
/* Device Sub Memory
*
* Pointer into existing memory. It is not allocated separately, but created
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
deleted file mode 100644
index 85ffa5fcd52..00000000000
--- a/intern/cycles/device/device_multi.cpp
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sstream>
-#include <stdlib.h>
-
-#include "bvh/bvh_multi.h"
-
-#include "device/device.h"
-#include "device/device_intern.h"
-#include "device/device_network.h"
-
-#include "render/buffers.h"
-#include "render/geometry.h"
-
-#include "util/util_foreach.h"
-#include "util/util_list.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-class MultiDevice : public Device {
- public:
- struct SubDevice {
- Stats stats;
- Device *device;
- map<device_ptr, device_ptr> ptr_map;
- int peer_island_index = -1;
- };
-
- list<SubDevice> devices, denoising_devices;
- device_ptr unique_key;
- vector<vector<SubDevice *>> peer_islands;
- bool use_denoising;
- bool matching_rendering_and_denoising_devices;
-
- MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- unique_key(1),
- use_denoising(!info.denoising_devices.empty())
- {
- foreach (DeviceInfo &subinfo, info.multi_devices) {
- /* Always add CPU devices at the back since GPU devices can change
- * host memory pointers, which CPU uses as device pointer. */
- SubDevice *sub;
- if (subinfo.type == DEVICE_CPU) {
- devices.emplace_back();
- sub = &devices.back();
- }
- else {
- devices.emplace_front();
- sub = &devices.front();
- }
-
- /* The pointer to 'sub->stats' will stay valid even after new devices
- * are added, since 'devices' is a linked list. */
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- foreach (DeviceInfo &subinfo, info.denoising_devices) {
- denoising_devices.emplace_front();
- SubDevice *sub = &denoising_devices.front();
-
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- /* Build a list of peer islands for the available render devices */
- foreach (SubDevice &sub, devices) {
- /* First ensure that every device is in at least once peer island */
- if (sub.peer_island_index < 0) {
- peer_islands.emplace_back();
- sub.peer_island_index = (int)peer_islands.size() - 1;
- peer_islands[sub.peer_island_index].push_back(&sub);
- }
-
- if (!info.has_peer_memory) {
- continue;
- }
-
- /* Second check peer access between devices and fill up the islands accordingly */
- foreach (SubDevice &peer_sub, devices) {
- if (peer_sub.peer_island_index < 0 &&
- peer_sub.device->info.type == sub.device->info.type &&
- peer_sub.device->check_peer_access(sub.device)) {
- peer_sub.peer_island_index = sub.peer_island_index;
- peer_islands[sub.peer_island_index].push_back(&peer_sub);
- }
- }
- }
-
- /* Try to re-use memory when denoising and render devices use the same physical devices
- * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU).
- * Ordering has to match as well, so that 'DeviceTask::split' behaves consistent. */
- matching_rendering_and_denoising_devices = denoising_devices.empty() ||
- (devices.size() == denoising_devices.size());
- if (matching_rendering_and_denoising_devices) {
- for (list<SubDevice>::iterator device_it = devices.begin(),
- denoising_device_it = denoising_devices.begin();
- device_it != devices.end() && denoising_device_it != denoising_devices.end();
- ++device_it, ++denoising_device_it) {
- const DeviceInfo &info = device_it->device->info;
- const DeviceInfo &denoising_info = denoising_device_it->device->info;
- if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) ||
- (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) ||
- info.num != denoising_info.num) {
- matching_rendering_and_denoising_devices = false;
- break;
- }
- }
- }
-
-#ifdef WITH_NETWORK
- /* try to add network devices */
- ServerDiscovery discovery(true);
- time_sleep(1.0);
-
- vector<string> servers = discovery.get_server_list();
-
- foreach (string &server, servers) {
- Device *device = device_network_create(info, stats, profiler, server.c_str());
- if (device)
- devices.push_back(SubDevice(device));
- }
-#endif
- }
-
- ~MultiDevice()
- {
- foreach (SubDevice &sub, devices)
- delete sub.device;
- foreach (SubDevice &sub, denoising_devices)
- delete sub.device;
- }
-
- const string &error_message() override
- {
- error_msg.clear();
-
- foreach (SubDevice &sub, devices)
- error_msg += sub.device->error_message();
- foreach (SubDevice &sub, denoising_devices)
- error_msg += sub.device->error_message();
-
- return error_msg;
- }
-
- virtual bool show_samples() const override
- {
- if (devices.size() > 1) {
- return false;
- }
- return devices.front().device->show_samples();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
- BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
- foreach (const SubDevice &sub_device, devices) {
- BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
- bvh_layout_mask &= device_bvh_layout_mask;
- bvh_layout_mask_all |= device_bvh_layout_mask;
- }
-
- /* With multiple OptiX devices, every device needs its own acceleration structure */
- if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
- return BVH_LAYOUT_MULTI_OPTIX;
- }
-
- /* When devices do not share a common BVH layout, fall back to creating one for each */
- const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
- if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
- return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
- }
-
- return bvh_layout_mask;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->load_kernels(requested_features))
- return false;
-
- use_denoising = requested_features.use_denoising;
- if (requested_features.use_denoising) {
- /* Only need denoising feature, everything else is unused. */
- DeviceRequestedFeatures denoising_features;
- denoising_features.use_denoising = true;
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->load_kernels(denoising_features))
- return false;
- }
-
- return true;
- }
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
-
- if (requested_features.use_denoising) {
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
- }
-
- return true;
- }
-
- DeviceKernelStatus get_active_kernel_switch_state() override
- {
- DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
-
- foreach (SubDevice &sub, devices) {
- DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
- switch (subresult) {
- case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
- case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
- return subresult;
-
- case DEVICE_KERNEL_USING_FEATURE_KERNEL:
- case DEVICE_KERNEL_UNKNOWN:
- break;
- }
- }
-
- return result;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- /* Try to build and share a single acceleration structure, if possible */
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
- devices.back().device->build_bvh(bvh, progress, refit);
- return;
- }
-
- assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
-
- BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
- bvh_multi->sub_bvhs.resize(devices.size());
-
- vector<BVHMulti *> geom_bvhs;
- geom_bvhs.reserve(bvh->geometry.size());
- foreach (Geometry *geom, bvh->geometry) {
- geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
- }
-
- /* Broadcast acceleration structure build to all render devices */
- size_t i = 0;
- foreach (SubDevice &sub, devices) {
- /* Change geometry BVH pointers to the sub BVH */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
- }
-
- if (!bvh_multi->sub_bvhs[i]) {
- BVHParams params = bvh->params;
- if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
- params.bvh_layout = BVH_LAYOUT_OPTIX;
- else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
- params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
- BVH_LAYOUT_EMBREE;
-
-        /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
-         * (since it is put into the top level directly, see bvh_embree.cpp) */
- if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
- !bvh->geometry[0]->is_instanced()) {
- i++;
- continue;
- }
-
- bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
- }
-
- sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
- i++;
- }
-
- /* Change geometry BVH pointers back to the multi BVH. */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k];
- }
- }
-
- virtual void *osl_memory() override
- {
- if (devices.size() > 1) {
- return NULL;
- }
- return devices.front().device->osl_memory();
- }
-
- bool is_resident(device_ptr key, Device *sub_device) override
- {
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- return find_matching_mem_device(key, sub)->device == sub_device;
- }
- }
- return false;
- }
-
- SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
- {
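-    /* Sub-devices in the same peer island can access each other's memory, so the
-     * allocation behind a key may live on any device in that island. */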
- assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
-
- /* Get the memory owner of this key (first try current device, then peer devices) */
- SubDevice *owner_sub = &sub;
- if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
- foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
- if (island_sub != owner_sub &&
- island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
- owner_sub = island_sub;
- }
- }
- }
- return owner_sub;
- }
-
- SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
- {
- assert(!island.empty());
-
-    /* Get the memory owner of this key, or, when the key is new, the device with the lowest memory usage */
- SubDevice *owner_sub = island.front();
- foreach (SubDevice *island_sub, island) {
- if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
- (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
- owner_sub = island_sub;
- }
- }
- return owner_sub;
- }
-
- inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
- {
- return find_matching_mem_device(key, sub)->ptr_map[key];
- }
-
- void mem_alloc(device_memory &mem) override
- {
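-    /* The multi-device hands out a virtual key as the device pointer; each sub-device
-     * stores its real pointer for this allocation under that key in its ptr_map. */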
- device_ptr key = unique_key++;
-
- if (mem.type == MEM_PIXELS) {
- /* Always allocate pixels memory on all devices
- * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
- mem.type == MEM_DEVICE_ONLY);
- /* The remaining memory types can be distributed across devices */
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- owner_sub->device->mem_alloc(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size);
- }
-
- void mem_copy_to(device_memory &mem) override
- {
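-    /* Reuse the existing virtual key if this memory was allocated before, otherwise
-     * generate a new one. */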
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* The tile buffers are allocated on each device (see below), so copy to all of them */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_copy_to(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
-
- if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
- /* Need to create texture objects and update pointer in kernel globals on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_copy_to(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
- {
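-    /* The rows are split evenly across the sub-devices, so copy back from each device
-     * only the slice of rows it is responsible for. */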
- device_ptr key = mem.device_pointer;
- int i = 0, sub_h = h / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
-
- SubDevice *owner_sub = find_matching_mem_device(key, sub);
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
-
- owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
- i++;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- }
-
- void mem_zero(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
-    /* This is a hack to only allocate the tile buffers on denoising devices.
-     * The tile buffers also need to be allocated separately on all devices, so that any
-     * overlap rendered for denoising does not interfere with the other devices. */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- vector<device_ptr> device_pointers;
- device_pointers.reserve(devices.size());
-
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
-
- device_pointers.push_back(mem.device_pointer);
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map[key] = device_pointers.front();
- device_pointers.erase(device_pointers.begin());
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_zero(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_free(device_memory &mem) override
- {
- device_ptr key = mem.device_pointer;
- size_t existing_size = mem.device_size;
-
- /* Free memory that was allocated for all devices (see above) on each device */
- if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map.erase(key);
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
- mem.device_size = existing_size;
-
- owner_sub->device->mem_free(mem);
- owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
-
- if (mem.type == MEM_TEXTURE) {
- /* Free texture objects on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_free(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = 0;
- mem.device_size = 0;
- stats.mem_free(existing_size);
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- foreach (SubDevice &sub, devices)
- sub.device->const_copy_to(name, host, size);
- }
-
- void draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override
- {
- assert(rgba.type == MEM_PIXELS);
-
- device_ptr key = rgba.device_pointer;
- int i = 0, sub_h = h / devices.size();
- int sub_height = height / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
- int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height;
- int sdy = dy + i * sub_height;
-      /* TODO: adjust the math for w/width and dx as well; currently only the height is split. */
-
- rgba.device_pointer = sub.ptr_map[key];
- sub.device->draw_pixels(
- rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params);
- i++;
- }
-
- rgba.device_pointer = key;
- }
-
- void map_tile(Device *sub_device, RenderTile &tile) override
- {
- if (!tile.buffer) {
- return;
- }
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- tile.buffer = find_matching_mem(tile.buffer, sub);
- return;
- }
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device) {
- tile.buffer = sub.ptr_map[tile.buffer];
- return;
- }
- }
- }
-
- int device_number(Device *sub_device) override
- {
- int i = 0;
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- return -1;
- }
-
- void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
-
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
- tile.buffer = mem.device_pointer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- /* Skip unnecessary copies in viewport mode (buffer covers the
- * whole image), but still need to fix up the tile device pointer. */
- map_tile(sub_device, tile);
- continue;
- }
-
-        /* If the tile was rendered on another device, copy its memory to
-         * the current device now, for the duration of the denoising task.
- * Note that this temporarily modifies the RenderBuffers and calls
- * the device, so this function is not thread safe. */
- if (mem.device != sub_device) {
- /* Only copy from device to host once. This is faster, but
- * also required for the case where a CPU thread is denoising
- * a tile rendered on the GPU. In that case we have to avoid
-           * overwriting the buffer being denoised by the CPU thread. */
- if (!tile.buffers->map_neighbor_copied) {
- tile.buffers->map_neighbor_copied = true;
- mem.copy_from_device();
- }
-
- if (mem.device == this) {
- /* Can re-use memory if tile is already allocated on the sub device. */
- map_tile(sub_device, tile);
- mem.swap_device(sub_device, mem.device_size, tile.buffer);
- }
- else {
- mem.swap_device(sub_device, 0, 0);
- }
-
- mem.copy_to_device();
-
- tile.buffer = mem.device_pointer;
- tile.device_size = mem.device_size;
-
- mem.restore_device();
- }
- }
- }
-
- void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- RenderTile &target_tile = neighbors.target;
- device_vector<float> &mem = target_tile.buffers->buffer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- return;
- }
-
- /* Copy denoised result back to the host. */
- mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer);
- mem.copy_from_device();
- mem.restore_device();
-
- /* Copy denoised result to the original device. */
- mem.copy_to_device();
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
-
- if (mem.device != sub_device && mem.device != this) {
- /* Free up memory again if it was allocated for the copy above. */
- mem.swap_device(sub_device, tile.device_size, tile.buffer);
- sub_device->mem_free(mem);
- mem.restore_device();
- }
- }
- }
-
- int get_split_task_count(DeviceTask &task) override
- {
- int total_tasks = 0;
- list<DeviceTask> tasks;
- task.split(tasks, devices.size());
- foreach (SubDevice &sub, devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- total_tasks += sub.device->get_split_task_count(subtask);
- }
- }
- return total_tasks;
- }
-
- void task_add(DeviceTask &task) override
- {
- list<SubDevice> task_devices = devices;
- if (!denoising_devices.empty()) {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Denoising tasks should be redirected to the denoising devices entirely. */
- task_devices = denoising_devices;
- }
- else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) {
- const uint tile_types = task.tile_types;
-        /* For normal rendering tasks, only redirect the denoising part to the denoising devices.
-         * There is no need to split the task here, since they all run through 'acquire_tile'. */
- task.tile_types = RenderTile::DENOISE;
- foreach (SubDevice &sub, denoising_devices) {
- sub.device->task_add(task);
- }
- /* Rendering itself should still be executed on the rendering devices. */
- task.tile_types = tile_types ^ RenderTile::DENOISE;
- }
- }
-
- list<DeviceTask> tasks;
- task.split(tasks, task_devices.size());
-
- foreach (SubDevice &sub, task_devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- if (task.buffer)
- subtask.buffer = find_matching_mem(task.buffer, sub);
- if (task.rgba_byte)
- subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
- if (task.rgba_half)
- subtask.rgba_half = sub.ptr_map[task.rgba_half];
- if (task.shader_input)
- subtask.shader_input = find_matching_mem(task.shader_input, sub);
- if (task.shader_output)
- subtask.shader_output = find_matching_mem(task.shader_output, sub);
-
- sub.device->task_add(subtask);
-
- if (task.buffers && task.buffers->buffer.device == this) {
- /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */
- sub.device->task_wait();
- }
- }
- }
- }
-
- void task_wait() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_wait();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_wait();
- }
-
- void task_cancel() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_cancel();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_cancel();
- }
-};
-
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new MultiDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
deleted file mode 100644
index 8904b517e92..00000000000
--- a/intern/cycles/device/device_network.cpp
+++ /dev/null
@@ -1,812 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_network.h"
-#include "device/device.h"
-#include "device/device_intern.h"
-
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-
-#if defined(WITH_NETWORK)
-
-CCL_NAMESPACE_BEGIN
-
-typedef map<device_ptr, device_ptr> PtrMap;
-typedef vector<uint8_t> DataVector;
-typedef map<device_ptr, DataVector> DataMap;
-
-/* tile list */
-typedef vector<RenderTile> TileList;
-
-/* search a list of tiles and find the one that matches the passed render tile */
-static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile)
-{
- for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it)
- if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample)
- return it;
- return tile_list.end();
-}
-
-class NetworkDevice : public Device {
- public:
- boost::asio::io_service io_service;
- tcp::socket socket;
- device_ptr mem_counter;
- DeviceTask the_task; /* todo: handle multiple tasks */
-
- thread_mutex rpc_lock;
-
- virtual bool show_samples() const
- {
- return false;
- }
-
- NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address)
- : Device(info, stats, profiler, true), socket(io_service)
- {
- error_func = NetworkError();
- stringstream portstr;
- portstr << SERVER_PORT;
-
- tcp::resolver resolver(io_service);
- tcp::resolver::query query(address, portstr.str());
- tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
- tcp::resolver::iterator end;
-
- boost::system::error_code error = boost::asio::error::host_not_found;
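-    /* Try each resolved endpoint until a connection succeeds. */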
- while (error && endpoint_iterator != end) {
- socket.close();
- socket.connect(*endpoint_iterator++, error);
- }
-
- if (error)
- error_func.network_error(error.message());
-
- mem_counter = 0;
- }
-
- ~NetworkDevice()
- {
- RPCSend snd(socket, &error_func, "stop");
- snd.write();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- void mem_alloc(device_memory &mem)
- {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- thread_scoped_lock lock(rpc_lock);
-
- mem.device_pointer = ++mem_counter;
-
- RPCSend snd(socket, &error_func, "mem_alloc");
- snd.add(mem);
- snd.write();
- }
-
- void mem_copy_to(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_copy_to");
-
- snd.add(mem);
- snd.write();
- snd.write_buffer(mem.host_pointer, mem.memory_size());
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
-
- snd.add(mem);
- snd.add(y);
- snd.add(w);
- snd.add(h);
- snd.add(elem);
- snd.write();
-
- RPCReceive rcv(socket, &error_func);
- rcv.read_buffer(mem.host_pointer, data_size);
- }
-
- void mem_zero(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_zero");
-
- snd.add(mem);
- snd.write();
- }
-
- void mem_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_free");
-
- snd.add(mem);
- snd.write();
-
- mem.device_pointer = 0;
- }
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "const_copy_to");
-
- string name_string(name);
-
- snd.add(name_string);
- snd.add(size);
- snd.write();
- snd.write_buffer(host, size);
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features)
- {
- if (error_func.have_error())
- return false;
-
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(requested_features.experimental);
- snd.add(requested_features.max_closure);
- snd.add(requested_features.max_nodes_group);
- snd.add(requested_features.nodes_features);
- snd.write();
-
- bool result;
- RPCReceive rcv(socket, &error_func);
- rcv.read(result);
-
- return result;
- }
-
- void task_add(DeviceTask &task)
- {
- thread_scoped_lock lock(rpc_lock);
-
- the_task = task;
-
- RPCSend snd(socket, &error_func, "task_add");
- snd.add(task);
- snd.write();
- }
-
- void task_wait()
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "task_wait");
- snd.write();
-
- lock.unlock();
-
- TileList the_tiles;
-
- /* todo: run this threaded for connecting to multiple clients */
- for (;;) {
- if (error_func.have_error())
- break;
-
- RenderTile tile;
-
- lock.lock();
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "acquire_tile") {
- lock.unlock();
-
- /* todo: watch out for recursive calls! */
- if (the_task.acquire_tile(this, tile)) { /* write return as bool */
- the_tiles.push_back(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
- else {
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile_none");
- snd.write();
- lock.unlock();
- }
- }
- else if (rcv.name == "release_tile") {
- rcv.read(tile);
- lock.unlock();
-
- TileList::iterator it = tile_list_find(the_tiles, tile);
- if (it != the_tiles.end()) {
- tile.buffers = it->buffers;
- the_tiles.erase(it);
- }
-
- assert(tile.buffers != NULL);
-
- the_task.release_tile(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "release_tile");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_wait_done") {
- lock.unlock();
- break;
- }
- else
- lock.unlock();
- }
- }
-
- void task_cancel()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "task_cancel");
- snd.write();
- }
-
- int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
-
- private:
- NetworkError error_func;
-};
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address)
-{
- return new NetworkDevice(info, stats, profiler, address);
-}
-
-void device_network_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_NETWORK;
- info.description = "Network Device";
- info.id = "NETWORK";
- info.num = 0;
-
- /* todo: get this info from device */
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.has_osl = false;
- info.denoisers = DENOISER_NONE;
-
- devices.push_back(info);
-}
-
-class DeviceServer {
- public:
- thread_mutex rpc_lock;
-
- void network_error(const string &message)
- {
- error_func.network_error(message);
- }
-
- bool have_error()
- {
- return error_func.have_error();
- }
-
- DeviceServer(Device *device_, tcp::socket &socket_)
- : device(device_), socket(socket_), stop(false), blocked_waiting(false)
- {
- error_func = NetworkError();
- }
-
- void listen()
- {
- /* receive remote function calls */
- for (;;) {
- listen_step();
-
- if (stop)
- break;
- }
- }
-
- protected:
- void listen_step()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "stop")
- stop = true;
- else
- process(rcv, lock);
- }
-
- /* create a memory buffer for a device buffer and insert it into mem_data */
- DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size)
- {
- /* create a new DataVector and insert it into mem_data */
- pair<DataMap::iterator, bool> data_ins = mem_data.insert(
- DataMap::value_type(client_pointer, DataVector()));
-
- /* make sure it was a unique insertion */
- assert(data_ins.second);
-
- /* get a reference to the inserted vector */
- DataVector &data_v = data_ins.first->second;
-
- /* size the vector */
- data_v.resize(data_size);
-
- return data_v;
- }
-
- DataVector &data_vector_find(device_ptr client_pointer)
- {
- DataMap::iterator i = mem_data.find(client_pointer);
- assert(i != mem_data.end());
- return i->second;
- }
-
- /* setup mapping and reverse mapping of client_pointer<->real_pointer */
- void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer)
- {
- pair<PtrMap::iterator, bool> mapins;
-
- /* insert mapping from client pointer to our real device pointer */
- mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer));
- assert(mapins.second);
-
-    /* insert reverse mapping from our real device pointer to the client pointer */
- mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer));
- assert(mapins.second);
- }
-
- device_ptr device_ptr_from_client_pointer(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
- return i->second;
- }
-
- device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
-
- device_ptr result = i->second;
-
- /* erase the mapping */
- ptr_map.erase(i);
-
- /* erase the reverse mapping */
- PtrMap::iterator irev = ptr_imap.find(result);
- assert(irev != ptr_imap.end());
- ptr_imap.erase(irev);
-
- /* erase the data vector */
- DataMap::iterator idata = mem_data.find(client_pointer);
- assert(idata != mem_data.end());
- mem_data.erase(idata);
-
- return result;
- }
-
-  /* Note that the lock must already be acquired upon entry.
-   * This is necessary because the caller often peeks at
-   * the header and delegates control here when it doesn't
-   * specifically handle the current RPC.
-   * The lock must be unlocked before returning. */
- void process(RPCReceive &rcv, thread_scoped_lock &lock)
- {
- if (rcv.name == "mem_alloc") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- /* Allocate host side data buffer. */
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
-
- /* Perform the allocation on the actual device. */
- device->mem_alloc(mem);
-
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- else if (rcv.name == "mem_copy_to") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Copy data from network into memory buffer. */
- rcv.read_buffer((uint8_t *)mem.host_pointer, data_size);
-
- /* Copy the data from the memory buffer to the device buffer. */
- device->mem_copy_to(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_copy_from") {
- string name;
- network_device_memory mem(device);
- int y, w, h, elem;
-
- rcv.read(mem, name);
- rcv.read(y);
- rcv.read(w);
- rcv.read(h);
- rcv.read(elem);
-
- device_ptr client_pointer = mem.device_pointer;
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
-
- DataVector &data_v = data_vector_find(client_pointer);
-
-      mem.host_pointer = (void *)&(data_v[0]);
-
- device->mem_copy_from(mem, y, w, h, elem);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
- snd.write();
- snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
- lock.unlock();
- }
- else if (rcv.name == "mem_zero") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
-        mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Zero memory. */
- device->mem_zero(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_free") {
- string name;
- network_device_memory mem(device);
-
- rcv.read(mem, name);
- lock.unlock();
-
- device_ptr client_pointer = mem.device_pointer;
-
- mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
-
- device->mem_free(mem);
- }
- else if (rcv.name == "const_copy_to") {
- string name_string;
- size_t size;
-
- rcv.read(name_string);
- rcv.read(size);
-
- vector<char> host_vector(size);
- rcv.read_buffer(&host_vector[0], size);
- lock.unlock();
-
- device->const_copy_to(name_string.c_str(), &host_vector[0], size);
- }
- else if (rcv.name == "load_kernels") {
- DeviceRequestedFeatures requested_features;
- rcv.read(requested_features.experimental);
- rcv.read(requested_features.max_closure);
- rcv.read(requested_features.max_nodes_group);
- rcv.read(requested_features.nodes_features);
-
- bool result;
- result = device->load_kernels(requested_features);
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(result);
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_add") {
- DeviceTask task;
-
- rcv.read(task);
- lock.unlock();
-
- if (task.buffer)
- task.buffer = device_ptr_from_client_pointer(task.buffer);
-
- if (task.rgba_half)
- task.rgba_half = device_ptr_from_client_pointer(task.rgba_half);
-
- if (task.rgba_byte)
- task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte);
-
- if (task.shader_input)
- task.shader_input = device_ptr_from_client_pointer(task.shader_input);
-
- if (task.shader_output)
- task.shader_output = device_ptr_from_client_pointer(task.shader_output);
-
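-      /* Hook up the task callbacks; tile acquire/release requests are forwarded back to
-       * the client over the RPC connection. */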
- task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
- task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
- task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample,
- this);
- task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
- task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
-
- device->task_add(task);
- }
- else if (rcv.name == "task_wait") {
- lock.unlock();
-
- blocked_waiting = true;
- device->task_wait();
- blocked_waiting = false;
-
- lock.lock();
- RPCSend snd(socket, &error_func, "task_wait_done");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_cancel") {
- lock.unlock();
- device->task_cancel();
- }
- else if (rcv.name == "acquire_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- rcv.read(entry.tile);
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "acquire_tile_none") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "release_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else {
- cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n";
- lock.unlock();
- }
- }
-
- bool task_acquire_tile(Device *, RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- bool result = false;
-
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.write();
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "acquire_tile") {
- tile = entry.tile;
-
- if (tile.buffer)
- tile.buffer = ptr_map[tile.buffer];
-
- result = true;
- break;
- }
- else if (entry.name == "acquire_tile_none") {
- break;
- }
- else {
- cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop && !have_error());
-
- return result;
- }
-
- void task_update_progress_sample()
- {
- ; /* skip */
- }
-
- void task_update_tile_sample(RenderTile &)
- {
- ; /* skip */
- }
-
- void task_release_tile(RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- if (tile.buffer)
- tile.buffer = ptr_imap[tile.buffer];
-
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "release_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "release_tile") {
- lock.unlock();
- break;
- }
- else {
- cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop);
- }
-
- bool task_get_cancel()
- {
- return false;
- }
-
- /* properties */
- Device *device;
- tcp::socket &socket;
-
- /* mapping of remote to local pointer */
- PtrMap ptr_map;
- PtrMap ptr_imap;
- DataMap mem_data;
-
- struct AcquireEntry {
- string name;
- RenderTile tile;
- };
-
- thread_mutex acquire_mutex;
- list<AcquireEntry> acquire_queue;
-
- bool stop;
- bool blocked_waiting;
-
- private:
- NetworkError error_func;
-
- /* todo: free memory and device (osl) on network error */
-};
-
-void Device::server_run()
-{
- try {
- /* starts thread that responds to discovery requests */
- ServerDiscovery discovery;
-
- for (;;) {
- /* accept connection */
- boost::asio::io_service io_service;
- tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT));
-
- tcp::socket socket(io_service);
- acceptor.accept(socket);
-
- string remote_address = socket.remote_endpoint().address().to_string();
- printf("Connected to remote client at: %s\n", remote_address.c_str());
-
- DeviceServer server(this, socket);
- server.listen();
-
- printf("Disconnected.\n");
- }
- }
- catch (exception &e) {
- fprintf(stderr, "Network server exception: %s\n", e.what());
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
deleted file mode 100644
index b3a0f6daa57..00000000000
--- a/intern/cycles/device/device_network.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_NETWORK_H__
-#define __DEVICE_NETWORK_H__
-
-#ifdef WITH_NETWORK
-
-# include <boost/archive/binary_iarchive.hpp>
-# include <boost/archive/binary_oarchive.hpp>
-# include <boost/archive/text_iarchive.hpp>
-# include <boost/archive/text_oarchive.hpp>
-# include <boost/array.hpp>
-# include <boost/asio.hpp>
-# include <boost/bind.hpp>
-# include <boost/serialization/vector.hpp>
-# include <boost/thread.hpp>
-
-# include <deque>
-# include <iostream>
-# include <sstream>
-
-# include "render/buffers.h"
-
-# include "util/util_foreach.h"
-# include "util/util_list.h"
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-using std::cerr;
-using std::cout;
-using std::exception;
-using std::hex;
-using std::setw;
-
-using boost::asio::ip::tcp;
-
-static const int SERVER_PORT = 5120;
-static const int DISCOVER_PORT = 5121;
-static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP";
-static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP";
-
-# if 0
-typedef boost::archive::text_oarchive o_archive;
-typedef boost::archive::text_iarchive i_archive;
-# else
-typedef boost::archive::binary_oarchive o_archive;
-typedef boost::archive::binary_iarchive i_archive;
-# endif
-
-/* Serialization of device memory */
-
-class network_device_memory : public device_memory {
- public:
- network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY)
- {
- }
-
- ~network_device_memory()
- {
- device_pointer = 0;
- };
-
- vector<char> local_data;
-};
-
-/* Common network error function / object for both DeviceNetwork and DeviceServer. */
-class NetworkError {
- public:
- NetworkError()
- {
- error = "";
- error_count = 0;
- }
-
- ~NetworkError()
- {
- }
-
- void network_error(const string &message)
- {
- error = message;
- error_count += 1;
- }
-
- bool have_error()
- {
-    return error_count > 0;
- }
-
- private:
- string error;
- int error_count;
-};
-
-/* Remote procedure call Send */
-
-class RPCSend {
- public:
- RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "")
- : name(name_), socket(socket_), archive(archive_stream), sent(false)
- {
- archive &name_;
- error_func = e;
- fprintf(stderr, "rpc send %s\n", name.c_str());
- }
-
- ~RPCSend()
- {
- }
-
- void add(const device_memory &mem)
- {
- archive &mem.data_type &mem.data_elements &mem.data_size;
- archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- archive &mem.type &string(mem.name);
- archive &mem.interpolation &mem.extension;
- archive &mem.device_pointer;
- }
-
- template<typename T> void add(const T &data)
- {
- archive &data;
- }
-
- void add(const DeviceTask &task)
- {
- int type = (int)task.type;
- archive &type &task.x &task.y &task.w &task.h;
- archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- archive &task.offset &task.stride;
- archive &task.shader_input &task.shader_output &task.shader_eval_type;
- archive &task.shader_x &task.shader_w;
- archive &task.need_finish_queue;
- }
-
- void add(const RenderTile &tile)
- {
- archive &tile.x &tile.y &tile.w &tile.h;
- archive &tile.start_sample &tile.num_samples &tile.sample;
- archive &tile.resolution &tile.offset &tile.stride;
- archive &tile.buffer;
- }
-
- void write()
- {
- boost::system::error_code error;
-
- /* get string from stream */
- string archive_str = archive_stream.str();
-
- /* first send fixed size header with size of following data */
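-    /* The header is the payload size formatted as an 8-character hexadecimal field,
-     * matching the fixed-size read on the receiving end. */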
- ostringstream header_stream;
- header_stream << setw(8) << hex << archive_str.size();
- string header_str = header_stream.str();
-
- boost::asio::write(
- socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- /* then send actual data */
- boost::asio::write(
- socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- sent = true;
- }
-
- void write_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
-
- boost::asio::write(
- socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
- }
-
- protected:
- string name;
- tcp::socket &socket;
- ostringstream archive_stream;
- o_archive archive;
- bool sent;
- NetworkError *error_func;
-};
-
-/* Remote procedure call Receive */
-
-class RPCReceive {
- public:
- RPCReceive(tcp::socket &socket_, NetworkError *e)
- : socket(socket_), archive_stream(NULL), archive(NULL)
- {
- error_func = e;
-    /* read header with fixed size */
- vector<char> header(8);
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- /* verify if we got something */
- if (len == header.size()) {
- /* decode header */
- string header_str(&header[0], header.size());
- istringstream header_stream(header_str);
-
- size_t data_size;
-
- if ((header_stream >> hex >> data_size)) {
-
- vector<char> data(data_size);
- size_t len = boost::asio::read(socket, boost::asio::buffer(data), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- if (len == data_size) {
- archive_str = (data.size()) ? string(&data[0], data.size()) : string("");
-
- archive_stream = new istringstream(archive_str);
- archive = new i_archive(*archive_stream);
-
- *archive &name;
- fprintf(stderr, "rpc receive %s\n", name.c_str());
- }
- else {
- error_func->network_error("Network receive error: data size doesn't match header");
- }
- }
- else {
- error_func->network_error("Network receive error: can't decode data size from header");
- }
- }
- else {
- error_func->network_error("Network receive error: invalid header size");
- }
- }
-
- ~RPCReceive()
- {
- delete archive;
- delete archive_stream;
- }
-
- void read(network_device_memory &mem, string &name)
- {
- *archive &mem.data_type &mem.data_elements &mem.data_size;
- *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- *archive &mem.type &name;
- *archive &mem.interpolation &mem.extension;
- *archive &mem.device_pointer;
-
- mem.name = name.c_str();
- mem.host_pointer = 0;
-
- /* Can't transfer OpenGL texture over network. */
- if (mem.type == MEM_PIXELS) {
- mem.type = MEM_READ_WRITE;
- }
- }
-
- template<typename T> void read(T &data)
- {
- *archive &data;
- }
-
- void read_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- if (len != size)
- cout << "Network receive error: buffer size doesn't match expected size\n";
- }
-
- void read(DeviceTask &task)
- {
- int type;
-
- *archive &type &task.x &task.y &task.w &task.h;
- *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- *archive &task.offset &task.stride;
- *archive &task.shader_input &task.shader_output &task.shader_eval_type;
- *archive &task.shader_x &task.shader_w;
- *archive &task.need_finish_queue;
-
- task.type = (DeviceTask::Type)type;
- }
-
- void read(RenderTile &tile)
- {
- *archive &tile.x &tile.y &tile.w &tile.h;
- *archive &tile.start_sample &tile.num_samples &tile.sample;
- *archive &tile.resolution &tile.offset &tile.stride;
- *archive &tile.buffer;
-
- tile.buffers = NULL;
- }
-
- string name;
-
- protected:
- tcp::socket &socket;
- string archive_str;
- istringstream *archive_stream;
- i_archive *archive;
- NetworkError *error_func;
-};
-
-/* Server auto discovery */
-
-class ServerDiscovery {
- public:
- explicit ServerDiscovery(bool discover = false)
- : listen_socket(io_service), collect_servers(false)
- {
- /* setup listen socket */
- listen_endpoint.address(boost::asio::ip::address_v4::any());
- listen_endpoint.port(DISCOVER_PORT);
-
- listen_socket.open(listen_endpoint.protocol());
-
- boost::asio::socket_base::reuse_address option(true);
- listen_socket.set_option(option);
-
- listen_socket.bind(listen_endpoint);
-
- /* setup receive callback */
- async_receive();
-
- /* start server discovery */
- if (discover) {
- collect_servers = true;
- servers.clear();
-
- broadcast_message(DISCOVER_REQUEST_MSG);
- }
-
- /* start thread */
- work = new boost::asio::io_service::work(io_service);
- thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service));
- }
-
- ~ServerDiscovery()
- {
- io_service.stop();
- thread->join();
- delete thread;
- delete work;
- }
-
- vector<string> get_server_list()
- {
- vector<string> result;
-
- mutex.lock();
- result = vector<string>(servers.begin(), servers.end());
- mutex.unlock();
-
- return result;
- }
-
- private:
- void handle_receive_from(const boost::system::error_code &error, size_t size)
- {
- if (error) {
- cout << "Server discovery receive error: " << error.message() << "\n";
- return;
- }
-
- if (size > 0) {
- string msg = string(receive_buffer, size);
-
- /* handle incoming message */
- if (collect_servers) {
- if (msg == DISCOVER_REPLY_MSG) {
- string address = receive_endpoint.address().to_string();
-
- mutex.lock();
-
- /* add address if it's not already in the list */
- bool found = std::find(servers.begin(), servers.end(), address) != servers.end();
-
- if (!found)
- servers.push_back(address);
-
- mutex.unlock();
- }
- }
- else {
- /* reply to request */
- if (msg == DISCOVER_REQUEST_MSG)
- broadcast_message(DISCOVER_REPLY_MSG);
- }
- }
-
- async_receive();
- }
-
- void async_receive()
- {
- listen_socket.async_receive_from(boost::asio::buffer(receive_buffer),
- receive_endpoint,
- boost::bind(&ServerDiscovery::handle_receive_from,
- this,
- boost::asio::placeholders::error,
- boost::asio::placeholders::bytes_transferred));
- }
-
- void broadcast_message(const string &msg)
- {
- /* setup broadcast socket */
- boost::asio::ip::udp::socket socket(io_service);
-
- socket.open(boost::asio::ip::udp::v4());
-
- boost::asio::socket_base::broadcast option(true);
- socket.set_option(option);
-
- boost::asio::ip::udp::endpoint broadcast_endpoint(
- boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT);
-
- /* broadcast message */
- socket.send_to(boost::asio::buffer(msg), broadcast_endpoint);
- }
-
- /* network service and socket */
- boost::asio::io_service io_service;
- boost::asio::ip::udp::endpoint listen_endpoint;
- boost::asio::ip::udp::socket listen_socket;
-
- /* threading */
- boost::thread *thread;
- boost::asio::io_service::work *work;
- boost::mutex mutex;
-
- /* buffer and endpoint for receiving messages */
- char receive_buffer[256];
- boost::asio::ip::udp::endpoint receive_endpoint;
-
-  /* Fields: OS, Cycles version, device count, status, host name, group name, IP address. */
- struct ServerInfo {
- string cycles_version;
- string os;
- int device_count;
- string status;
- string host_name;
- string group_name;
- string host_addr;
- };
-
- /* collection of server addresses in list */
- bool collect_servers;
- vector<string> servers;
-};
-
-CCL_NAMESPACE_END
-
-#endif
-
-#endif /* __DEVICE_NETWORK_H__ */
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
deleted file mode 100644
index 9abb7cfb7fe..00000000000
--- a/intern/cycles/device/device_opencl.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-# include "device/device.h"
-# include "device/device_intern.h"
-
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_set.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return opencl_create_split_device(info, stats, profiler, background);
-}
-
-bool device_opencl_init()
-{
- static bool initialized = false;
- static bool result = false;
-
- if (initialized)
- return result;
-
- initialized = true;
-
- if (OpenCLInfo::device_type() != 0) {
- int clew_result = clewInit();
- if (clew_result == CLEW_SUCCESS) {
- VLOG(1) << "CLEW initialization succeeded.";
- result = true;
- }
- else {
- VLOG(1) << "CLEW initialization failed: "
- << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
- "Error opening the library");
- }
- }
- else {
- VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
- result = false;
- }
-
- return result;
-}
-
-static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
-{
-# ifdef _WIN32
- __try {
- return clGetPlatformIDs(0, NULL, num_platforms);
- }
- __except (EXCEPTION_EXECUTE_HANDLER) {
- /* Ignore crashes inside the OpenCL driver and hope we can
- * survive even with corrupted OpenCL installs. */
- fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
- }
-
- *num_platforms = 0;
- return CL_DEVICE_NOT_FOUND;
-# else
- return clGetPlatformIDs(0, NULL, num_platforms);
-# endif
-}
-
-void device_opencl_info(vector<DeviceInfo> &devices)
-{
- cl_uint num_platforms = 0;
- device_opencl_get_num_platforms_safe(&num_platforms);
- if (num_platforms == 0) {
- return;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- /* Devices are numbered consecutively across platforms. */
- int num_devices = 0;
- set<string> unique_ids;
- foreach (OpenCLPlatformDevice &platform_device, usable_devices) {
- /* Compute unique ID for persistent user preferences. */
- const string &platform_name = platform_device.platform_name;
- const string &device_name = platform_device.device_name;
- string hardware_id = platform_device.hardware_id;
- if (hardware_id == "") {
- hardware_id = string_printf("ID_%d", num_devices);
- }
- string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
-
- /* Hardware ID might not be unique, add device number in that case. */
- if (unique_ids.find(id) != unique_ids.end()) {
- id += string_printf("_ID_%d", num_devices);
- }
- unique_ids.insert(id);
-
- /* Create DeviceInfo. */
- DeviceInfo info;
- info.type = DEVICE_OPENCL;
- info.description = string_remove_trademark(string(device_name));
- info.num = num_devices;
- /* We don't know if it's used for display, but assume it is. */
- info.display_device = true;
- info.use_split_kernel = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
- info.id = id;
-
- /* Check OpenCL extensions */
- info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos;
-
- /* Disabled for now due to apparent AMD driver bug. */
- info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing";
-
- devices.push_back(info);
- num_devices++;
- }
-}
-
-string device_opencl_capabilities()
-{
- if (OpenCLInfo::device_type() == 0) {
- return "All OpenCL devices are forced to be OFF";
- }
- string result = "";
- string error_msg = ""; /* Only used by opencl_assert(), but in the future
- * it could also be nicely reported to the console.
- */
- cl_uint num_platforms = 0;
- opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
- if (num_platforms == 0) {
- return "No OpenCL platforms found\n";
- }
- result += string_printf("Number of platforms: %u\n", num_platforms);
-
- vector<cl_platform_id> platform_ids;
- platform_ids.resize(num_platforms);
- opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
-
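-  /* Helper macros that query a single platform/device property and append it to the
-   * capabilities report, handling both fixed-size and string values. */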
-# define APPEND_INFO(func, id, name, what, type) \
- do { \
- type data; \
- memset(&data, 0, sizeof(data)); \
- opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
- result += string_printf("%s: %s\n", name, to_string(data).c_str()); \
- } while (false)
-# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \
- do { \
- string value; \
- size_t length = 0; \
- if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \
- vector<char> buffer(length + 1); \
- if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \
- value = string(buffer.data()); \
- } \
- } \
- if (is_optional && !(length != 0 && value[0] != '\0')) { \
- break; \
- } \
- result += string_printf("%s: %s\n", name, value.c_str()); \
- } while (false)
-# define APPEND_PLATFORM_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false)
-# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
-  APPEND_STRING_INFO_IMPL(func, id, "\tPlatform " name, what, true)
-# define APPEND_PLATFORM_INFO(id, name, what, type) \
- APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type)
-# define APPEND_DEVICE_INFO(id, name, what, type) \
- APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type)
-# define APPEND_DEVICE_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false)
-# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true)
-
- vector<cl_device_id> device_ids;
- for (cl_uint platform = 0; platform < num_platforms; ++platform) {
- cl_platform_id platform_id = platform_ids[platform];
-
- result += string_printf("Platform #%u\n", platform);
-
- APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS);
-
- cl_uint num_devices = 0;
- opencl_assert(
- clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices));
- result += string_printf("\tNumber of devices: %u\n", num_devices);
-
- device_ids.resize(num_devices);
- opencl_assert(clGetDeviceIDs(
- platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL));
- for (cl_uint device = 0; device < num_devices; ++device) {
- cl_device_id device_id = device_ids[device];
-
- result += string_printf("\t\tDevice: #%u\n", device);
-
- APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
- APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
- APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
- APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
- APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS);
- APPEND_DEVICE_INFO(
- device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
- }
- }
-
-# undef APPEND_INFO
-# undef APPEND_STRING_INFO_IMPL
-# undef APPEND_PLATFORM_STRING_INFO
-# undef APPEND_STRING_EXTENSION_INFO
-# undef APPEND_PLATFORM_INFO
-# undef APPEND_DEVICE_INFO
-# undef APPEND_DEVICE_STRING_INFO
-# undef APPEND_DEVICE_STRING_EXTENSION_INFO
-
- return result;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
deleted file mode 100644
index 6f9a7943722..00000000000
--- a/intern/cycles/device/device_optix.cpp
+++ /dev/null
@@ -1,1936 +0,0 @@
-/*
- * Copyright 2019, NVIDIA Corporation.
- * Copyright 2019, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPTIX
-
-# include "bvh/bvh.h"
-# include "bvh/bvh_optix.h"
-# include "device/cuda/device_cuda.h"
-# include "device/device_denoising.h"
-# include "device/device_intern.h"
-# include "render/buffers.h"
-# include "render/hair.h"
-# include "render/mesh.h"
-# include "render/object.h"
-# include "render/scene.h"
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_progress.h"
-# include "util/util_time.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include <cuew.h>
-// Do not use CUDA SDK headers when using CUEW
-# define OPTIX_DONT_INCLUDE_CUDA
-# endif
-# include <optix_function_table_definition.h>
-# include <optix_stubs.h>
-
-// TODO(pmours): Disable this once drivers have native support
-# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
-
-CCL_NAMESPACE_BEGIN
-
-/* Make sure this stays in sync with kernel_globals.h */
-struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-};
-struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
-};
-
-# define check_result_cuda(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_cuda_ret(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define check_result_optix(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_optix_ret(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define launch_filter_kernel(func_name, w, h, args) \
- { \
- CUfunction func; \
- check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
- check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
- int threads; \
- check_result_cuda_ret( \
- cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- threads = (int)sqrt((float)threads); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads; \
- check_result_cuda_ret( \
- cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
- } \
- (void)0
-
-class OptiXDevice : public CUDADevice {
-
- // List of OptiX program groups
- enum {
- PG_RGEN,
- PG_MISS,
- PG_HITD, // Default hit group
- PG_HITS, // __SHADOW_RECORD_ALL__ hit group
- PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
-# if OPTIX_ABI_VERSION >= 36
- PG_HITD_MOTION,
- PG_HITS_MOTION,
-# endif
- PG_BAKE, // kernel_bake_evaluate
- PG_DISP, // kernel_displace_evaluate
- PG_BACK, // kernel_background_evaluate
- PG_CALL,
- NUM_PROGRAM_GROUPS = PG_CALL + 3
- };
-
- // List of OptiX pipelines
- enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
-
- // A single shader binding table entry
- struct SbtRecord {
- char header[OPTIX_SBT_RECORD_HEADER_SIZE];
- };
-
- // Information stored about CUDA memory allocations
- struct CUDAMem {
- bool free_map_host = false;
- CUarray array = NULL;
- CUtexObject texobject = 0;
- bool use_mapped_host = false;
- };
-
- // Helper class to manage current CUDA context
- struct CUDAContextScope {
- CUDAContextScope(CUcontext ctx)
- {
- cuCtxPushCurrent(ctx);
- }
- ~CUDAContextScope()
- {
- cuCtxPopCurrent(NULL);
- }
- };
-
- // Use a pool with multiple threads to support launches with multiple CUDA streams
- TaskPool task_pool;
-
- vector<CUstream> cuda_stream;
- OptixDeviceContext context = NULL;
-
- OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
- OptixModule builtin_modules[2] = {};
- OptixPipeline pipelines[NUM_PIPELINES] = {};
-
- bool motion_blur = false;
- device_vector<SbtRecord> sbt_data;
- device_only_memory<KernelParams> launch_params;
- OptixTraversableHandle tlas_handle = 0;
-
- OptixDenoiser denoiser = NULL;
- device_only_memory<unsigned char> denoiser_state;
- int denoiser_input_passes = 0;
-
- vector<device_only_memory<char>> delayed_free_bvh_memory;
- thread_mutex delayed_free_bvh_mutex;
-
- public:
- OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : CUDADevice(info_, stats_, profiler_, background_),
- sbt_data(this, "__sbt", MEM_READ_ONLY),
- launch_params(this, "__params", false),
- denoiser_state(this, "__denoiser_state", true)
- {
- // Store number of CUDA streams in device info
- info.cpu_threads = DebugFlags().optix.cuda_streams;
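-    // (the cpu_threads field is reused here to pass along the number of parallel CUDA streams)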
-
- // Make the CUDA context current
- if (!cuContext) {
- return; // Do not initialize if CUDA context creation failed already
- }
- const CUDAContextScope scope(cuContext);
-
- // Create OptiX context for this device
- OptixDeviceContextOptions options = {};
-# ifdef WITH_CYCLES_LOGGING
- options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
- options.logCallbackFunction =
- [](unsigned int level, const char *, const char *message, void *) {
- switch (level) {
- case 1:
- LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
- break;
- case 2:
- LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
- break;
- case 3:
- LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
- break;
- case 4:
- LOG_IF(INFO, VLOG_IS_ON(1)) << message;
- break;
- }
- };
-# endif
- check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
-# ifdef WITH_CYCLES_LOGGING
- check_result_optix(optixDeviceContextSetLogCallback(
- context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
-# endif
-
- // Create launch streams
- cuda_stream.resize(info.cpu_threads);
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
-
- // Fix weird compiler bug that assigns wrong size
- launch_params.data_elements = sizeof(KernelParams);
- // Allocate launch parameter buffer memory on device
- launch_params.alloc_to_device(info.cpu_threads);
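-    // One KernelParams instance per stream, so concurrent launches use separate parameter data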
- }
- ~OptiXDevice()
- {
- // Stop processing any more tasks
- task_pool.cancel();
-
- // Make CUDA context current
- const CUDAContextScope scope(cuContext);
-
- free_bvh_memory_delayed();
-
- sbt_data.free();
- texture_info.free();
- launch_params.free();
- denoiser_state.free();
-
- // Unload modules
- if (optix_module != NULL)
- optixModuleDestroy(optix_module);
- for (unsigned int i = 0; i < 2; ++i)
- if (builtin_modules[i] != NULL)
- optixModuleDestroy(builtin_modules[i]);
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
- if (pipelines[i] != NULL)
- optixPipelineDestroy(pipelines[i]);
-
- // Destroy launch streams
- for (CUstream stream : cuda_stream)
- cuStreamDestroy(stream);
-
- if (denoiser != NULL)
- optixDenoiserDestroy(denoiser);
-
- optixDeviceContextDestroy(context);
- }
-
- private:
- bool show_samples() const override
- {
- // Only show samples if not rendering multiple tiles in parallel
- return info.cpu_threads == 1;
- }
-
- BVHLayoutMask get_bvh_layout_mask() const override
- {
-    // CUDA kernels are used for baking, so a BVH that those kernels can understand is needed too
- if (optix_module == NULL)
- return CUDADevice::get_bvh_layout_mask();
-
- // OptiX has its own internal acceleration structure format
- return BVH_LAYOUT_OPTIX;
- }
-
- string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
- bool filter,
- bool /*split*/) override
- {
- // Split kernel is not supported in OptiX
- string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
- requested_features, filter, false);
-
- // Add OptiX SDK include directory to include paths
- const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
- if (optix_sdk_path) {
- common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
- }
-
- // Specialization for shader raytracing
- if (requested_features.use_shader_raytrace) {
- common_cflags += " --keep-device-functions";
- }
- else {
- common_cflags += " -D __NO_SHADER_RAYTRACE__";
- }
-
- return common_cflags;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- if (have_error()) {
- // Abort early if context creation failed already
- return false;
- }
-
- // Load CUDA modules because we need some of the utility kernels
- if (!CUDADevice::load_kernels(requested_features)) {
- return false;
- }
-
- // Baking is currently performed using CUDA, so no need to load OptiX kernels
- if (requested_features.use_baking) {
- return true;
- }
-
- const CUDAContextScope scope(cuContext);
-
- // Unload existing OptiX module and pipelines first
- if (optix_module != NULL) {
- optixModuleDestroy(optix_module);
- optix_module = NULL;
- }
- for (unsigned int i = 0; i < 2; ++i) {
- if (builtin_modules[i] != NULL) {
- optixModuleDestroy(builtin_modules[i]);
- builtin_modules[i] = NULL;
- }
- }
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
- if (pipelines[i] != NULL) {
- optixPipelineDestroy(pipelines[i]);
- pipelines[i] = NULL;
- }
- }
-
- OptixModuleCompileOptions module_options = {};
- module_options.maxRegisterCount = 0; // Do not set an explicit register limit
- module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
- module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-
-# if OPTIX_ABI_VERSION >= 41
- module_options.boundValues = nullptr;
- module_options.numBoundValues = 0;
-# endif
-
- OptixPipelineCompileOptions pipeline_options = {};
- // Default to no motion blur and two-level graph, since it is the fastest option
- pipeline_options.usesMotionBlur = false;
- pipeline_options.traversableGraphFlags =
- OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
- pipeline_options.numPayloadValues = 6;
- pipeline_options.numAttributeValues = 2; // u, v
- pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
- pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
-
-# if OPTIX_ABI_VERSION >= 36
- pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
- if (requested_features.use_hair) {
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
- }
- else {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
- }
- }
-# endif
-
-    // Keep track of whether motion blur is enabled, so motion can be enabled/disabled in BVH builds.
-    // This is necessary since objects may be reported to have motion if the Vector pass is
-    // active, but may still need to be rendered without motion blur if motion blur itself is off.
- motion_blur = requested_features.use_object_motion;
-
- if (motion_blur) {
- pipeline_options.usesMotionBlur = true;
- // Motion blur can insert motion transforms into the traversal graph
- // It is no longer a two-level graph then, so need to set flags to allow any configuration
- pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
- }
-
- { // Load and compile PTX module with OptiX kernels
- string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
- "lib/kernel_optix_shader_raytrace.ptx" :
- "lib/kernel_optix.ptx");
- if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
- if (!getenv("OPTIX_ROOT_DIR")) {
- set_error(
- "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
- "the Optix SDK to be able to compile Optix kernels on demand).");
- return false;
- }
- ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
- }
- if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
- set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
- return false;
- }
-
- check_result_optix_ret(optixModuleCreateFromPTX(context,
- &module_options,
- &pipeline_options,
- ptx_data.data(),
- ptx_data.size(),
- nullptr,
- 0,
- &optix_module));
- }
-
- // Create program groups
- OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupOptions group_options = {}; // There are no options currently
- group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_RGEN].raygen.module = optix_module;
- // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
- group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
- group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
- group_descs[PG_MISS].miss.module = optix_module;
- group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
- group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
- group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
- group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
-
- if (requested_features.use_hair) {
- group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
- group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
-
- // Add curve intersection programs
- if (requested_features.use_hair_thick) {
-        // Slower intersection programs are needed for thick hair, which unfortunately also
-        // slows down ribbons. Ideally this should not be needed.
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- }
- else {
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- }
-
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- OptixBuiltinISOptions builtin_options = {};
- builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- builtin_options.usesMotionBlur = false;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
-
- group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
- group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
-
- if (motion_blur) {
- builtin_options.usesMotionBlur = true;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
-
- group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
- group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
- group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
- group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
- }
- }
-# endif
- }
-
- if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
- // Add hit group for local intersections
- group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
- }
-
- if (requested_features.use_baking) {
- group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BAKE].raygen.module = optix_module;
- group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
- }
-
- if (requested_features.use_true_displacement) {
- group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_DISP].raygen.module = optix_module;
- group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
- }
-
- if (requested_features.use_background_light) {
- group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BACK].raygen.module = optix_module;
- group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
- }
-
- // Shader raytracing replaces some functions with direct callables
- if (requested_features.use_shader_raytrace) {
- group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
- group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
- "__direct_callable__kernel_volume_shadow";
- group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
- "__direct_callable__subsurface_scatter_multi_setup";
- }
-
- check_result_optix_ret(optixProgramGroupCreate(
- context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
-
- // Get program stack sizes
- OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
- // Set up SBT, which in this case is used only to select between different programs
- sbt_data.alloc(NUM_PROGRAM_GROUPS);
- memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
- check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
- }
- sbt_data.copy_to_device(); // Upload SBT to device
-
- // Calculate maximum trace continuation stack size
- unsigned int trace_css = stack_size[PG_HITD].cssCH;
- // This is based on the maximum of closest-hit and any-hit/intersection programs
- trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
-# if OPTIX_ABI_VERSION >= 36
- trace_css = std::max(trace_css,
- stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
- trace_css = std::max(trace_css,
- stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
-# endif
-
- OptixPipelineLinkOptions link_options = {};
- link_options.maxTraceDepth = 1;
- link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-# if OPTIX_ABI_VERSION < 24
- link_options.overrideUsesMotionBlur = motion_blur;
-# endif
-
- { // Create path tracing pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_PATH_TRACE]));
-
- // Combine ray generation and trace continuation stack size
- const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
- // Max direct callable depth is one of the following, so combine accordingly
- // - __raygen__ -> svm_eval_nodes
- // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
- // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- // Set stack size depending on pipeline options
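-      // (the last optixPipelineSetStackSize argument is the maximum traversable graph depth:
-      // instance -> motion transform -> geometry with motion blur, instance -> geometry otherwise)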
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Only need to create shader evaluation pipeline if one of these features is used:
- const bool use_shader_eval_pipeline = requested_features.use_baking ||
- requested_features.use_background_light ||
- requested_features.use_true_displacement;
-
- if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_BAKE]);
- pipeline_groups.push_back(groups[PG_DISP]);
- pipeline_groups.push_back(groups[PG_BACK]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_SHADER_EVAL]));
-
- // Calculate continuation stack size based on the maximum of all ray generation stack sizes
- const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
- std::max(stack_size[PG_DISP].cssRG,
- stack_size[PG_BACK].cssRG)) +
- link_options.maxTraceDepth * trace_css;
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Clean up program group objects
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- optixProgramGroupDestroy(groups[i]);
- }
-
- return true;
- }
-
- void thread_run(DeviceTask &task, int thread_index) // Main task entry point
- {
- if (have_error())
- return; // Abort early if there was an error previously
-
- if (task.type == DeviceTask::RENDER) {
- if (thread_index != 0) {
- // Only execute denoising in a single thread (see also 'task_add')
- task.tile_types &= ~RenderTile::DENOISE;
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE)
- launch_render(task, tile, thread_index);
- else if (tile.task == RenderTile::BAKE) {
- // Perform baking using CUDA, since it is not currently implemented in OptiX
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
- CUDADevice::render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE)
- launch_denoise(task, tile);
- task.release_tile(tile);
- if (task.get_cancel() && !task.need_finish_queue)
- break; // User requested cancellation
- else if (have_error())
- break; // Abort rendering when encountering an error
- }
- }
- else if (task.type == DeviceTask::SHADER) {
- // CUDA kernels are used when doing baking
- if (optix_module == NULL)
- CUDADevice::shader(task);
- else
- launch_shader_eval(task, thread_index);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Set up a single tile that covers the whole task and denoise it
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- launch_denoise(task, tile);
- }
- }
-
- void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
- {
- assert(thread_index < launch_params.data_size);
-
- // Keep track of total render time of this tile
- const scoped_timer timer(&rtile.buffers->render_time);
-
- WorkTile wtile;
- wtile.x = rtile.x;
- wtile.y = rtile.y;
- wtile.w = rtile.w;
- wtile.h = rtile.h;
- wtile.offset = rtile.offset;
- wtile.stride = rtile.stride;
- wtile.buffer = (float *)rtile.buffer;
-
- const int end_sample = rtile.start_sample + rtile.num_samples;
-    // Keep this number reasonable to avoid running into TDRs (driver timeout detection and recovery)
- int step_samples = (info.display_device ? 8 : 32);
-
- // Offset into launch params buffer so that streams use separate data
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- const CUDAContextScope scope(cuContext);
-
- for (int sample = rtile.start_sample; sample < end_sample;) {
- // Copy work tile information to device
- wtile.start_sample = sample;
- wtile.num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile.num_samples = min(wtile.num_samples, end_sample - sample);
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- check_result_cuda(
- cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- // Launch the ray generation program
- check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- // Launch with samples close to each other for better locality
- wtile.w * wtile.num_samples,
- wtile.h,
- 1));
-
- // Run the adaptive sampling kernels at selected samples aligned to step samples.
- uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- }
-
- // Wait for launch to finish
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- // Update current sample, so it is displayed correctly
- sample += wtile.num_samples;
- rtile.sample = sample;
- // Update task progress after the kernel completed rendering
- task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
-
- if (task.get_cancel() && !task.need_finish_queue)
- return; // Cancel rendering
- }
-
- // Finalize adaptive sampling
- if (task.adaptive_sampling.use) {
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
- task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
- }
- }
-
- bool launch_denoise(DeviceTask &task, RenderTile &rtile)
- {
- // Update current sample (for display and NLM denoising task)
- rtile.sample = rtile.start_sample + rtile.num_samples;
-
- // Make CUDA context current now, since it is used for both denoising tasks
- const CUDAContextScope scope(cuContext);
-
- // Choose between OptiX and NLM denoising
- if (task.denoising.type == DENOISER_OPTIX) {
-      // Map neighboring tiles onto this device; the indices are laid out as follows,
-      // where index 4 is the center tile and index 9 is the target for the result:
- // 0 1 2
- // 3 4 5
- // 6 7 8 9
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- RenderTile &target_tile = neighbors.target;
- rtile = center_tile; // Tile may have been modified by mapping code
-
- // Calculate size of the tile to denoise (including overlap)
- int4 rect = center_tile.bounds();
- // Overlap between tiles has to be at least 64 pixels
- // TODO(pmours): Query this value from OptiX
- rect = rect_expand(rect, 64);
- int4 clip_rect = neighbors.bounds();
- rect = rect_clip(rect, clip_rect);
- int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
- int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
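-      // rect_size is the full area to denoise including overlap, while overlap_offset is where
-      // the tile that gets written back starts inside that area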
-
- // Calculate byte offsets and strides
- int pixel_stride = task.pass_stride * (int)sizeof(float);
- int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
- const int pass_offset[3] = {
- (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
-
- // Start with the current tile pointer offset
- int input_stride = pixel_stride;
- device_ptr input_ptr = rtile.buffer + pixel_offset;
-
- // Copy tile data into a common buffer if necessary
- device_only_memory<float> input(this, "denoiser input", true);
- device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
-
- bool contiguous_memory = true;
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
- contiguous_memory = false;
- }
- }
-
- if (contiguous_memory) {
-        // Tiles are in contiguous memory, so can just subtract overlap offset
- input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
- // Stride covers the whole width of the image and not just a single tile
- input_stride *= rtile.stride;
- }
- else {
- // Adjacent tiles are in separate memory regions, so need to copy them into a single one
- input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
- // Start with the new input buffer
- input_ptr = input.device_pointer;
- // Stride covers the width of the new input buffer, which includes tile width and overlap
- input_stride *= rect_size.x;
-
- TileInfo *tile_info = tile_info_mem.alloc(1);
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- tile_info->offsets[i] = neighbors.tiles[i].offset;
- tile_info->strides[i] = neighbors.tiles[i].stride;
- tile_info->buffers[i] = neighbors.tiles[i].buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
- tile_info_mem.copy_to_device();
-
- void *args[] = {
- &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
- launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
- input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
-
- void *input_args[] = {&input_rgb.device_pointer,
- &input_ptr,
- &rect_size.x,
- &rect_size.y,
- &input_stride,
- &task.pass_stride,
- const_cast<int *>(pass_offset),
- &task.denoising.input_passes,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
-
- input_ptr = input_rgb.device_pointer;
- pixel_stride = 3 * sizeof(float);
- input_stride = rect_size.x * pixel_stride;
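-      // From here on the denoiser input is a tightly packed float3 buffer, with color, albedo
-      // and normal stored as consecutive planes (see the input_layers setup below)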
-# endif
-
- const bool recreate_denoiser = (denoiser == NULL) ||
- (task.denoising.input_passes != denoiser_input_passes);
- if (recreate_denoiser) {
- // Destroy existing handle before creating new one
- if (denoiser != NULL) {
- optixDenoiserDestroy(denoiser);
- }
-
- // Create OptiX denoiser handle on demand when it is first used
- OptixDenoiserOptions denoiser_options = {};
- assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
-# if OPTIX_ABI_VERSION >= 47
- denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
- denoiser_options.guideNormal = task.denoising.input_passes >= 3;
- check_result_optix_ret(optixDenoiserCreate(
- context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
-# else
- denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
- OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
-# if OPTIX_ABI_VERSION < 28
- denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
-# endif
- check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
- check_result_optix_ret(
- optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
-# endif
-
- // OptiX denoiser handle was created with the requested number of input passes
- denoiser_input_passes = task.denoising.input_passes;
- }
-
- OptixDenoiserSizes sizes = {};
- check_result_optix_ret(
- optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
-
-# if OPTIX_ABI_VERSION < 28
- const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
-# else
- const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
-# endif
- const size_t scratch_offset = sizes.stateSizeInBytes;
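-      // Denoiser state and scratch memory share a single allocation: state first, scratch after it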
-
- // Allocate denoiser state if tile size has changed since last setup
- if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
- denoiser_state.data_height != rect_size.y)) {
- denoiser_state.alloc_to_device(scratch_offset + scratch_size);
-
- // Initialize denoiser state for the current tile size
- check_result_optix_ret(optixDenoiserSetup(denoiser,
- 0,
- rect_size.x,
- rect_size.y,
- denoiser_state.device_pointer,
- scratch_offset,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-
- denoiser_state.data_width = rect_size.x;
- denoiser_state.data_height = rect_size.y;
- }
-
- // Set up input and output layer information
- OptixImage2D input_layers[3] = {};
- OptixImage2D output_layers[1] = {};
-
- for (int i = 0; i < 3; ++i) {
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
-# else
- input_layers[i].data = input_ptr + pass_offset[i];
-# endif
- input_layers[i].width = rect_size.x;
- input_layers[i].height = rect_size.y;
- input_layers[i].rowStrideInBytes = input_stride;
- input_layers[i].pixelStrideInBytes = pixel_stride;
- input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- output_layers[0].data = input_ptr;
- output_layers[0].width = rect_size.x;
- output_layers[0].height = rect_size.y;
- output_layers[0].rowStrideInBytes = input_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
- int2 output_offset = overlap_offset;
- overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
-# else
- output_layers[0].data = target_tile.buffer + pixel_offset;
- output_layers[0].width = target_tile.w;
- output_layers[0].height = target_tile.h;
- output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
-# endif
- output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
-
-# if OPTIX_ABI_VERSION >= 47
- OptixDenoiserLayer image_layers = {};
- image_layers.input = input_layers[0];
- image_layers.output = output_layers[0];
-
- OptixDenoiserGuideLayer guide_layers = {};
- guide_layers.albedo = input_layers[1];
- guide_layers.normal = input_layers[2];
-# endif
-
-      // Finally run denoising
- OptixDenoiserParams params = {}; // All parameters are disabled/zero
-# if OPTIX_ABI_VERSION >= 47
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- &guide_layers,
- &image_layers,
- 1,
- overlap_offset.x,
- overlap_offset.y,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# else
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- input_layers,
- task.denoising.input_passes,
- overlap_offset.x,
- overlap_offset.y,
- output_layers,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# endif
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- void *output_args[] = {&input_ptr,
- &target_tile.buffer,
- &output_offset.x,
- &output_offset.y,
- &rect_size.x,
- &rect_size.y,
- &target_tile.x,
- &target_tile.y,
- &target_tile.w,
- &target_tile.h,
- &target_tile.offset,
- &target_tile.stride,
- &task.pass_stride,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
-# endif
-
- check_result_cuda_ret(cuStreamSynchronize(0));
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- else {
- // Run CUDA denoising kernels
- DenoisingTask denoising(this, task);
- CUDADevice::denoise(rtile, denoising);
- }
-
- // Update task progress after the denoiser completed processing
- task.update_progress(&rtile, rtile.w * rtile.h);
-
- return true;
- }
-
- void launch_shader_eval(DeviceTask &task, int thread_index)
- {
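-    // Pick the ray generation program matching the requested shader evaluation type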
- unsigned int rgen_index = PG_BACK;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE)
- rgen_index = PG_BAKE;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
- rgen_index = PG_DISP;
-
- const CUDAContextScope scope(cuContext);
-
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- for (int sample = 0; sample < task.num_samples; ++sample) {
- ShaderParams params;
- params.input = (uint4 *)task.shader_input;
- params.output = (float4 *)task.shader_output;
- params.type = task.shader_eval_type;
- params.filter = task.shader_filter;
- params.sx = task.shader_x;
- params.offset = task.offset;
- params.sample = sample;
-
- check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
- &params,
- sizeof(params),
- cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- task.shader_w,
- 1,
- 1));
-
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- task.update_progress(NULL);
- }
- }
-
- bool build_optix_bvh(BVHOptiX *bvh,
- OptixBuildOperation operation,
- const OptixBuildInput &build_input,
- uint16_t num_motion_steps)
- {
- /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
- * from running out of memory (since both original and compacted acceleration structure memory
- * may be allocated at the same time for the duration of this function). The builds would
- * otherwise happen on the same CUDA stream anyway. */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- const CUDAContextScope scope(cuContext);
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- // Compute memory usage
- OptixAccelBufferSizes sizes = {};
- OptixAccelBuildOptions options = {};
- options.operation = operation;
- if (use_fast_trace_bvh) {
- VLOG(2) << "Using fast to trace OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
- }
- else {
- VLOG(2) << "Using fast to update OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
- }
-
- options.motionOptions.numKeys = num_motion_steps;
- options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
- options.motionOptions.timeBegin = 0.0f;
- options.motionOptions.timeEnd = 1.0f;
-
- check_result_optix_ret(
- optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
-
- // Allocate required output buffers
- device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
- temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
- if (!temp_mem.device_pointer)
- return false; // Make sure temporary memory allocation succeeded
-
- // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
- device_only_memory<char> &out_data = bvh->as_data;
- if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- assert(out_data.device == this);
- out_data.alloc_to_device(sizes.outputSizeInBytes);
- if (!out_data.device_pointer)
- return false;
- }
- else {
- assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
- }
-
- // Finally build the acceleration structure
- OptixAccelEmitDesc compacted_size_prop = {};
- compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
- // A tiny space was allocated for this property at the end of the temporary buffer above
- // Make sure this pointer is 8-byte aligned
- compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
-
- OptixTraversableHandle out_handle = 0;
- check_result_optix_ret(optixAccelBuild(context,
- NULL,
- &options,
- &build_input,
- 1,
- temp_mem.device_pointer,
- sizes.tempSizeInBytes,
- out_data.device_pointer,
- sizes.outputSizeInBytes,
- &out_handle,
- use_fast_trace_bvh ? &compacted_size_prop : NULL,
- use_fast_trace_bvh ? 1 : 0));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for all operations to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
-    // Compact the acceleration structure to save memory (only when using fast trace, since the
-    // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in that case).
- if (use_fast_trace_bvh) {
- uint64_t compacted_size = sizes.outputSizeInBytes;
- check_result_cuda_ret(
- cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
-
- // Temporary memory is no longer needed, so free it now to make space
- temp_mem.free();
-
- // There is no point compacting if the size does not change
- if (compacted_size < sizes.outputSizeInBytes) {
- device_only_memory<char> compacted_data(this, "optix compacted as", false);
- compacted_data.alloc_to_device(compacted_size);
- if (!compacted_data.device_pointer)
- // Do not compact if memory allocation for compacted acceleration structure fails
- // Can just use the uncompacted one then, so succeed here regardless
- return true;
-
- check_result_optix_ret(optixAccelCompact(context,
- NULL,
- out_handle,
- compacted_data.device_pointer,
- compacted_size,
- &out_handle));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for compaction to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- std::swap(out_data.device_size, compacted_data.device_size);
- std::swap(out_data.device_pointer, compacted_data.device_pointer);
- // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
- }
- }
-
- return true;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
-      /* CUDA kernels are used for baking, so build a BVH appropriate for them. */
- Device::build_bvh(bvh, progress, refit);
- return;
- }
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- free_bvh_memory_delayed();
-
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- progress.set_substatus("Building OptiX acceleration structure");
-
- if (!bvh->params.top_level) {
- assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
-
- OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
-      /* Refit is only possible when not using a fast-to-trace BVH (because the AS is built with
-       * OPTIX_BUILD_FLAG_ALLOW_UPDATE only in that case, see build_optix_bvh above). */
- if (refit && !use_fast_trace_bvh) {
- assert(bvh_optix->traversable_handle != 0);
- operation = OPTIX_BUILD_OPERATION_UPDATE;
- }
- else {
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- }
-
- // Build bottom level acceleration structures (BLAS)
- Geometry *const geom = bvh->geometry[0];
- if (geom->geometry_type == Geometry::HAIR) {
- // Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(geom);
- if (hair->num_curves() == 0) {
- return;
- }
-
- const size_t num_segments = hair->num_segments();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
- num_motion_steps = hair->get_motion_steps();
- }
-
- device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
-# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- // Four control points for each curve segment
- const size_t num_vertices = num_segments * 4;
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- index_data.alloc(num_segments);
- vertex_data.alloc(num_vertices * num_motion_steps);
- }
- else
-# endif
- aabb_data.alloc(num_segments * num_motion_steps);
-
- // Get AABBs for each motion step
- for (size_t step = 0; step < num_motion_steps; ++step) {
- // The center step for motion vertices is not stored in the attribute
- const float3 *keys = hair->get_curve_keys().data();
- size_t center_step = (num_motion_steps - 1) / 2;
- if (step != center_step) {
- size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
- keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
- }
-
- for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
- const Hair::Curve curve = hair->get_curve(j);
-# if OPTIX_ABI_VERSION >= 36
- const array<float> &curve_radius = hair->get_curve_radius();
-# endif
-
- for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- int k0 = curve.first_key + segment;
- int k1 = k0 + 1;
- int ka = max(k0 - 1, curve.first_key);
- int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
-
- const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
- const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
- const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
- const float4 pw = make_float4(
- curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
-
-                // Convert Catmull-Rom data to cubic B-spline control points
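-                // Each cr2bsp row maps the four Catmull-Rom keys (ka, k0, k1, kb) onto one of the
-                // four B-spline control points (basis-change matrix rows, scaled by 1/6)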
- static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
- static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
- static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
- static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
-
- index_data[i] = i * 4;
- float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
- v[0] = make_float4(
- dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
- v[1] = make_float4(
- dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
- v[2] = make_float4(
- dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
- v[3] = make_float4(
- dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
- }
- else
-# endif
- {
- BoundBox bounds = BoundBox::empty;
- curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
-
- const size_t index = step * num_segments + i;
- aabb_data[index].minX = bounds.min.x;
- aabb_data[index].minY = bounds.min.y;
- aabb_data[index].minZ = bounds.min.z;
- aabb_data[index].maxX = bounds.max.x;
- aabb_data[index].maxY = bounds.max.y;
- aabb_data[index].maxZ = bounds.max.z;
- }
- }
- }
- }
-
- // Upload AABB data to GPU
- aabb_data.copy_to_device();
-# if OPTIX_ABI_VERSION >= 36
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-# endif
-
- vector<device_ptr> aabb_ptrs;
- aabb_ptrs.reserve(num_motion_steps);
-# if OPTIX_ABI_VERSION >= 36
- vector<device_ptr> width_ptrs;
- vector<device_ptr> vertex_ptrs;
- width_ptrs.reserve(num_motion_steps);
- vertex_ptrs.reserve(num_motion_steps);
-# endif
- for (size_t step = 0; step < num_motion_steps; ++step) {
- aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
-# if OPTIX_ABI_VERSION >= 36
- const device_ptr base_ptr = vertex_data.device_pointer +
- step * num_vertices * sizeof(float4);
- width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
- vertex_ptrs.push_back(base_ptr);
-# endif
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
- build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- build_input.curveArray.numPrimitives = num_segments;
- build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.curveArray.numVertices = num_vertices;
- build_input.curveArray.vertexStrideInBytes = sizeof(float4);
- build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
- build_input.curveArray.widthStrideInBytes = sizeof(float4);
- build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
- build_input.curveArray.indexStrideInBytes = sizeof(int);
- build_input.curveArray.flag = build_flags;
- build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
- }
- else
-# endif
- {
- // Disable visibility test any-hit program, since it is already checked during
- // intersection. Those trace calls that require anyhit can force it with a ray flag.
- build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
-
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-# if OPTIX_ABI_VERSION < 23
- build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.aabbArray.numPrimitives = num_segments;
- build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
- build_input.aabbArray.flags = &build_flags;
- build_input.aabbArray.numSbtRecords = 1;
- build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
-# else
- build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.customPrimitiveArray.numPrimitives = num_segments;
- build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
- build_input.customPrimitiveArray.flags = &build_flags;
- build_input.customPrimitiveArray.numSbtRecords = 1;
- build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
-# endif
- }
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
- // Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(geom);
- if (mesh->num_triangles() == 0) {
- return;
- }
-
- const size_t num_verts = mesh->get_verts().size();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
- num_motion_steps = mesh->get_motion_steps();
- }
-
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- index_data.alloc(mesh->get_triangles().size());
- memcpy(index_data.data(),
- mesh->get_triangles().data(),
- mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- vertex_data.alloc(num_verts * num_motion_steps);
-
- for (size_t step = 0; step < num_motion_steps; ++step) {
- const float3 *verts = mesh->get_verts().data();
-
- size_t center_step = (num_motion_steps - 1) / 2;
- // The center step for motion vertices is not stored in the attribute
- if (step != center_step) {
- verts = motion_keys->data_float3() +
- (step > center_step ? step - 1 : step) * num_verts;
- }
-
- memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
- }
-
- // Upload triangle data to GPU
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-
- vector<device_ptr> vertex_ptrs;
- vertex_ptrs.reserve(num_motion_steps);
- for (size_t step = 0; step < num_motion_steps; ++step) {
- vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
- build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.triangleArray.numVertices = num_verts;
- build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
- build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
- build_input.triangleArray.indexBuffer = index_data.device_pointer;
- build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
- build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
- build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
- build_input.triangleArray.flags = &build_flags;
- // The SBT does not store per primitive data since Cycles already allocates separate
- // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
- // one and rely on that having the same meaning in this case.
- build_input.triangleArray.numSbtRecords = 1;
- build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- }
- else {
- unsigned int num_instances = 0;
- unsigned int max_num_instances = 0xFFFFFFFF;
-
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- bvh_optix->motion_transform_data.free();
-
- optixDeviceContextGetProperty(context,
- OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
- &max_num_instances,
- sizeof(max_num_instances));
- // Do not count first bit, which is used to distinguish instanced and non-instanced objects
- max_num_instances >>= 1;
- if (bvh->objects.size() > max_num_instances) {
- progress.set_error(
- "Failed to build OptiX acceleration structure because there are too many instances");
- return;
- }
-
- // Fill instance descriptions
-# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
-# endif
- device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- // Calculate total motion transform size and allocate memory for them
- size_t motion_transform_offset = 0;
- if (motion_blur) {
- size_t total_motion_transform_size = 0;
- for (Object *const ob : bvh->objects) {
- if (ob->is_traceable() && ob->use_motion()) {
- total_motion_transform_size = align_up(total_motion_transform_size,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- total_motion_transform_size = total_motion_transform_size +
- sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
- }
- }
-
- assert(bvh_optix->motion_transform_data.device == this);
- bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
- }
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
- OptixTraversableHandle handle = blas->traversable_handle;
-
-# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
-# endif
-
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
-
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
-
- // Set user instance ID to object index (but leave low bit blank)
- instance.instanceId = ob->get_device_index() << 1;
-
- // Have to have at least one bit in the mask, or else instance would always be culled
- instance.visibilityMask = 1;
-
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
-
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
-
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
-# endif
- }
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- motion_transform_offset = align_up(motion_transform_offset,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
- motion_transform_offset;
- motion_transform_offset += motion_transform_size;
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
- }
-
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
-
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
-
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
- }
- else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from 'prim_object', so distinguish
- // them from instanced objects with the low bit set
- instance.instanceId |= 1;
- }
- }
- }
-
- // Upload instance descriptions
-# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
-# endif
- instances.resize(num_instances);
- instances.copy_to_device();
-
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
-# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
-# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
-
- if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- tlas_handle = bvh_optix->traversable_handle;
- }
- }
-
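
As a side note on the instance ID scheme used above (and why the instance limit is halved): the low bit of the OptiX instance ID flags objects whose transform is already applied to the vertex data, while the remaining bits carry the object index. A minimal standalone sketch of that encoding, with hypothetical helper names not taken from Cycles:

#include <cstdint>

/* Pack an object index into an OptiX instance ID; the low bit marks
 * non-instanced objects whose transform is baked into the vertices. */
static uint32_t pack_instance_id(uint32_t object_index, bool non_instanced)
{
  return (object_index << 1) | (non_instanced ? 1u : 0u);
}

/* Recover the object index by dropping the flag bit. */
static uint32_t unpack_object_index(uint32_t instance_id)
{
  return instance_id >> 1;
}

/* Query the flag bit. */
static bool is_non_instanced(uint32_t instance_id)
{
  return (instance_id & 1u) != 0u;
}
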
- void release_optix_bvh(BVH *bvh) override
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
- * while GPU is still rendering. */
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
- bvh_optix->traversable_handle = 0;
- }
-
- void free_bvh_memory_delayed()
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- delayed_free_bvh_memory.free_memory();
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- // Set constant memory for CUDA module
- // TODO(pmours): This is only used for tonemapping (see 'film_convert').
- // Could be removed by moving those functions to filter CUDA module.
- CUDADevice::const_copy_to(name, host, size);
-
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update traversable handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
-
- update_launch_params(offsetof(KernelParams, data), host, size);
- return;
- }
-
- // Update data storage pointers in launch parameters
-# define KERNEL_TEX(data_type, tex_name) \
- if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(offsetof(KernelParams, tex_name), host, size); \
- return; \
- }
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- }
-
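
The KERNEL_TEX block above is an X-macro: re-including kernel/kernel_textures.h expands KERNEL_TEX once per registered texture, so const_copy_to gains one branch per texture name. For a hypothetical entry KERNEL_TEX(float4, __tex_example), the generated branch would look roughly like:

if (strcmp(name, "__tex_example") == 0) {
  update_launch_params(offsetof(KernelParams, __tex_example), host, size);
  return;
}
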
- void update_launch_params(size_t offset, void *data, size_t data_size)
- {
- const CUDAContextScope scope(cuContext);
-
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(
- cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
- data,
- data_size));
- }
-
- void task_add(DeviceTask &task) override
- {
- // Upload texture information to device if it has changed since last launch
- load_texture_info();
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- // Execute in main thread because of OpenGL access
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- return;
- }
-
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy, 0);
- });
- return;
- }
-
- // Split task into smaller ones
- list<DeviceTask> tasks;
- task.split(tasks, info.cpu_threads);
-
- // Queue tasks in internal task pool
- int task_index = 0;
- for (DeviceTask &task : tasks) {
- task_pool.push([=] {
- // Using task index parameter instead of thread index, since number of CUDA streams may
- // differ from number of threads
- DeviceTask task_copy = task;
- thread_run(task_copy, task_index);
- });
- task_index++;
- }
- }
-
- void task_wait() override
- {
- // Wait for all queued tasks to finish
- task_pool.wait_work();
- }
-
- void task_cancel() override
- {
- // Cancel any remaining tasks in the internal pool
- task_pool.cancel();
- }
-};
-
-bool device_optix_init()
-{
- if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
- return true; // Already initialized function table
-
- // Need to initialize CUDA as well
- if (!device_cuda_init())
- return false;
-
- const OptixResult result = optixInit();
-
- if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
- VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
- "Please update to the latest driver first!";
- return false;
- }
- else if (result != OPTIX_SUCCESS) {
- VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
- return false;
- }
-
- // Loaded OptiX successfully!
- return true;
-}
-
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
-{
- devices.reserve(cuda_devices.size());
-
- // Simply add all supported CUDA devices as OptiX devices again
- for (DeviceInfo info : cuda_devices) {
- assert(info.type == DEVICE_CUDA);
-
- int major;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
- if (major < 5) {
- continue; // Only Maxwell and up are supported by OptiX
- }
-
- info.type = DEVICE_OPTIX;
- info.id += "_OptiX";
- info.denoisers |= DENOISER_OPTIX;
- info.has_branched_path = false;
-
- devices.push_back(info);
- }
-}
-
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new OptiXDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
new file mode 100644
index 00000000000..a89ba68d62c
--- /dev/null
+++ b/intern/cycles/device/device_queue.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_queue.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_time.h"
+
+#include <iomanip>
+
+CCL_NAMESPACE_BEGIN
+
+DeviceQueue::DeviceQueue(Device *device)
+ : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
+{
+ DCHECK_NE(device, nullptr);
+}
+
+DeviceQueue::~DeviceQueue()
+{
+ if (VLOG_IS_ON(3)) {
+ /* Print kernel execution times sorted by time. */
+ vector<pair<DeviceKernelMask, double>> stats_sorted;
+ for (const auto &stat : stats_kernel_time_) {
+ stats_sorted.push_back(stat);
+ }
+
+ sort(stats_sorted.begin(),
+ stats_sorted.end(),
+ [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) {
+ return a.second > b.second;
+ });
+
+ VLOG(3) << "GPU queue stats:";
+ for (const auto &[mask, time] : stats_sorted) {
+ VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
+ << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+ }
+ }
+}
+
+void DeviceQueue::debug_init_execution()
+{
+ if (VLOG_IS_ON(3)) {
+ last_sync_time_ = time_dt();
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+{
+ if (VLOG_IS_ON(3)) {
+ VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
+ last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
+ }
+}
+
+void DeviceQueue::debug_synchronize()
+{
+ if (VLOG_IS_ON(3)) {
+ const double new_time = time_dt();
+ const double elapsed_time = new_time - last_sync_time_;
+ VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+
+ stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+
+ last_sync_time_ = new_time;
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+CCL_NAMESPACE_END
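
The debug helpers in this file attribute wall-clock time to the combination of kernels launched between two synchronizations: each enqueue sets one bit per kernel, and each synchronize adds the elapsed time to the entry for that bitmask. A simplified standalone sketch of that bookkeeping (types and names here are illustrative, not the Cycles ones):

#include <cstdint>
#include <map>

struct QueueDebugStats {
  uint64_t enqueued_mask = 0;                 /* one bit per kernel launched since last sync */
  std::map<uint64_t, double> time_per_combo;  /* accumulated seconds per kernel combination */

  void on_enqueue(int kernel_index)
  {
    enqueued_mask |= (uint64_t(1) << uint64_t(kernel_index));
  }

  void on_synchronize(double elapsed_seconds)
  {
    time_per_combo[enqueued_mask] += elapsed_seconds;
    enqueued_mask = 0;
  }
};
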
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
new file mode 100644
index 00000000000..edda3e61d51
--- /dev/null
+++ b/intern/cycles/device/device_queue.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_kernel.h"
+
+#include "device/device_graphics_interop.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class device_memory;
+
+struct KernelWorkTile;
+
+/* Abstraction of a command queue for a device.
+ * Provides an API to schedule kernel execution in a specific queue with the minimal possible
+ * overhead from the driver side.
+ *
+ * This class encapsulates all properties needed for command execution. */
+class DeviceQueue {
+ public:
+ virtual ~DeviceQueue();
+
+ /* Number of concurrent states to process for integrator,
+ * based on number of cores and/or available memory. */
+ virtual int num_concurrent_states(const size_t state_size) const = 0;
+
+ /* Number of states which keeps the device occupied with work without losing performance.
+ * The renderer will add more work (when available) when number of active paths falls below this
+ * value. */
+ virtual int num_concurrent_busy_states() const = 0;
+
+ /* Initialize execution of kernels on this queue.
+ *
+ * Will, for example, load all data required by the kernels from Device to global or path state.
+ *
+ * Use this method after device synchronization has finished before enqueueing any kernels. */
+ virtual void init_execution() = 0;
+
+ /* Test if an optional device kernel is available. */
+ virtual bool kernel_available(DeviceKernel kernel) const = 0;
+
+ /* Enqueue kernel execution.
+ *
+ * Execute the kernel work_size times on the device.
+ * Supported argument types:
+ * - int: pass pointer to the int
+ * - device memory: pass pointer to device_memory.device_pointer
+ * Return false if there was an error executing this or a previous kernel. */
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0;
+
+ /* Wait until all enqueued kernels have finished execution.
+ * Return false if there was an error executing any of the enqueued kernels. */
+ virtual bool synchronize() = 0;
+
+ /* Copy memory to/from device as part of the command queue, to ensure
+ * operations are done in order without having to synchronize. */
+ virtual void zero_to_device(device_memory &mem) = 0;
+ virtual void copy_to_device(device_memory &mem) = 0;
+ virtual void copy_from_device(device_memory &mem) = 0;
+
+ /* Graphics resources interoperability.
+ *
+ * Interoperability here means that the device is capable of computing results directly into an
+ * OpenGL (or other graphics library) buffer. */
+
+ /* Create a graphics interoperability context which will take care of mapping the graphics
+ * resource as a buffer writable by kernels of this device. */
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
+ {
+ LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Device this queue has been created for. */
+ Device *device;
+
+ protected:
+ /* Hide construction so that allocation via `Device` API is enforced. */
+ explicit DeviceQueue(Device *device);
+
+ /* Implementations call these from the corresponding methods to generate debugging logs. */
+ void debug_init_execution();
+ void debug_enqueue(DeviceKernel kernel, const int work_size);
+ void debug_synchronize();
+
+ /* Combination of kernels enqueued together since the last synchronize. */
+ DeviceKernelMask last_kernels_enqueued_;
+ /* Time of synchronize call. */
+ double last_sync_time_;
+ /* Accumulated execution time for combinations of kernels launched together. */
+ map<DeviceKernelMask, double> stats_kernel_time_;
+};
+
+CCL_NAMESPACE_END
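
Following the argument-passing contract documented for enqueue() above, a hedged sketch of what a backend call site could look like (the kernel name, buffer and work size below are placeholders, not the exact Cycles call sites):

/* Hypothetical call site: launch one kernel over `work_size` elements and wait for it.
 * Ints are passed as pointer-to-int, device memory as a pointer to its device_pointer. */
static bool launch_example_kernel(DeviceQueue *queue, device_memory &state_buffer, int work_size)
{
  void *args[] = {&state_buffer.device_pointer, &work_size};
  if (!queue->enqueue(DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, work_size, args)) {
    return false; /* This or a previously enqueued kernel failed. */
  }
  return queue->synchronize(); /* Block until completion and surface any errors. */
}
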
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
deleted file mode 100644
index 9889f688aaa..00000000000
--- a/intern/cycles/device/device_split_kernel.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_split_kernel.h"
-
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "util/util_logging.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-static const double alpha = 0.1; /* alpha for rolling average */
-
-DeviceSplitKernel::DeviceSplitKernel(Device *device)
- : device(device),
- split_data(device, "split_data"),
- ray_state(device, "ray_state", MEM_READ_WRITE),
- queue_index(device, "queue_index"),
- use_queues_flag(device, "use_queues_flag"),
- work_pool_wgs(device, "work_pool_wgs"),
- kernel_data_initialized(false)
-{
- avg_time_per_sample = 0.0;
-
- kernel_path_init = NULL;
- kernel_scene_intersect = NULL;
- kernel_lamp_emission = NULL;
- kernel_do_volume = NULL;
- kernel_queue_enqueue = NULL;
- kernel_indirect_background = NULL;
- kernel_shader_setup = NULL;
- kernel_shader_sort = NULL;
- kernel_shader_eval = NULL;
- kernel_holdout_emission_blurring_pathtermination_ao = NULL;
- kernel_subsurface_scatter = NULL;
- kernel_direct_lighting = NULL;
- kernel_shadow_blocked_ao = NULL;
- kernel_shadow_blocked_dl = NULL;
- kernel_enqueue_inactive = NULL;
- kernel_next_iteration_setup = NULL;
- kernel_indirect_subsurface = NULL;
- kernel_buffer_update = NULL;
- kernel_adaptive_stopping = NULL;
- kernel_adaptive_filter_x = NULL;
- kernel_adaptive_filter_y = NULL;
- kernel_adaptive_adjust_samples = NULL;
-}
-
-DeviceSplitKernel::~DeviceSplitKernel()
-{
- split_data.free();
- ray_state.free();
- use_queues_flag.free();
- queue_index.free();
- work_pool_wgs.free();
-
- delete kernel_path_init;
- delete kernel_scene_intersect;
- delete kernel_lamp_emission;
- delete kernel_do_volume;
- delete kernel_queue_enqueue;
- delete kernel_indirect_background;
- delete kernel_shader_setup;
- delete kernel_shader_sort;
- delete kernel_shader_eval;
- delete kernel_holdout_emission_blurring_pathtermination_ao;
- delete kernel_subsurface_scatter;
- delete kernel_direct_lighting;
- delete kernel_shadow_blocked_ao;
- delete kernel_shadow_blocked_dl;
- delete kernel_enqueue_inactive;
- delete kernel_next_iteration_setup;
- delete kernel_indirect_subsurface;
- delete kernel_buffer_update;
- delete kernel_adaptive_stopping;
- delete kernel_adaptive_filter_x;
- delete kernel_adaptive_filter_y;
- delete kernel_adaptive_adjust_samples;
-}
-
-bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
-#define LOAD_KERNEL(name) \
- kernel_##name = get_split_kernel_function(#name, requested_features); \
- if (!kernel_##name) { \
- device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
- return false; \
- }
-
- LOAD_KERNEL(path_init);
- LOAD_KERNEL(scene_intersect);
- LOAD_KERNEL(lamp_emission);
- if (requested_features.use_volume) {
- LOAD_KERNEL(do_volume);
- }
- LOAD_KERNEL(queue_enqueue);
- LOAD_KERNEL(indirect_background);
- LOAD_KERNEL(shader_setup);
- LOAD_KERNEL(shader_sort);
- LOAD_KERNEL(shader_eval);
- LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
- LOAD_KERNEL(subsurface_scatter);
- LOAD_KERNEL(direct_lighting);
- LOAD_KERNEL(shadow_blocked_ao);
- LOAD_KERNEL(shadow_blocked_dl);
- LOAD_KERNEL(enqueue_inactive);
- LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(indirect_subsurface);
- LOAD_KERNEL(buffer_update);
- LOAD_KERNEL(adaptive_stopping);
- LOAD_KERNEL(adaptive_filter_x);
- LOAD_KERNEL(adaptive_filter_y);
- LOAD_KERNEL(adaptive_adjust_samples);
-
-#undef LOAD_KERNEL
-
- /* Re-initialize kernel-dependent data when kernels change. */
- kernel_data_initialized = false;
-
- return true;
-}
-
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size)
-{
- uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
- VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element)
- << " bytes. (" << string_human_readable_size(size_per_element) << ").";
- return max_buffer_size / size_per_element;
-}
-
-bool DeviceSplitKernel::path_trace(DeviceTask &task,
- RenderTile &tile,
- device_memory &kgbuffer,
- device_memory &kernel_data)
-{
- if (device->have_error()) {
- return false;
- }
-
- /* Allocate all required global memory once. */
- if (!kernel_data_initialized) {
- kernel_data_initialized = true;
-
- /* Set local size */
- int2 lsize = split_kernel_local_size();
- local_size[0] = lsize[0];
- local_size[1] = lsize[1];
-
- /* Set global size */
- int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
-
- /* Make sure the global work size is a multiple of the local
- * work size dimensions.
- */
- global_size[0] = round_up(gsize[0], local_size[0]);
- global_size[1] = round_up(gsize[1], local_size[1]);
-
- int num_global_elements = global_size[0] * global_size[1];
- assert(num_global_elements % WORK_POOL_SIZE == 0);
-
- /* Calculate max groups */
-
- /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
- unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU :
- WORK_POOL_SIZE_GPU;
- unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
-
- /* Allocate work_pool_wgs memory. */
- work_pool_wgs.alloc_to_device(max_work_groups);
- queue_index.alloc_to_device(NUM_QUEUES);
- use_queues_flag.alloc_to_device(1);
- split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
- ray_state.alloc(num_global_elements);
- }
-
- /* Number of elements in the global state buffer */
- int num_global_elements = global_size[0] * global_size[1];
-
-#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
- if (device->have_error()) { \
- return false; \
- } \
- if (!kernel_##name->enqueue( \
- KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
- return false; \
- }
-
- tile.sample = tile.start_sample;
-
- /* for exponential increase between tile updates */
- int time_multiplier = 1;
-
- while (tile.sample < tile.start_sample + tile.num_samples) {
- /* to keep track of how long it takes to run a number of samples */
- double start_time = time_dt();
-
- /* initial guess to start rolling average */
- const int initial_num_samples = 1;
- /* approx number of samples per second */
- const int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
-
- RenderTile subtile = tile;
- subtile.start_sample = tile.sample;
- subtile.num_samples = samples_per_second;
-
- if (task.adaptive_sampling.use) {
- subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
- subtile.num_samples);
- }
-
- /* Don't go beyond requested number of samples. */
- subtile.num_samples = min(subtile.num_samples,
- tile.start_sample + tile.num_samples - tile.sample);
-
- if (device->have_error()) {
- return false;
- }
-
- /* Reset state memory here, as the global size for the data_init
- * kernel might not be large enough to do it in the kernel.
- */
- work_pool_wgs.zero_to_device();
- split_data.zero_to_device();
- ray_state.zero_to_device();
-
- if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs)) {
- return false;
- }
-
- ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
-
- bool activeRaysAvailable = true;
- double cancel_time = DBL_MAX;
-
- while (activeRaysAvailable) {
- /* Do path-iteration on host [enqueue path-iteration kernels]. */
- for (int PathIter = 0; PathIter < 16; PathIter++) {
- ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
- if (kernel_do_volume) {
- ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
- }
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(
- holdout_emission_blurring_pathtermination_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
-
- if (task.get_cancel() && cancel_time == DBL_MAX) {
- /* Wait up to twice as many seconds for current samples to finish
- * to avoid artifacts in render result from ending too soon.
- */
- cancel_time = time_dt() + 2.0 * time_multiplier;
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- /* Decide if we should exit path-iteration in host. */
- ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);
-
- activeRaysAvailable = false;
-
- for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
- if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
- if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
- /* Something went wrong, abort to avoid looping endlessly. */
- device->set_error("Split kernel error: invalid ray state");
- return false;
- }
-
- /* Not all rays are RAY_INACTIVE. */
- activeRaysAvailable = true;
- break;
- }
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- int filter_sample = tile.sample + subtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_stopping->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.h, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_x->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_y->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
-
- if (avg_time_per_sample == 0.0) {
- /* start rolling average */
- avg_time_per_sample = time_per_sample;
- }
- else {
- avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
- }
-
-#undef ENQUEUE_SPLIT_KERNEL
-
- tile.sample += subtile.num_samples;
- task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
-
- time_multiplier = min(time_multiplier << 1, 10);
-
- if (task.get_cancel()) {
- return true;
- }
- }
-
- if (task.adaptive_sampling.use) {
- /* Reset the start samples. */
- RenderTile subtile = tile;
- subtile.start_sample = tile.start_sample;
- subtile.num_samples = tile.sample - tile.start_sample;
- enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs);
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_adjust_samples->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- return true;
-}
-
-CCL_NAMESPACE_END
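
The batch sizing in path_trace() above is driven by an exponential moving average (alpha = 0.1) of the measured time per sample; the next batch then targets roughly time_multiplier seconds of work. A standalone sketch of that estimator, mirroring the removed code:

/* Exponential moving average of the per-sample render time. */
static double update_avg_time_per_sample(double avg, double measured, double alpha = 0.1)
{
  return (avg == 0.0) ? measured : alpha * measured + (1.0 - alpha) * avg;
}

/* Approximate number of samples that fit into `time_multiplier` seconds. */
static int samples_for_next_batch(double avg_time_per_sample, int time_multiplier)
{
  const int initial_num_samples = 1; /* initial guess before the first measurement */
  return (avg_time_per_sample > 0.0) ?
             int(double(time_multiplier) / avg_time_per_sample) + 1 :
             initial_num_samples;
}
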
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
deleted file mode 100644
index 07a21b10299..00000000000
--- a/intern/cycles/device/device_split_kernel.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_SPLIT_KERNEL_H__
-#define __DEVICE_SPLIT_KERNEL_H__
-
-#include "device/device.h"
-#include "render/buffers.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* When allocating global memory in chunks, we may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes, since some
- * bytes may be needed for aligning the chunks of memory. This is the
- * amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB
-
-/* Types used for split kernel */
-
-class KernelDimensions {
- public:
- size_t global_size[2];
- size_t local_size[2];
-
- KernelDimensions(size_t global_size_[2], size_t local_size_[2])
- {
- memcpy(global_size, global_size_, sizeof(global_size));
- memcpy(local_size, local_size_, sizeof(local_size));
- }
-};
-
-class SplitKernelFunction {
- public:
- virtual ~SplitKernelFunction()
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0;
-};
-
-class DeviceSplitKernel {
- private:
- Device *device;
-
- SplitKernelFunction *kernel_path_init;
- SplitKernelFunction *kernel_scene_intersect;
- SplitKernelFunction *kernel_lamp_emission;
- SplitKernelFunction *kernel_do_volume;
- SplitKernelFunction *kernel_queue_enqueue;
- SplitKernelFunction *kernel_indirect_background;
- SplitKernelFunction *kernel_shader_setup;
- SplitKernelFunction *kernel_shader_sort;
- SplitKernelFunction *kernel_shader_eval;
- SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
- SplitKernelFunction *kernel_subsurface_scatter;
- SplitKernelFunction *kernel_direct_lighting;
- SplitKernelFunction *kernel_shadow_blocked_ao;
- SplitKernelFunction *kernel_shadow_blocked_dl;
- SplitKernelFunction *kernel_enqueue_inactive;
- SplitKernelFunction *kernel_next_iteration_setup;
- SplitKernelFunction *kernel_indirect_subsurface;
- SplitKernelFunction *kernel_buffer_update;
- SplitKernelFunction *kernel_adaptive_stopping;
- SplitKernelFunction *kernel_adaptive_filter_x;
- SplitKernelFunction *kernel_adaptive_filter_y;
- SplitKernelFunction *kernel_adaptive_adjust_samples;
-
- /* Global memory variables [porting]. This memory is used for
- * co-operation between different kernels: data written by one
- * kernel is available to other kernels via this global
- * memory.
- */
- device_only_memory<uchar> split_data;
- device_vector<uchar> ray_state;
- device_only_memory<int>
- queue_index; /* Array of size num_queues that tracks the size of each queue. */
-
- /* Flag to make the scene_intersect and lamp_emission kernels use queues. */
- device_only_memory<char> use_queues_flag;
-
- /* Approximate time it takes to complete one sample */
- double avg_time_per_sample;
-
- /* Work pool with respect to each work group. */
- device_only_memory<unsigned int> work_pool_wgs;
-
- /* Cached kernel-dependent data, initialized once. */
- bool kernel_data_initialized;
- size_t local_size[2];
- size_t global_size[2];
-
- public:
- explicit DeviceSplitKernel(Device *device);
- virtual ~DeviceSplitKernel();
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- bool path_trace(DeviceTask &task,
- RenderTile &rtile,
- device_memory &kgbuffer,
- device_memory &kernel_data);
-
- virtual uint64_t state_buffer_size(device_memory &kg,
- device_memory &data,
- size_t num_threads) = 0;
- size_t max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs) = 0;
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &) = 0;
- virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask &task) = 0;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
deleted file mode 100644
index 55fbaa31e42..00000000000
--- a/intern/cycles/device/device_task.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "device/device_task.h"
-
-#include "render/buffers.h"
-
-#include "util/util_algorithm.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-DeviceTask::DeviceTask(Type type_)
- : type(type_),
- x(0),
- y(0),
- w(0),
- h(0),
- rgba_byte(0),
- rgba_half(0),
- buffer(0),
- sample(0),
- num_samples(1),
- shader_input(0),
- shader_output(0),
- shader_eval_type(0),
- shader_filter(0),
- shader_x(0),
- shader_w(0),
- buffers(nullptr),
- tile_types(0),
- denoising_from_render(false),
- pass_stride(0),
- frame_stride(0),
- target_pass_stride(0),
- pass_denoising_data(0),
- pass_denoising_clean(0),
- need_finish_queue(false),
- integrator_branched(false)
-{
- last_update_time = time_dt();
-}
-
-int DeviceTask::get_subtask_count(int num, int max_size) const
-{
- if (max_size != 0) {
- int max_size_num;
-
- if (type == SHADER) {
- max_size_num = (shader_w + max_size - 1) / max_size;
- }
- else {
- max_size = max(1, max_size / w);
- max_size_num = (h + max_size - 1) / max_size;
- }
-
- num = max(max_size_num, num);
- }
-
- if (type == SHADER) {
- num = min(shader_w, num);
- }
- else if (type == RENDER) {
- }
- else {
- num = min(h, num);
- }
-
- return num;
-}
-
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
-{
- num = get_subtask_count(num, max_size);
-
- if (type == SHADER) {
- for (int i = 0; i < num; i++) {
- int tx = shader_x + (shader_w / num) * i;
- int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num;
-
- DeviceTask task = *this;
-
- task.shader_x = tx;
- task.shader_w = tw;
-
- tasks.push_back(task);
- }
- }
- else if (type == RENDER) {
- for (int i = 0; i < num; i++)
- tasks.push_back(*this);
- }
- else {
- for (int i = 0; i < num; i++) {
- int ty = y + (h / num) * i;
- int th = (i == num - 1) ? h - i * (h / num) : h / num;
-
- DeviceTask task = *this;
-
- task.y = ty;
- task.h = th;
-
- tasks.push_back(task);
- }
- }
-}
-
-void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
-{
- if (type == FILM_CONVERT)
- return;
-
- if (update_progress_sample) {
- if (pixel_samples == -1) {
- pixel_samples = shader_w;
- }
- update_progress_sample(pixel_samples, rtile ? rtile->sample : 0);
- }
-
- if (update_tile_sample) {
- double current_time = time_dt();
-
- if (current_time - last_update_time >= 1.0) {
- update_tile_sample(*rtile);
-
- last_update_time = current_time;
- }
- }
-}
-
-/* Adaptive Sampling */
-
-AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0)
-{
-}
-
-/* Render samples in steps that align with the adaptive filtering. */
-int AdaptiveSampling::align_samples(int sample, int num_samples) const
-{
- int end_sample = sample + num_samples;
-
- /* Round down end sample to the nearest sample that needs filtering. */
- end_sample &= ~(adaptive_step - 1);
-
- if (end_sample <= sample) {
- /* In order to reach the next sample that needs filtering, we'd need
- * to increase num_samples. We don't do that in this function, so
- * just keep it as is and don't filter this time around. */
- return num_samples;
- }
- return end_sample - sample;
-}
-
-bool AdaptiveSampling::need_filter(int sample) const
-{
- if (sample > min_samples) {
- return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
- }
- else {
- return false;
- }
-}
-
-CCL_NAMESPACE_END
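
Since adaptive_step is assumed to be a power of two, the bit masking above rounds the end sample down to a multiple of the step so that each batch ends right before a filtering sample. A small worked sketch of the same rounding (adaptive_step = 4 is a hypothetical value):

#include <cassert>

/* Mirror of align_samples() above, assuming adaptive_step is a power of two. */
static int align_samples_sketch(int sample, int num_samples, int adaptive_step)
{
  const int end_sample = (sample + num_samples) & ~(adaptive_step - 1);
  return (end_sample <= sample) ? num_samples : end_sample - sample;
}

static void worked_example()
{
  /* Starting at sample 5 and asking for 6 samples stops after 3, so the batch
   * ends on sample 7, which need_filter() accepts (7 & 3 == 3). */
  assert(align_samples_sketch(5, 6, 4) == 3);
  /* The next filtering boundary cannot be reached, so the count is left as-is. */
  assert(align_samples_sketch(4, 2, 4) == 2);
}
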
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
deleted file mode 100644
index 3f7cf47b692..00000000000
--- a/intern/cycles/device/device_task.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_TASK_H__
-#define __DEVICE_TASK_H__
-
-#include "device/device_memory.h"
-
-#include "util/util_function.h"
-#include "util/util_list.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-class Device;
-class RenderBuffers;
-class RenderTile;
-class RenderTileNeighbors;
-class Tile;
-
-enum DenoiserType {
- DENOISER_NLM = 1,
- DENOISER_OPTIX = 2,
- DENOISER_OPENIMAGEDENOISE = 4,
- DENOISER_NUM,
-
- DENOISER_NONE = 0,
- DENOISER_ALL = ~0,
-};
-
-enum DenoiserInput {
- DENOISER_INPUT_RGB = 1,
- DENOISER_INPUT_RGB_ALBEDO = 2,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
-
- DENOISER_INPUT_NUM,
-};
-
-typedef int DenoiserTypeMask;
-
-class DenoiseParams {
- public:
- /* Apply denoiser to image. */
- bool use;
- /* Output denoising data passes (possibly without applying the denoiser). */
- bool store_passes;
-
- /* Denoiser type. */
- DenoiserType type;
-
- /* Viewport start sample. */
- int start_sample;
-
- /** Native Denoiser. */
-
- /* Pixel radius for neighboring pixels to take into account. */
- int radius;
- /* Controls neighbor pixel weighting for the denoising filter. */
- float strength;
- /* Preserve more or less detail based on feature passes. */
- float feature_strength;
- /* When removing pixels that don't carry information,
- * use a relative threshold instead of an absolute one. */
- bool relative_pca;
- /* How many frames before and after the current center frame are included. */
- int neighbor_frames;
- /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
- bool clamp_input;
-
- /** OIDN/Optix Denoiser. */
-
- /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */
- DenoiserInput input_passes;
-
- DenoiseParams()
- {
- use = false;
- store_passes = false;
-
- type = DENOISER_NLM;
-
- radius = 8;
- strength = 0.5f;
- feature_strength = 0.5f;
- relative_pca = false;
- neighbor_frames = 2;
- clamp_input = true;
-
- /* Default to color + albedo only, since normal input does not always have the desired effect
- * when denoising with OptiX. */
- input_passes = DENOISER_INPUT_RGB_ALBEDO;
-
- start_sample = 0;
- }
-
- /* Test if a denoising task needs to run, also to prefilter passes for the native
- * denoiser when we are not applying denoising to the combined image. */
- bool need_denoising_task() const
- {
- return (use || (store_passes && type == DENOISER_NLM));
- }
-};
-
-class AdaptiveSampling {
- public:
- AdaptiveSampling();
-
- int align_samples(int sample, int num_samples) const;
- bool need_filter(int sample) const;
-
- bool use;
- int adaptive_step;
- int min_samples;
-};
-
-class DeviceTask {
- public:
- typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
- Type type;
-
- int x, y, w, h;
- device_ptr rgba_byte;
- device_ptr rgba_half;
- device_ptr buffer;
- int sample;
- int num_samples;
- int offset, stride;
-
- device_ptr shader_input;
- device_ptr shader_output;
- int shader_eval_type;
- int shader_filter;
- int shader_x, shader_w;
-
- RenderBuffers *buffers;
-
- explicit DeviceTask(Type type = RENDER);
-
- int get_subtask_count(int num, int max_size = 0) const;
- void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
-
- void update_progress(RenderTile *rtile, int pixel_samples = -1);
-
- function<bool(Device *device, RenderTile &, uint)> acquire_tile;
- function<void(long, int)> update_progress_sample;
- function<void(RenderTile &)> update_tile_sample;
- function<void(RenderTile &)> release_tile;
- function<bool()> get_cancel;
- function<bool()> get_tile_stolen;
- function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;
-
- uint tile_types;
- DenoiseParams denoising;
- bool denoising_from_render;
- vector<int> denoising_frames;
-
- int pass_stride;
- int frame_stride;
- int target_pass_stride;
- int pass_denoising_data;
- int pass_denoising_clean;
-
- bool need_finish_queue;
- bool integrator_branched;
- AdaptiveSampling adaptive_sampling;
-
- protected:
- double last_update_time;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_TASK_H__ */
diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp
index 5112fc152e5..678276ed025 100644
--- a/intern/cycles/device/device_dummy.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -14,8 +14,10 @@
* limitations under the License.
*/
+#include "device/dummy/device.h"
+
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
CCL_NAMESPACE_BEGIN
@@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN
class DummyDevice : public Device {
public:
- DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_)
+ DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_)
{
error_msg = info.error_msg;
}
@@ -61,23 +63,11 @@ class DummyDevice : public Device {
virtual void const_copy_to(const char *, void *, size_t) override
{
}
-
- virtual void task_add(DeviceTask &) override
- {
- }
-
- virtual void task_wait() override
- {
- }
-
- virtual void task_cancel() override
- {
- }
};
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new DummyDevice(info, stats, profiler, background);
+ return new DummyDevice(info, stats, profiler);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/device/dummy/device.h
index 8afaa686e28..832a9568129 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/device/dummy/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_do_volume.h"
+#pragma once
-#define KERNEL_NAME do_volume
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
new file mode 100644
index 00000000000..6dbcce2d9a5
--- /dev/null
+++ b/intern/cycles/device/multi/device.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/multi/device.h"
+
+#include <sstream>
+#include <stdlib.h>
+
+#include "bvh/bvh_multi.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "render/buffers.h"
+#include "render/geometry.h"
+
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+class MultiDevice : public Device {
+ public:
+ struct SubDevice {
+ Stats stats;
+ Device *device;
+ map<device_ptr, device_ptr> ptr_map;
+ int peer_island_index = -1;
+ };
+
+ list<SubDevice> devices;
+ device_ptr unique_key;
+ vector<vector<SubDevice *>> peer_islands;
+
+ MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), unique_key(1)
+ {
+ foreach (const DeviceInfo &subinfo, info.multi_devices) {
+ /* Always add CPU devices at the back since GPU devices can change
+ * host memory pointers, which the CPU uses as device pointers. */
+ SubDevice *sub;
+ if (subinfo.type == DEVICE_CPU) {
+ devices.emplace_back();
+ sub = &devices.back();
+ }
+ else {
+ devices.emplace_front();
+ sub = &devices.front();
+ }
+
+ /* The pointer to 'sub->stats' will stay valid even after new devices
+ * are added, since 'devices' is a linked list. */
+ sub->device = Device::create(subinfo, sub->stats, profiler);
+ }
+
+ /* Build a list of peer islands for the available render devices */
+ foreach (SubDevice &sub, devices) {
+ /* First ensure that every device is in at least one peer island */
+ if (sub.peer_island_index < 0) {
+ peer_islands.emplace_back();
+ sub.peer_island_index = (int)peer_islands.size() - 1;
+ peer_islands[sub.peer_island_index].push_back(&sub);
+ }
+
+ if (!info.has_peer_memory) {
+ continue;
+ }
+
+ /* Second check peer access between devices and fill up the islands accordingly */
+ foreach (SubDevice &peer_sub, devices) {
+ if (peer_sub.peer_island_index < 0 &&
+ peer_sub.device->info.type == sub.device->info.type &&
+ peer_sub.device->check_peer_access(sub.device)) {
+ peer_sub.peer_island_index = sub.peer_island_index;
+ peer_islands[sub.peer_island_index].push_back(&peer_sub);
+ }
+ }
+ }
+ }
+
+ ~MultiDevice()
+ {
+ foreach (SubDevice &sub, devices)
+ delete sub.device;
+ }
+
+ const string &error_message() override
+ {
+ error_msg.clear();
+
+ foreach (SubDevice &sub, devices)
+ error_msg += sub.device->error_message();
+
+ return error_msg;
+ }
+
+ virtual bool show_samples() const override
+ {
+ if (devices.size() > 1) {
+ return false;
+ }
+ return devices.front().device->show_samples();
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override
+ {
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
+ foreach (const SubDevice &sub_device, devices) {
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
+ }
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
+ return bvh_layout_mask;
+ }
+
+ bool load_kernels(const uint kernel_features) override
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->load_kernels(kernel_features))
+ return false;
+
+ return true;
+ }
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
+ {
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
+ /* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
+ foreach (SubDevice &sub, devices) {
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+ /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
+ * (since they are put into the top level directly, see bvh_embree.cpp) */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
+ }
+
+ /* Change geometry BVH pointers back to the multi BVH. */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
+ }
+
+ virtual void *get_cpu_osl_memory() override
+ {
+ if (devices.size() > 1) {
+ return NULL;
+ }
+ return devices.front().device->get_cpu_osl_memory();
+ }
+
+ bool is_resident(device_ptr key, Device *sub_device) override
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ return find_matching_mem_device(key, sub)->device == sub_device;
+ }
+ }
+ return false;
+ }
+
+ SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+ {
+ assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
+
+ /* Get the memory owner of this key (first try current device, then peer devices) */
+ SubDevice *owner_sub = &sub;
+ if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+ foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+ if (island_sub != owner_sub &&
+ island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+ owner_sub = island_sub;
+ }
+ }
+ }
+ return owner_sub;
+ }
+
+ SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+ {
+ assert(!island.empty());
+
+ /* Get the memory owner of this key or the device with the lowest memory usage when new */
+ SubDevice *owner_sub = island.front();
+ foreach (SubDevice *island_sub, island) {
+ if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+ (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+ owner_sub = island_sub;
+ }
+ }
+ return owner_sub;
+ }
+
+ inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+ {
+ return find_matching_mem_device(key, sub)->ptr_map[key];
+ }
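These helpers encode the peer-island model: sub-devices grouped into an island can access each other's allocations (for example via peer memory access between GPUs), so each buffer needs only one owner per island. Existing allocations are looked up by key in the island members' pointer maps; new allocations go to the member with the lowest reported memory usage. A standalone sketch of that ownership policy, with simplified types in place of SubDevice and device_ptr:

    #include <cstddef>
    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    using handle_t = uint64_t; /* Stand-in for device_ptr keys. */

    struct SubDev {
      std::unordered_map<handle_t, handle_t> ptr_map; /* key -> device pointer */
      size_t mem_used = 0;
    };

    /* Owner of an existing allocation: the island member whose map contains the key. */
    SubDev *find_owner(handle_t key, std::vector<SubDev *> &island)
    {
      for (SubDev *sub : island) {
        if (sub->ptr_map.count(key)) {
          return sub;
        }
      }
      return nullptr;
    }

    /* Placement for a new allocation: the island member with the least memory used. */
    SubDev *pick_owner_for_new(std::vector<SubDev *> &island)
    {
      SubDev *best = island.front();
      for (SubDev *sub : island) {
        if (sub->mem_used < best->mem_used) {
          best = sub;
        }
      }
      return best;
    }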
+
+ void mem_alloc(device_memory &mem) override
+ {
+ device_ptr key = unique_key++;
+
+ assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY);
+ /* The remaining memory types can be distributed across devices */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ owner_sub->device->mem_alloc(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void mem_copy_to(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ /* The tile buffers are allocated on each device (see below), so copy to all of them */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_copy_to(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+
+ if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+ /* Need to create texture objects and update pointer in kernel globals on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
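Note the asymmetry in `mem_copy_to` above: ordinary buffers are uploaded only to the island member that owns the allocation, while MEM_GLOBAL and MEM_TEXTURE memory is also copied to every other device in the island, because each device keeps its own texture objects and kernel-globals pointers. A compact sketch of that fan-out, again with hypothetical types:

    #include <cstddef>
    #include <vector>

    /* Hypothetical stand-ins; the real code dispatches on Cycles' MEM_GLOBAL / MEM_TEXTURE. */
    enum MemKind { MEM_BUFFER, MEM_GLOBAL_DATA, MEM_TEXTURE_DATA };

    struct Dev {
      void upload(const void * /*data*/, size_t /*size*/) {}
    };

    /* Upload to the owning island member; mirror global/texture data to the others. */
    void copy_to_island(std::vector<Dev *> &island,
                        Dev *owner,
                        MemKind kind,
                        const void *data,
                        size_t size)
    {
      owner->upload(data, size);
      if (kind == MEM_GLOBAL_DATA || kind == MEM_TEXTURE_DATA) {
        for (Dev *dev : island) {
          if (dev != owner) {
            dev->upload(data, size);
          }
        }
      }
    }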
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ {
+ device_ptr key = mem.device_pointer;
+ int i = 0, sub_h = h / devices.size();
+
+ foreach (SubDevice &sub, devices) {
+ int sy = y + i * sub_h;
+ int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+
+ SubDevice *owner_sub = find_matching_mem_device(key, sub);
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+
+ owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
+ i++;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ }
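`mem_copy_from` divides the requested rows evenly between the sub-devices and lets the last device pick up the remainder: with h = 100 rows and 3 devices the slices are 33, 33 and 34 rows starting at y, y + 33 and y + 66. A tiny self-contained sketch of the slicing, independent of the device code:

    #include <cstdio>
    #include <vector>

    struct Slice {
      int sy; /* First row of this device's slice. */
      int sh; /* Number of rows in the slice. */
    };

    /* Split h rows starting at y across n devices; the last one takes the remainder. */
    std::vector<Slice> split_rows(int y, int h, int n)
    {
      std::vector<Slice> slices;
      const int sub_h = h / n;
      for (int i = 0; i < n; i++) {
        const int sy = y + i * sub_h;
        const int sh = (i == n - 1) ? h - sub_h * i : sub_h;
        slices.push_back({sy, sh});
      }
      return slices;
    }

    int main()
    {
      for (const Slice &s : split_rows(0, 100, 3)) {
        std::printf("rows %d..%d\n", s.sy, s.sy + s.sh - 1);
      }
      /* Prints: rows 0..32, rows 33..65, rows 66..99. */
    }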
+
+ void mem_zero(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_zero(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_free(device_memory &mem) override
+ {
+ device_ptr key = mem.device_pointer;
+ size_t existing_size = mem.device_size;
+
+ /* Free memory that was allocated for all devices (see above) on each device */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_free(mem);
+ owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+ if (mem.type == MEM_TEXTURE) {
+ /* Free texture objects on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_free(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+ stats.mem_free(existing_size);
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size) override
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->const_copy_to(name, host, size);
+ }
+
+ int device_number(Device *sub_device) override
+ {
+ int i = 0;
+
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device)
+ return i;
+ i++;
+ }
+
+ return -1;
+ }
+
+ virtual void foreach_device(const function<void(Device *)> &callback) override
+ {
+ foreach (SubDevice &sub, devices) {
+ sub.device->foreach_device(callback);
+ }
+ }
+};
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new MultiDevice(info, stats, profiler);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/device/multi/device.h
index 192d01444ba..6e121014a1f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/device/multi/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
 */
 
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_background.h"
+#pragma once
 
-#define KERNEL_NAME indirect_background
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "util/util_string.h"
+#include "util/util_vector.h"
 
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
deleted file mode 100644
index a65e764b0d4..00000000000
--- a/intern/cycles/device/opencl/device_opencl.h
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-# include "util/util_task.h"
-
-# include "clew.h"
-
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Disable workarounds, seems to be working fine on latest drivers. */
-# define CYCLES_DISABLE_DRIVER_WORKAROUNDS
-
-/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */
-# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
-/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
-# undef clEnqueueNDRangeKernel
-# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueWriteBuffer
-# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueReadBuffer
-# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
-
-# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-struct OpenCLPlatformDevice {
- OpenCLPlatformDevice(cl_platform_id platform_id,
- const string &platform_name,
- cl_device_id device_id,
- cl_device_type device_type,
- const string &device_name,
- const string &hardware_id,
- const string &device_extensions)
- : platform_id(platform_id),
- platform_name(platform_name),
- device_id(device_id),
- device_type(device_type),
- device_name(device_name),
- hardware_id(hardware_id),
- device_extensions(device_extensions)
- {
- }
- cl_platform_id platform_id;
- string platform_name;
- cl_device_id device_id;
- cl_device_type device_type;
- string device_name;
- string hardware_id;
- string device_extensions;
-};
-
-/* Contains all static OpenCL helper functions. */
-class OpenCLInfo {
- public:
- static cl_device_type device_type();
- static bool use_debug();
- static bool device_supported(const string &platform_name, const cl_device_id device_id);
- static bool platform_version_check(cl_platform_id platform, string *error = NULL);
- static bool device_version_check(cl_device_id device, string *error = NULL);
- static bool get_device_version(cl_device_id device,
- int *r_major,
- int *r_minor,
- string *error = NULL);
- static string get_hardware_id(const string &platform_name, cl_device_id device_id);
- static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices);
-
- /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
-
- /* Platform information. */
- static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
- static cl_uint get_num_platforms();
-
- static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL);
- static vector<cl_platform_id> get_platforms();
-
- static bool get_platform_name(cl_platform_id platform_id, string *platform_name);
- static string get_platform_name(cl_platform_id platform_id);
-
- static bool get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error = NULL);
- static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type);
-
- static bool get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error = NULL);
- static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type);
-
- /* Device information. */
- static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL);
-
- static string get_device_name(cl_device_id device_id);
-
- static bool get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error = NULL);
-
- static string get_device_extensions(cl_device_id device_id);
-
- static bool get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error = NULL);
- static cl_device_type get_device_type(cl_device_id device_id);
-
- static bool get_driver_version(cl_device_id device_id,
- int *major,
- int *minor,
- cl_int *error = NULL);
-
- static int mem_sub_ptr_alignment(cl_device_id device_id);
-
- /* Get somewhat more readable device name.
- * Main difference is AMD OpenCL here which only gives code name
- * for the regular device name. This will give more sane device
- * name using some extensions.
- */
- static string get_readable_device_name(cl_device_id device_id);
-};
-
-/* Thread safe cache for contexts and programs.
- */
-class OpenCLCache {
- struct Slot {
- struct ProgramEntry {
- ProgramEntry();
- ProgramEntry(const ProgramEntry &rhs);
- ~ProgramEntry();
- cl_program program;
- thread_mutex *mutex;
- };
-
- Slot();
- Slot(const Slot &rhs);
- ~Slot();
-
- thread_mutex *context_mutex;
- cl_context context;
- typedef map<ustring, ProgramEntry> EntryMap;
- EntryMap programs;
- };
-
- /* key is combination of platform ID and device ID */
- typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
-
- /* map of Slot objects */
- typedef map<PlatformDevicePair, Slot> CacheMap;
- CacheMap cache;
-
- /* MD5 hash of the kernel source. */
- string kernel_md5;
-
- thread_mutex cache_lock;
- thread_mutex kernel_md5_lock;
-
- /* lazy instantiate */
- static OpenCLCache &global_instance();
-
- public:
- enum ProgramName {
- OCL_DEV_BASE_PROGRAM,
- OCL_DEV_MEGAKERNEL_PROGRAM,
- };
-
- /* Lookup context in the cache. If this returns NULL, slot_locker
- * will be holding a lock for the cache. slot_locker should refer to a
- * default constructed thread_scoped_lock. */
- static cl_context get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static cl_program get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- /* Store context in the cache. You MUST have tried to get the item before storing to it. */
- static void store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static void store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- static string get_kernel_md5();
-};
-
-# define opencl_device_assert(device, stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if ((device)->error_message() == "") { \
- (device)->set_error(message); \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
-
-# define opencl_assert(stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if (error_msg == "") { \
- error_msg = message; \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
-
-class OpenCLDevice : public Device {
- public:
- DedicatedTaskPool task_pool;
-
- /* Task pool for required kernels (base, AO kernels during foreground rendering) */
- TaskPool load_required_kernel_task_pool;
- /* Task pool for optional kernels (feature kernels during foreground rendering) */
- TaskPool load_kernel_task_pool;
- std::atomic<int> load_kernel_num_compiling;
-
- cl_context cxContext;
- cl_command_queue cqCommandQueue;
- cl_platform_id cpPlatform;
- cl_device_id cdDevice;
- cl_int ciErr;
- int device_num;
-
- class OpenCLProgram {
- public:
- OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL)
- {
- }
- OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_name,
- const string &kernel_build_options,
- bool use_stdout = true);
- ~OpenCLProgram();
-
- void add_kernel(ustring name);
-
- /* Try to load the program from device cache or disk */
- bool load();
- /* Compile the kernel (first separate, fail-back to local). */
- void compile();
- /* Create the OpenCL kernels after loading or compiling */
- void create_kernels();
-
- bool is_loaded() const
- {
- return loaded;
- }
- const string &get_log() const
- {
- return log;
- }
- void report_error();
-
- /* Wait until this kernel is available to be used
- * It will return true when the kernel is available.
- * It will return false when the kernel is not available
- * or could not be loaded. */
- bool wait_for_availability();
-
- cl_kernel operator()();
- cl_kernel operator()(ustring name);
-
- void release();
-
- private:
- bool build_kernel(const string *debug_src);
- /* Build the program by calling the own process.
- * This is required for multithreaded OpenCL compilation, since most Frameworks serialize
- * build calls internally if they come from the same process.
- * If that is not supported, this function just returns false.
- */
- bool compile_separate(const string &clbin);
- /* Build the program by calling OpenCL directly. */
- bool compile_kernel(const string *debug_src);
- /* Loading and saving the program from/to disk. */
- bool load_binary(const string &clbin, const string *debug_src = NULL);
- bool save_binary(const string &clbin);
-
- void add_log(const string &msg, bool is_debug);
- void add_error(const string &msg);
-
- bool loaded;
- bool needs_compiling;
-
- cl_program program;
- OpenCLDevice *device;
-
- /* Used for the OpenCLCache key. */
- string program_name;
-
- string kernel_file, kernel_build_options, device_md5;
-
- bool use_stdout;
- string log, error_msg;
- string compile_output;
-
- map<ustring, cl_kernel> kernels;
- };
-
- /* Container for all types of split programs. */
- class OpenCLSplitPrograms {
- public:
- OpenCLDevice *device;
- OpenCLProgram program_split;
- OpenCLProgram program_lamp_emission;
- OpenCLProgram program_do_volume;
- OpenCLProgram program_indirect_background;
- OpenCLProgram program_shader_eval;
- OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
- OpenCLProgram program_subsurface_scatter;
- OpenCLProgram program_direct_lighting;
- OpenCLProgram program_shadow_blocked_ao;
- OpenCLProgram program_shadow_blocked_dl;
-
- OpenCLSplitPrograms(OpenCLDevice *device);
- ~OpenCLSplitPrograms();
-
- /* Load the kernels and put the created kernels in the given
- * `programs` parameter. */
- void load_kernels(vector<OpenCLProgram *> &programs,
- const DeviceRequestedFeatures &requested_features);
- };
-
- DeviceSplitKernel *split_kernel;
-
- OpenCLProgram base_program;
- OpenCLProgram bake_program;
- OpenCLProgram displace_program;
- OpenCLProgram background_program;
- OpenCLProgram denoising_program;
-
- OpenCLSplitPrograms kernel_programs;
-
- typedef map<string, device_vector<uchar> *> ConstMemMap;
- typedef map<string, device_ptr> MemMap;
-
- ConstMemMap const_mem_map;
- MemMap mem_map;
-
- bool device_initialized;
- string platform_name;
- string device_name;
-
- bool opencl_error(cl_int err);
- void opencl_error(const string &message);
- void opencl_assert_err(cl_int err, const char *where);
-
- OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
- ~OpenCLDevice();
-
- static void CL_CALLBACK context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data);
-
- bool opencl_version_check();
- OpenCLSplitPrograms *get_split_programs();
-
- string device_md5_hash(string kernel_custom_build_options = "");
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- void load_required_kernels(const DeviceRequestedFeatures &requested_features);
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features);
- DeviceKernelStatus get_active_kernel_switch_state();
-
- /* Get the name of the opencl program for the given kernel */
- const string get_opencl_program_name(const string &kernel_name);
- /* Get the program file name to compile (*.cl) for the given kernel */
- const string get_opencl_program_filename(const string &kernel_name);
- string get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name);
- /* Enable the default features to reduce recompilation events */
- void enable_default_features(DeviceRequestedFeatures &features);
-
- void mem_alloc(device_memory &mem);
- void mem_copy_to(device_memory &mem);
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
- void mem_zero(device_memory &mem);
- void mem_free(device_memory &mem);
-
- int mem_sub_ptr_alignment();
-
- void const_copy_to(const char *name, void *host, size_t size);
- void global_alloc(device_memory &mem);
- void global_free(device_memory &mem);
- void tex_alloc(device_texture &mem);
- void tex_free(device_texture &mem);
-
- size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel,
- size_t w,
- size_t h,
- bool x_workgroups = false,
- size_t max_workgroup_size = -1);
- void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
- void shader(DeviceTask &task);
- void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
- void bake(DeviceTask &task, RenderTile &tile);
-
- void denoise(RenderTile &tile, DenoisingTask &denoising);
-
- int get_split_task_count(DeviceTask & /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask &task)
- {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- void thread_run(DeviceTask &task);
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- virtual bool show_samples() const
- {
- return true;
- }
-
- protected:
- string kernel_build_options(const string *debug_src = NULL);
-
- void mem_zero_kernel(device_ptr ptr, size_t size);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
- bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
- bool denoising_write_feature(int to_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size);
- void mem_free_sub_ptr(device_ptr ptr);
-
- class ArgumentWrapper {
- public:
- ArgumentWrapper() : size(0), pointer(NULL)
- {
- }
-
- ArgumentWrapper(device_memory &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_vector<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_only_memory<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
- template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument)
- {
- }
-
- ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value)
- {
- }
-
- ArgumentWrapper(float argument)
- : size(sizeof(float)), float_value(argument), pointer(&float_value)
- {
- }
-
- size_t size;
- int int_value;
- float float_value;
- void *pointer;
- };
-
- /* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. Should allow to clean this up a bit.
- */
- int kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1 = ArgumentWrapper(),
- const ArgumentWrapper &arg2 = ArgumentWrapper(),
- const ArgumentWrapper &arg3 = ArgumentWrapper(),
- const ArgumentWrapper &arg4 = ArgumentWrapper(),
- const ArgumentWrapper &arg5 = ArgumentWrapper(),
- const ArgumentWrapper &arg6 = ArgumentWrapper(),
- const ArgumentWrapper &arg7 = ArgumentWrapper(),
- const ArgumentWrapper &arg8 = ArgumentWrapper(),
- const ArgumentWrapper &arg9 = ArgumentWrapper(),
- const ArgumentWrapper &arg10 = ArgumentWrapper(),
- const ArgumentWrapper &arg11 = ArgumentWrapper(),
- const ArgumentWrapper &arg12 = ArgumentWrapper(),
- const ArgumentWrapper &arg13 = ArgumentWrapper(),
- const ArgumentWrapper &arg14 = ArgumentWrapper(),
- const ArgumentWrapper &arg15 = ArgumentWrapper(),
- const ArgumentWrapper &arg16 = ArgumentWrapper(),
- const ArgumentWrapper &arg17 = ArgumentWrapper(),
- const ArgumentWrapper &arg18 = ArgumentWrapper(),
- const ArgumentWrapper &arg19 = ArgumentWrapper(),
- const ArgumentWrapper &arg20 = ArgumentWrapper(),
- const ArgumentWrapper &arg21 = ArgumentWrapper(),
- const ArgumentWrapper &arg22 = ArgumentWrapper(),
- const ArgumentWrapper &arg23 = ArgumentWrapper(),
- const ArgumentWrapper &arg24 = ArgumentWrapper(),
- const ArgumentWrapper &arg25 = ArgumentWrapper(),
- const ArgumentWrapper &arg26 = ArgumentWrapper(),
- const ArgumentWrapper &arg27 = ArgumentWrapper(),
- const ArgumentWrapper &arg28 = ArgumentWrapper(),
- const ArgumentWrapper &arg29 = ArgumentWrapper(),
- const ArgumentWrapper &arg30 = ArgumentWrapper(),
- const ArgumentWrapper &arg31 = ArgumentWrapper(),
- const ArgumentWrapper &arg32 = ArgumentWrapper(),
- const ArgumentWrapper &arg33 = ArgumentWrapper());
-
- void release_kernel_safe(cl_kernel kernel);
- void release_mem_object_safe(cl_mem mem);
- void release_program_safe(cl_program program);
-
- /* ** Those guys are for working around some compiler-specific bugs ** */
-
- cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker);
-
- void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker);
-
- private:
- MemoryManager memory_manager;
- friend class MemoryManager;
-
- static_assert_align(TextureInfo, 16);
- device_vector<TextureInfo> texture_info;
-
- typedef map<string, device_memory *> TexturesMap;
- TexturesMap textures;
-
- bool textures_need_update;
-
- protected:
- void flush_texture_buffers();
-
- friend class OpenCLSplitKernel;
- friend class OpenCLSplitKernelFunction;
-};
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background);
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
deleted file mode 100644
index 31a2265700c..00000000000
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ /dev/null
@@ -1,2113 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-
-# include "kernel/kernel_types.h"
-# include "kernel/split/kernel_split_data_types.h"
-
-# include "util/util_algorithm.h"
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct texture_slot_t {
- texture_slot_t(const string &name, int slot) : name(name), slot(slot)
- {
- }
- string name;
- int slot;
-};
-
-static const string NON_SPLIT_KERNELS =
- "denoising "
- "base "
- "background "
- "displace ";
-
-static const string SPLIT_BUNDLE_KERNELS =
- "data_init "
- "path_init "
- "state_buffer_size "
- "scene_intersect "
- "queue_enqueue "
- "shader_setup "
- "shader_sort "
- "enqueue_inactive "
- "next_iteration_setup "
- "indirect_subsurface "
- "buffer_update "
- "adaptive_stopping "
- "adaptive_filter_x "
- "adaptive_filter_y "
- "adaptive_adjust_samples";
-
-const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
-{
- if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
- return kernel_name;
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "split_bundle";
- }
- else {
- return "split_" + kernel_name;
- }
-}
-
-const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
-{
- if (kernel_name == "denoising") {
- return "filter.cl";
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "kernel_split_bundle.cl";
- }
- else {
- return "kernel_" + kernel_name + ".cl";
- }
-}
-
-/* Enable features that we always want to compile to reduce recompilation events */
-void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
-{
- features.use_transparent = true;
- features.use_shadow_tricks = true;
- features.use_principled = true;
- features.use_denoising = true;
-
- if (!background) {
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_hair = true;
- features.use_subsurface = true;
- features.use_camera_motion = false;
- features.use_object_motion = false;
- }
-}
-
-string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name)
-{
- /* first check for non-split kernel programs */
- if (opencl_program_name == "base" || opencl_program_name == "denoising") {
- return "";
- }
- else if (opencl_program_name == "bake") {
- /* Note: get_build_options for bake is only requested when baking is enabled.
- * displace and background are always requested.
- * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_hair = true;
- features.use_subsurface = true;
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "displace") {
- /* As displacement does not use any nodes from the Shading group (eg BSDF).
- * We disable all features that are related to shading. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_baking = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_subsurface = false;
- features.use_volume = false;
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_denoising = false;
- features.use_principled = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "background") {
- /* Background uses Background shading
- * It is safe to disable shadow features, subsurface and volumetric. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_baking = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_denoising = false;
- /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
- * Perhaps we should remove them in UI as it does not make any sense when
- * rendering background. */
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_subsurface = false;
- features.use_volume = false;
- features.use_shader_raytrace = false;
- features.use_patch_evaluation = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
-
- string build_options = "-D__SPLIT_KERNEL__ ";
- /* Set compute device build option. */
- cl_device_type device_type;
- OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
- assert(this->ciErr == CL_SUCCESS);
- if (device_type == CL_DEVICE_TYPE_GPU) {
- build_options += "-D__COMPUTE_DEVICE_GPU__ ";
- }
-
- DeviceRequestedFeatures nofeatures;
- enable_default_features(nofeatures);
-
- /* Add program specific optimized compile directives */
- if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
- build_options += nofeatures.get_build_options();
- }
- else {
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
-
- /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
- * this also makes sure that the kernels that are build during baking can be reused
- * when not doing any baking. */
- features.use_baking = false;
-
- /* Do not vary on shaders when program doesn't do any shading.
- * We have bundled them in a single program. */
- if (opencl_program_name == "split_bundle") {
- features.max_nodes_group = 0;
- features.nodes_features = 0;
- features.use_shader_raytrace = false;
- }
-
- /* No specific settings, just add the regular ones */
- build_options += features.get_build_options();
- }
-
- return build_options;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
-{
- device = device_;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
-{
- program_split.release();
- program_lamp_emission.release();
- program_do_volume.release();
- program_indirect_background.release();
- program_shader_eval.release();
- program_holdout_emission_blurring_pathtermination_ao.release();
- program_subsurface_scatter.release();
- program_direct_lighting.release();
- program_shadow_blocked_ao.release();
- program_shadow_blocked_dl.release();
-}
-
-void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
- vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features)
-{
- if (!requested_features.use_baking) {
-# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
- program_split.add_kernel(ustring("path_trace_" #kernel_name));
-# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
- const string program_name_##kernel_name = "split_" #kernel_name; \
- program_##kernel_name = OpenCLDevice::OpenCLProgram( \
- device, \
- program_name_##kernel_name, \
- "kernel_" #kernel_name ".cl", \
- device->get_build_options(requested_features, program_name_##kernel_name)); \
- program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
- programs.push_back(&program_##kernel_name);
-
- /* Ordered with most complex kernels first, to reduce overall compile time. */
- ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
- ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
- ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
- if (requested_features.use_volume) {
- ADD_SPLIT_KERNEL_PROGRAM(do_volume);
- }
- ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
- ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
- ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
-
- /* Quick kernels bundled in a single program to reduce overhead of starting
- * Blender processes. */
- program_split = OpenCLDevice::OpenCLProgram(
- device,
- "split_bundle",
- "kernel_split_bundle.cl",
- device->get_build_options(requested_features, "split_bundle"));
-
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
- programs.push_back(&program_split);
-
-# undef ADD_SPLIT_KERNEL_PROGRAM
-# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
- }
-}
-
-namespace {
-
-/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
- * fetch its size.
- */
-typedef struct KernelGlobalsDummy {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- SplitData split_data;
- SplitParams split_param_data;
-} KernelGlobalsDummy;
-
-} // namespace
-
-struct CachedSplitMemory {
- int id;
- device_memory *split_data;
- device_memory *ray_state;
- device_memory *queue_index;
- device_memory *use_queues_flag;
- device_memory *work_pools;
- device_ptr *buffer;
-};
-
-class OpenCLSplitKernelFunction : public SplitKernelFunction {
- public:
- OpenCLDevice *device;
- OpenCLDevice::OpenCLProgram program;
- CachedSplitMemory &cached_memory;
- int cached_id;
-
- OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
- : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
- {
- }
-
- ~OpenCLSplitKernelFunction()
- {
- program.release();
- }
-
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
- {
- if (cached_id != cached_memory.id) {
- cl_uint start_arg_index = device->kernel_set_args(
- program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
-
- device->set_kernel_arg_buffers(program(), &start_arg_index);
-
- start_arg_index += device->kernel_set_args(program(),
- start_arg_index,
- *cached_memory.queue_index,
- *cached_memory.use_queues_flag,
- *cached_memory.work_pools,
- *cached_memory.buffer);
-
- cached_id = cached_memory.id;
- }
-
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- program(),
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- return true;
- }
-};
-
-class OpenCLSplitKernel : public DeviceSplitKernel {
- OpenCLDevice *device;
- CachedSplitMemory cached_memory;
-
- public:
- explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
- {
- }
-
- virtual SplitKernelFunction *get_split_kernel_function(
- const string &kernel_name, const DeviceRequestedFeatures &requested_features)
- {
- OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
-
- const string program_name = device->get_opencl_program_name(kernel_name);
- kernel->program = OpenCLDevice::OpenCLProgram(
- device,
- program_name,
- device->get_opencl_program_filename(kernel_name),
- device->get_build_options(requested_features, program_name));
-
- kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
- kernel->program.load();
-
- if (!kernel->program.is_loaded()) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
- }
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
- {
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_state_buffer_size = programs->program_split(
- ustring("path_trace_state_buffer_size"));
- device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
-
- size_t global_size = 64;
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_state_buffer_size,
- 1,
- NULL,
- &global_size,
- NULL,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return 0;
- }
-
- return size;
- }
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
- {
- cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
-
- /* Set the range of samples to be processed for every ray in
- * path-regeneration logic.
- */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
-
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
-
- cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
- 0,
- kernel_globals,
- kernel_data,
- split_data,
- num_global_elements,
- ray_state);
-
- device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
-
- start_arg_index += device->kernel_set_args(kernel_data_init,
- start_arg_index,
- start_sample,
- end_sample,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- queue_index,
- dQueue_size,
- use_queues_flag,
- work_pool_wgs,
- rtile.num_samples,
- rtile.buffer);
-
- /* Enqueue ckPathTraceKernel_data_init kernel. */
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_data_init,
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- cached_memory.split_data = &split_data;
- cached_memory.ray_state = &ray_state;
- cached_memory.queue_index = &queue_index;
- cached_memory.use_queues_flag = &use_queues_flag;
- cached_memory.work_pools = &work_pool_wgs;
- cached_memory.buffer = &rtile.buffer;
- cached_memory.id++;
-
- return true;
- }
-
- virtual int2 split_kernel_local_size()
- {
- return make_int2(64, 1);
- }
-
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
- {
- cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
- /* Use small global size on CPU devices as it seems to be much faster. */
- if (type == CL_DEVICE_TYPE_CPU) {
- VLOG(1) << "Global size: (64, 64).";
- return make_int2(64, 64);
- }
-
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_buffer_size = min(max_buffer_size,
- cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
- }
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
- << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
-
- /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
- max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
- int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
- (int)sqrt(num_elements));
-
- if (device->info.description.find("Intel") != string::npos) {
- global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
- }
-
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
- }
-};
-
-bool OpenCLDevice::opencl_error(cl_int err)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- return true;
- }
-
- return false;
-}
-
-void OpenCLDevice::opencl_error(const string &message)
-{
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-}
-
-void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf(
- "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-# ifndef NDEBUG
- abort();
-# endif
- }
-}
-
-OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
- : Device(info, stats, profiler, background),
- load_kernel_num_compiling(0),
- kernel_programs(this),
- memory_manager(this),
- texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- cpPlatform = NULL;
- cdDevice = NULL;
- cxContext = NULL;
- cqCommandQueue = NULL;
- device_initialized = false;
- textures_need_update = true;
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (usable_devices.size() == 0) {
- opencl_error("OpenCL: no devices found.");
- return;
- }
- assert(info.num < usable_devices.size());
- OpenCLPlatformDevice &platform_device = usable_devices[info.num];
- device_num = info.num;
- cpPlatform = platform_device.platform_id;
- cdDevice = platform_device.device_id;
- platform_name = platform_device.platform_name;
- device_name = platform_device.device_name;
- VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
- << device_name << ".";
-
- {
- /* try to use cached context */
- thread_scoped_lock cache_locker;
- cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
-
- if (cxContext == NULL) {
- /* create context properties array to specify platform */
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
-
- /* create context */
- cxContext = clCreateContext(
- context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
-
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: clCreateContext failed");
- return;
- }
-
- /* cache it */
- OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
- }
- }
-
- cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating command queue");
- return;
- }
-
- /* Allocate this right away so that texture_info
- * is placed at offset 0 in the device memory buffers. */
- texture_info.resize(1);
- memory_manager.alloc("texture_info", texture_info);
-
- device_initialized = true;
-
- split_kernel = new OpenCLSplitKernel(this);
-}
-
-OpenCLDevice::~OpenCLDevice()
-{
- task_pool.cancel();
- load_required_kernel_task_pool.cancel();
- load_kernel_task_pool.cancel();
-
- memory_manager.free();
-
- ConstMemMap::iterator mt;
- for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
- delete mt->second;
- }
-
- base_program.release();
- bake_program.release();
- displace_program.release();
- background_program.release();
- denoising_program.release();
-
- if (cqCommandQueue)
- clReleaseCommandQueue(cqCommandQueue);
- if (cxContext)
- clReleaseContext(cxContext);
-
- delete split_kernel;
-}
-
-void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data)
-{
- string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
- fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
-}
-
-bool OpenCLDevice::opencl_version_check()
-{
- string error;
- if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
- opencl_error(error);
- return false;
- }
- if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
- opencl_error(error);
- return false;
- }
- return true;
-}
-
-string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
-{
- MD5Hash md5;
- char version[256], driver[256], name[256], vendor[256];
-
- clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
- clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
-
- md5.append((uint8_t *)vendor, strlen(vendor));
- md5.append((uint8_t *)version, strlen(version));
- md5.append((uint8_t *)name, strlen(name));
- md5.append((uint8_t *)driver, strlen(driver));
-
- string options = kernel_build_options();
- options += kernel_custom_build_options;
- md5.append((uint8_t *)options.c_str(), options.size());
-
- return md5.get_hex();
-}
-
-bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
- /* Verify if device was initialized. */
- if (!device_initialized) {
- fprintf(stderr, "OpenCL: failed to initialize device.\n");
- return false;
- }
-
- /* Verify we have right opencl version. */
- if (!opencl_version_check())
- return false;
-
- load_required_kernels(requested_features);
-
- vector<OpenCLProgram *> programs;
- kernel_programs.load_kernels(programs, requested_features);
-
- if (!requested_features.use_baking && requested_features.use_denoising) {
- denoising_program = OpenCLProgram(
- this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
- denoising_program.add_kernel(ustring("filter_divide_shadow"));
- denoising_program.add_kernel(ustring("filter_get_feature"));
- denoising_program.add_kernel(ustring("filter_write_feature"));
- denoising_program.add_kernel(ustring("filter_detect_outliers"));
- denoising_program.add_kernel(ustring("filter_combine_halves"));
- denoising_program.add_kernel(ustring("filter_construct_transform"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
- denoising_program.add_kernel(ustring("filter_nlm_blur"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
- denoising_program.add_kernel(ustring("filter_nlm_update_output"));
- denoising_program.add_kernel(ustring("filter_nlm_normalize"));
- denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
- denoising_program.add_kernel(ustring("filter_finalize"));
- programs.push_back(&denoising_program);
- }
-
- load_required_kernel_task_pool.wait_work();
-
- /* Parallel compilation of Cycles kernels, this launches multiple
- * processes to workaround OpenCL frameworks serializing the calls
- * internally within a single process. */
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_kernel_num_compiling++;
- load_kernel_task_pool.push([=] {
- program->compile();
- load_kernel_num_compiling--;
- });
- }
- }
- return true;
-}
-
-void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
-{
- vector<OpenCLProgram *> programs;
- base_program = OpenCLProgram(
- this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
- base_program.add_kernel(ustring("convert_to_byte"));
- base_program.add_kernel(ustring("convert_to_half_float"));
- base_program.add_kernel(ustring("zero_buffer"));
- programs.push_back(&base_program);
-
- if (requested_features.use_true_displacement) {
- displace_program = OpenCLProgram(
- this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
- displace_program.add_kernel(ustring("displace"));
- programs.push_back(&displace_program);
- }
-
- if (requested_features.use_background_light) {
- background_program = OpenCLProgram(this,
- "background",
- "kernel_background.cl",
- get_build_options(requested_features, "background"));
- background_program.add_kernel(ustring("background"));
- programs.push_back(&background_program);
- }
-
- if (requested_features.use_baking) {
- bake_program = OpenCLProgram(
- this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
- bake_program.add_kernel(ustring("bake"));
- programs.push_back(&bake_program);
- }
-
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
-}
-
-bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
-{
- if (requested_features.use_baking) {
- /* For baking, kernels have already been loaded in load_required_kernels(). */
- return true;
- }
-
- load_kernel_task_pool.wait_work();
- return split_kernel->load_kernels(requested_features);
-}
-
-OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
-{
- return &kernel_programs;
-}
-
-DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
-{
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
-}
-
-void OpenCLDevice::mem_alloc(device_memory &mem)
-{
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- size_t size = mem.memory_size();
-
- /* check there is enough memory available for the allocation */
- cl_ulong max_alloc_size = 0;
- clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
- }
-
- if (size > max_alloc_size) {
- string error = "Scene too complex to fit in available memory.";
- if (mem.name != NULL) {
- error += string_printf(" (allocating buffer %s failed.)", mem.name);
- }
- set_error(error);
-
- return;
- }
-
- cl_mem_flags mem_flag;
- void *mem_ptr = NULL;
-
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- /* Zero-size allocation might be invoked by render, but not really
- * supported by OpenCL. Using NULL as device pointer also doesn't really
- * work for some reason, so for the time being we'll use special case
- * will null_mem buffer.
- */
- if (size != 0) {
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
- opencl_assert_err(ciErr, "clCreateBuffer");
- }
- else {
- mem.device_pointer = 0;
- }
-
- stats.mem_alloc(size);
- mem.device_size = size;
-}
-
-void OpenCLDevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* this is blocking */
- size_t size = mem.memory_size();
- if (size != 0) {
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- size,
- mem.host_pointer,
- 0,
- NULL,
- NULL));
- }
- }
-}
-
-void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- size_t offset = elem * y * w;
- size_t size = elem * w * h;
- assert(size != 0);
- opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- offset,
- size,
- (uchar *)mem.host_pointer + offset,
- 0,
- NULL,
- NULL));
-}
-
-void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
-{
- base_program.wait_for_availability();
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
-
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
-
- cl_mem d_buffer = CL_MEM_PTR(mem);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
-
- while (d_offset < size) {
- d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
-
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
-
- ciErr = clEnqueueNDRangeKernel(
- cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
-
- d_offset += d_size;
- }
-}
-
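A quick worked instance of the chunking above (constants taken from the loop; the buffer size is illustrative):

/* 1024 * 1024 threads * sizeof(float4) = 1,048,576 * 16 B = 16 MiB cleared per dispatch,
 * so zeroing a 40 MiB buffer takes three enqueues of 16, 16 and 8 MiB. */
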
-void OpenCLDevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- if (base_program.is_loaded()) {
- mem_zero_kernel(mem.device_pointer, mem.memory_size());
- }
-
- if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if (!base_program.is_loaded()) {
- void *zero = mem.host_pointer;
-
- if (!mem.host_pointer) {
- zero = util_aligned_malloc(mem.memory_size(), 16);
- memset(zero, 0, mem.memory_size());
- }
-
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- mem.memory_size(),
- zero,
- 0,
- NULL,
- NULL));
-
- if (!mem.host_pointer) {
- util_aligned_free(zero);
- }
- }
- }
-}
-
-void OpenCLDevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- if (mem.device_pointer) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-}
-
-int OpenCLDevice::mem_sub_ptr_alignment()
-{
- return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
-}
-
-device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
-{
- cl_mem_flags mem_flag;
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- cl_buffer_region info;
- info.origin = mem.memory_elements_size(offset);
- info.size = mem.memory_elements_size(size);
-
- device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
- CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
- opencl_assert_err(ciErr, "clCreateSubBuffer");
- return sub_buf;
-}
-
-void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
-{
- if (device_pointer != 0) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
- }
-}
-
-void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
-{
- ConstMemMap::iterator i = const_mem_map.find(name);
- device_vector<uchar> *data;
-
- if (i == const_mem_map.end()) {
- data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
- data->alloc(size);
- const_mem_map.insert(ConstMemMap::value_type(name, data));
- }
- else {
- data = i->second;
- }
-
- memcpy(data->data(), host, size);
- data->copy_to_device();
-}
-
-void OpenCLDevice::global_alloc(device_memory &mem)
-{
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from thinking it's
- * unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::global_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- mem.device_pointer = 0;
-
- if (memory_manager.free(mem)) {
- textures_need_update = true;
- }
-
- foreach (TexturesMap::value_type &value, textures) {
- if (value.second == &mem) {
- textures.erase(value.first);
- break;
- }
- }
- }
-}
-
-void OpenCLDevice::tex_alloc(device_texture &mem)
-{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from thinking it's
- * unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::tex_free(device_texture &mem)
-{
- global_free(mem);
-}
-
-size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
-{
- int r = global_size % group_size;
- return global_size + ((r == 0) ? 0 : group_size - r);
-}
-
-void OpenCLDevice::enqueue_kernel(
- cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
-{
- size_t workgroup_size, max_work_items[3];
-
- clGetKernelWorkGroupInfo(
- kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
- clGetDeviceInfo(
- cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
-
- if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
- workgroup_size = max_workgroup_size;
- }
-
- /* Try to divide evenly over 2 dimensions. */
- size_t local_size[2];
- if (x_workgroups) {
- local_size[0] = workgroup_size;
- local_size[1] = 1;
- }
- else {
- size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
- local_size[0] = local_size[1] = sqrt_workgroup_size;
- }
-
- /* Some implementations have max size 1 on 2nd dimension. */
- if (local_size[1] > max_work_items[1]) {
- local_size[0] = workgroup_size / max_work_items[1];
- local_size[1] = max_work_items[1];
- }
-
- size_t global_size[2] = {global_size_round_up(local_size[0], w),
- global_size_round_up(local_size[1], h)};
-
- /* A vertical size of 1 comes from the bake/shade kernels, where we should
- * not round anything up: otherwise we would either do too much work per
- * pixel (if we don't check the global ID on the Y axis) or have to check
- * that the global ID always has Y of 0.
- */
- if (h == 1) {
- global_size[1] = 1;
- }
-
- /* run kernel */
- opencl_assert(
- clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
- opencl_assert(clFlush(cqCommandQueue));
-}
-
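A worked example of the work-group sizing above (the device numbers are illustrative):

/* CL_KERNEL_WORK_GROUP_SIZE = 256 -> sqrt(256) = 16, so local size = 16 x 16.
 * For w = 1920, h = 1080: global size = 1920 x 1088 (1080 rounded up to a multiple of 16);
 * the extra work items are expected to bounds-check their global ID in the kernel.
 * With x_workgroups = true the whole group goes on X instead: local size = 256 x 1. */
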
-void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
-{
- cl_mem ptr;
-
- MemMap::iterator i = mem_map.find(name);
- if (i != mem_map.end()) {
- ptr = CL_MEM_PTR(i->second);
- }
- else {
- ptr = 0;
- }
-
- opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
-}
-
-void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- flush_texture_buffers();
-
- memory_manager.set_kernel_arg_buffers(kernel, narg);
-}
-
-void OpenCLDevice::flush_texture_buffers()
-{
- if (!textures_need_update) {
- return;
- }
- textures_need_update = false;
-
- /* Setup slots for textures. */
- int num_slots = 0;
-
- vector<texture_slot_t> texture_slots;
-
-# define KERNEL_TEX(type, name) \
- if (textures.find(#name) != textures.end()) { \
- texture_slots.push_back(texture_slot_t(#name, num_slots)); \
- } \
- num_slots++;
-# include "kernel/kernel_textures.h"
-
- int num_data_slots = num_slots;
-
- foreach (TexturesMap::value_type &tex, textures) {
- string name = tex.first;
- device_memory *mem = tex.second;
-
- if (mem->type == MEM_TEXTURE) {
- const uint id = ((device_texture *)mem)->slot;
- texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
- num_slots = max(num_slots, num_data_slots + id + 1);
- }
- }
-
- /* Realloc texture descriptors buffer. */
- memory_manager.free(texture_info);
- texture_info.resize(num_slots);
- memory_manager.alloc("texture_info", texture_info);
-
- /* Fill in descriptors */
- foreach (texture_slot_t &slot, texture_slots) {
- device_memory *mem = textures[slot.name];
- TextureInfo &info = texture_info[slot.slot];
-
- MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
-
- if (mem->type == MEM_TEXTURE) {
- info = ((device_texture *)mem)->info;
- }
- else {
- memset(&info, 0, sizeof(TextureInfo));
- }
-
- info.data = desc.offset;
- info.cl_buffer = desc.device_buffer;
- }
-
- /* Force write of descriptors. */
- memory_manager.free(texture_info);
- memory_manager.alloc("texture_info", texture_info);
-}
-
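A sketch of the slot layout built above (the entry counts are illustrative):

/* Slots 0 .. num_data_slots-1 hold the fixed KERNEL_TEX globals in the order they appear
 * in kernel/kernel_textures.h; image textures follow at num_data_slots + slot id.
 * E.g. with 3 KERNEL_TEX entries, an image texture with slot id 2 lands in texture_info[5].
 * Each TextureInfo stores the memory manager's byte offset (data) and buffer index (cl_buffer). */
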
-void OpenCLDevice::thread_run(DeviceTask &task)
-{
- flush_texture_buffers();
-
- if (task.type == DeviceTask::RENDER) {
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- /* Allocate buffer for kernel globals */
- device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- /* Keep rendering tiles until done. */
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- scoped_timer timer(&tile.buffers->render_time);
-
- split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
-
- /* Complete kernel execution before releasing the tile. */
- /* This helps in multi-device rendering:
- * the device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reached release_tile first, it would
- * stall device2 (a fast-render device) from proceeding to render
- * the next tile.
- */
- clFinish(cqCommandQueue);
- }
- else if (tile.task == RenderTile::BAKE) {
- bake(task, tile);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
- }
-
- kgbuffer.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
- }
- else if (task.type == DeviceTask::FILM_CONVERT) {
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void OpenCLDevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
- cl_mem d_buffer = CL_MEM_PTR(buffer);
- cl_int d_x = task.x;
- cl_int d_y = task.y;
- cl_int d_w = task.w;
- cl_int d_h = task.h;
- cl_float d_sample_scale = 1.0f / (task.sample + 1);
- cl_int d_offset = task.offset;
- cl_int d_stride = task.stride;
-
- cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
- base_program(ustring("convert_to_half_float"));
-
- cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
-
- set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(ckFilmConvertKernel,
- start_arg_index,
- d_sample_scale,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride);
-
- enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
-}
-
-bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- device_sub_ptr weightAccum(
- task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
- cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem out_mem = CL_MEM_PTR(out_ptr);
- cl_mem scale_mem = NULL;
-
- mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
- mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
- cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- guide_mem,
- variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- channel_offset,
- 0,
- a,
- k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(ckNLMUpdateOutput,
- 0,
- blurDifference_mem,
- image_mem,
- out_mem,
- weightAccum_mem,
- w,
- h,
- stride,
- pass_stride,
- channel_offset,
- r,
- f);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
-
- kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
- enqueue_kernel(ckNLMNormalize, w, h);
-
- return true;
-}
-
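For reference, the pass sequence this function enqueues (a summary, not new behaviour):

/* calc_difference -> blur -> calc_weight -> blur -> update_output, each over
 * w*h pixels x num_shifts = (2r+1)^2 offsets, followed by a final w x h normalize
 * pass over the accumulated output and weights. */
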
-bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
-{
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- char use_time = task->buffer.use_time ? 1 : 0;
-
- cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
-
- int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterConstructTransform,
- arg_ofs,
- transform_mem,
- rank_mem,
- task->filter_area,
- task->rect,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- use_time,
- task->radius,
- task->pca_threshold);
-
- enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- cl_mem color_mem = CL_MEM_PTR(color_ptr);
- cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
-
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
- char use_time = task->buffer.use_time ? 1 : 0;
-
- int r = task->radius;
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- color_mem,
- color_variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(ckNLMConstructGramian,
- 0,
- t,
- blurDifference_mem,
- buffer_mem,
- transform_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->reconstruction_state.filter_window,
- w,
- h,
- stride,
- pass_stride,
- r,
- 4,
- frame_offset,
- use_time);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
-
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
-
- kernel_set_args(ckFinalize,
- 0,
- output_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->filter_area,
- task->reconstruction_state.buffer_params,
- task->render_buffer.samples);
- enqueue_kernel(ckFinalize, w, h);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
-
- kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
- enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
- cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
- cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
-
- int arg_ofs = kernel_set_args(
- ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterDivideShadow,
- arg_ofs,
- a_mem,
- b_mem,
- sample_variance_mem,
- sv_variance_mem,
- buffer_variance_mem,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
-
- int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterGetFeature,
- arg_ofs,
- mean_offset,
- variance_offset,
- mean_mem,
- variance_mem,
- scale,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- cl_mem from_mem = CL_MEM_PTR(from_ptr);
- cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
-
- cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
-
- kernel_set_args(ckFilterWriteFeature,
- 0,
- task->render_buffer.samples,
- task->reconstruction_state.buffer_params,
- task->filter_area,
- from_mem,
- buffer_mem,
- out_offset,
- task->rect);
- enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
-
- cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
-
- kernel_set_args(ckFilterDetectOutliers,
- 0,
- image_mem,
- variance_mem,
- depth_mem,
- output_mem,
- task->rect,
- task->buffer.pass_stride);
- enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &OpenCLDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void OpenCLDevice::shader(DeviceTask &task)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_input = CL_MEM_PTR(task.shader_input);
- cl_mem d_output = CL_MEM_PTR(task.shader_output);
- cl_int d_shader_eval_type = task.shader_eval_type;
- cl_int d_shader_filter = task.shader_filter;
- cl_int d_shader_x = task.shader_x;
- cl_int d_shader_w = task.shader_w;
- cl_int d_offset = task.offset;
-
- OpenCLDevice::OpenCLProgram *program = &background_program;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- program = &displace_program;
- }
- program->wait_for_availability();
- cl_kernel kernel = (*program)();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
- }
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
-
- for (int sample = 0; sample < task.num_samples; sample++) {
-
- if (task.get_cancel())
- break;
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, task.shader_w, 1);
-
- clFinish(cqCommandQueue);
-
- task.update_progress(NULL);
- }
-}
-
-void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- /* Cast arguments to cl types. */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- bake_program.wait_for_availability();
- cl_kernel kernel = bake_program();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(
- kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
-
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, d_w, d_h);
- clFinish(cqCommandQueue);
-
- rtile.sample = sample + 1;
-
- task.update_progress(&rtile, rtile.w * rtile.h);
- }
-}
-
-static bool kernel_build_opencl_2(cl_device_id cdDevice)
-{
- /* Build with OpenCL 2.0 if available; this improves performance
- * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
- * Note that OpenCL selects the highest 1.x version by default;
- * only for 2.0 do we need the explicit compiler flag. */
- int version_major, version_minor;
- if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
- if (version_major >= 2) {
- /* This appears to trigger a driver bug in Radeon RX cards with certain
- * driver versions, so don't use OpenCL 2.0 for those. */
- string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
- if (string_startswith(device_name, "Radeon RX 4") ||
- string_startswith(device_name, "Radeon (TM) RX 4") ||
- string_startswith(device_name, "Radeon RX 5") ||
- string_startswith(device_name, "Radeon (TM) RX 5")) {
- char version[256] = "";
- int driver_major, driver_minor;
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
- return !(driver_major == 3075 && driver_minor <= 12);
- }
- }
-
- return true;
- }
- }
-
- return false;
-}
-
-string OpenCLDevice::kernel_build_options(const string *debug_src)
-{
- string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
- if (kernel_build_opencl_2(cdDevice)) {
- build_options += "-cl-std=CL2.0 ";
- }
-
- if (platform_name == "NVIDIA CUDA") {
- build_options +=
- "-D__KERNEL_OPENCL_NVIDIA__ "
- "-cl-nv-maxrregcount=32 "
- "-cl-nv-verbose ";
-
- uint compute_capability_major, compute_capability_minor;
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
- sizeof(cl_uint),
- &compute_capability_major,
- NULL);
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
- sizeof(cl_uint),
- &compute_capability_minor,
- NULL);
-
- build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
- compute_capability_major * 100 + compute_capability_minor * 10);
- }
-
- else if (platform_name == "Apple")
- build_options += "-D__KERNEL_OPENCL_APPLE__ ";
-
- else if (platform_name == "AMD Accelerated Parallel Processing")
- build_options += "-D__KERNEL_OPENCL_AMD__ ";
-
- else if (platform_name == "Intel(R) OpenCL") {
- build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
-
- /* Options for gdb source-level kernel debugging.
- * This currently segfaults on Linux.
- */
- if (OpenCLInfo::use_debug() && debug_src)
- build_options += "-g -s \"" + *debug_src + "\" ";
- }
-
- if (info.has_half_images) {
- build_options += "-D__KERNEL_CL_KHR_FP16__ ";
- }
-
- if (OpenCLInfo::use_debug()) {
- build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
- }
-
-# ifdef WITH_NANOVDB
- if (info.has_nanovdb) {
- build_options += "-DWITH_NANOVDB ";
- }
-# endif
-
- return build_options;
-}
-
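For concreteness, one possible result of the options assembled above (device details are hypothetical):

/* NVIDIA device, compute capability 6.1, reporting OpenCL 1.2 (so no -cl-std=CL2.0),
 * half-float images supported, debug off:
 *   -cl-no-signed-zeros -cl-mad-enable -D__KERNEL_OPENCL_NVIDIA__
 *   -cl-nv-maxrregcount=32 -cl-nv-verbose -D__COMPUTE_CAPABILITY__=610
 *   -D__KERNEL_CL_KHR_FP16__ */
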
-/* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. That should allow us to clean this up a bit.
- */
-int OpenCLDevice::kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1,
- const ArgumentWrapper &arg2,
- const ArgumentWrapper &arg3,
- const ArgumentWrapper &arg4,
- const ArgumentWrapper &arg5,
- const ArgumentWrapper &arg6,
- const ArgumentWrapper &arg7,
- const ArgumentWrapper &arg8,
- const ArgumentWrapper &arg9,
- const ArgumentWrapper &arg10,
- const ArgumentWrapper &arg11,
- const ArgumentWrapper &arg12,
- const ArgumentWrapper &arg13,
- const ArgumentWrapper &arg14,
- const ArgumentWrapper &arg15,
- const ArgumentWrapper &arg16,
- const ArgumentWrapper &arg17,
- const ArgumentWrapper &arg18,
- const ArgumentWrapper &arg19,
- const ArgumentWrapper &arg20,
- const ArgumentWrapper &arg21,
- const ArgumentWrapper &arg22,
- const ArgumentWrapper &arg23,
- const ArgumentWrapper &arg24,
- const ArgumentWrapper &arg25,
- const ArgumentWrapper &arg26,
- const ArgumentWrapper &arg27,
- const ArgumentWrapper &arg28,
- const ArgumentWrapper &arg29,
- const ArgumentWrapper &arg30,
- const ArgumentWrapper &arg31,
- const ArgumentWrapper &arg32,
- const ArgumentWrapper &arg33)
-{
- int current_arg_index = 0;
-# define FAKE_VARARG_HANDLE_ARG(arg) \
- do { \
- if (arg.pointer != NULL) { \
- opencl_assert(clSetKernelArg( \
- kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
- ++current_arg_index; \
- } \
- else { \
- return current_arg_index; \
- } \
- } while (false)
- FAKE_VARARG_HANDLE_ARG(arg1);
- FAKE_VARARG_HANDLE_ARG(arg2);
- FAKE_VARARG_HANDLE_ARG(arg3);
- FAKE_VARARG_HANDLE_ARG(arg4);
- FAKE_VARARG_HANDLE_ARG(arg5);
- FAKE_VARARG_HANDLE_ARG(arg6);
- FAKE_VARARG_HANDLE_ARG(arg7);
- FAKE_VARARG_HANDLE_ARG(arg8);
- FAKE_VARARG_HANDLE_ARG(arg9);
- FAKE_VARARG_HANDLE_ARG(arg10);
- FAKE_VARARG_HANDLE_ARG(arg11);
- FAKE_VARARG_HANDLE_ARG(arg12);
- FAKE_VARARG_HANDLE_ARG(arg13);
- FAKE_VARARG_HANDLE_ARG(arg14);
- FAKE_VARARG_HANDLE_ARG(arg15);
- FAKE_VARARG_HANDLE_ARG(arg16);
- FAKE_VARARG_HANDLE_ARG(arg17);
- FAKE_VARARG_HANDLE_ARG(arg18);
- FAKE_VARARG_HANDLE_ARG(arg19);
- FAKE_VARARG_HANDLE_ARG(arg20);
- FAKE_VARARG_HANDLE_ARG(arg21);
- FAKE_VARARG_HANDLE_ARG(arg22);
- FAKE_VARARG_HANDLE_ARG(arg23);
- FAKE_VARARG_HANDLE_ARG(arg24);
- FAKE_VARARG_HANDLE_ARG(arg25);
- FAKE_VARARG_HANDLE_ARG(arg26);
- FAKE_VARARG_HANDLE_ARG(arg27);
- FAKE_VARARG_HANDLE_ARG(arg28);
- FAKE_VARARG_HANDLE_ARG(arg29);
- FAKE_VARARG_HANDLE_ARG(arg30);
- FAKE_VARARG_HANDLE_ARG(arg31);
- FAKE_VARARG_HANDLE_ARG(arg32);
- FAKE_VARARG_HANDLE_ARG(arg33);
-# undef FAKE_VARARG_HANDLE_ARG
- return current_arg_index;
-}
-
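The TODO above mentions variadic templates; a minimal sketch of that idea, assuming C++11 were allowed (the ArgumentWrapper conversions, e.g. for device_memory, would still need their own overloads):

/* Base case: nothing left to set. */
inline int kernel_set_args_variadic(cl_kernel /*kernel*/, cl_uint /*index*/)
{
  return 0;
}

/* Set one argument, then recurse over the rest. */
template<typename First, typename... Rest>
inline int kernel_set_args_variadic(cl_kernel kernel, cl_uint index, const First &first, const Rest &... rest)
{
  clSetKernelArg(kernel, index, sizeof(First), &first);
  return 1 + kernel_set_args_variadic(kernel, index + 1, rest...);
}
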
-void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
-{
- if (kernel) {
- clReleaseKernel(kernel);
- }
-}
-
-void OpenCLDevice::release_mem_object_safe(cl_mem mem)
-{
- if (mem != NULL) {
- clReleaseMemObject(mem);
- }
-}
-
-void OpenCLDevice::release_program_safe(cl_program program)
-{
- if (program) {
- clReleaseProgram(program);
- }
-}
-
-/* ** These helpers work around some compiler-specific bugs ** */
-
-cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
-{
- return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
-}
-
-void OpenCLDevice::store_cached_kernel(cl_program program,
- ustring key,
- thread_scoped_lock &cache_locker)
-{
- OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
-}
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background)
-{
- return new OpenCLDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
deleted file mode 100644
index 4330e07cb37..00000000000
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "util/util_foreach.h"
-
-# include "device/opencl/device_opencl.h"
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation)
-{
- allocations.push_back(&allocation);
-}
-
-void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device)
-{
- bool need_realloc = false;
-
- /* Calculate total size and remove any freed. */
- size_t total_size = 0;
-
- for (int i = allocations.size() - 1; i >= 0; i--) {
- Allocation *allocation = allocations[i];
-
- /* Remove allocations that have been freed. */
- if (!allocation->mem || allocation->mem->memory_size() == 0) {
- allocation->device_buffer = NULL;
- allocation->size = 0;
-
- allocations.erase(allocations.begin() + i);
-
- need_realloc = true;
-
- continue;
- }
-
- /* Get actual size for allocation. */
- size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
-
- if (allocation->size != alloc_size) {
- /* Allocation is either new or resized. */
- allocation->size = alloc_size;
- allocation->needs_copy_to_device = true;
-
- need_realloc = true;
- }
-
- total_size += alloc_size;
- }
-
- /* Always allocate a non-empty buffer; NULL pointers cause problems with some drivers. */
- total_size = std::max(total_size, (size_t)16);
-
- if (need_realloc) {
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (total_size > max_buffer_size) {
- device->set_error("Scene too complex to fit in available memory.");
- return;
- }
-
- device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device,
- "memory manager buffer");
-
- new_buffer->alloc_to_device(total_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(new_buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
- else {
- /* Fast copy from memory already on device. */
- opencl_device_assert(device,
- clEnqueueCopyBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_MEM_PTR(new_buffer->device_pointer),
- allocation->desc.offset,
- offset,
- allocation->mem->memory_size(),
- 0,
- NULL,
- NULL));
- }
-
- allocation->desc.offset = offset;
- offset += allocation->size;
- }
-
- delete buffer;
-
- buffer = new_buffer;
- }
- else {
- assert(total_size == buffer->data_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
-
- offset += allocation->size;
- }
- }
-
- /* Not really necessary, but seems to improve responsiveness for some reason. */
- clFinish(device->cqCommandQueue);
-}
-
-void MemoryManager::DeviceBuffer::free(OpenCLDevice *)
-{
- buffer->free();
-}
-
-MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer()
-{
- DeviceBuffer *smallest = device_buffers;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.size < smallest->size) {
- smallest = &device_buffer;
- }
- }
-
- return smallest;
-}
-
-MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false)
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer");
- }
-}
-
-void MemoryManager::free()
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.free(device);
- }
-}
-
-void MemoryManager::alloc(const char *name, device_memory &mem)
-{
- Allocation &allocation = allocations[name];
-
- allocation.mem = &mem;
- allocation.needs_copy_to_device = true;
-
- if (!allocation.device_buffer) {
- DeviceBuffer *device_buffer = smallest_device_buffer();
- allocation.device_buffer = device_buffer;
-
- allocation.desc.device_buffer = device_buffer - device_buffers;
-
- device_buffer->add_allocation(allocation);
-
- device_buffer->size += mem.memory_size();
- }
-
- need_update = true;
-}
-
-bool MemoryManager::free(device_memory &mem)
-{
- foreach (AllocationsMap::value_type &value, allocations) {
- Allocation &allocation = value.second;
- if (allocation.mem == &mem) {
-
- allocation.device_buffer->size -= mem.memory_size();
-
- allocation.mem = NULL;
- allocation.needs_copy_to_device = false;
-
- need_update = true;
- return true;
- }
- }
-
- return false;
-}
-
-MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
-{
- update_device_memory();
-
- Allocation &allocation = allocations[name];
- return allocation.desc;
-}
-
-void MemoryManager::update_device_memory()
-{
- if (!need_update) {
- return;
- }
-
- need_update = false;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.update_device_memory(device);
- }
-}
-
-void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- update_device_memory();
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.buffer->device_pointer) {
- device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
- }
- else {
- device->kernel_set_args(kernel, (*narg)++);
- }
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
deleted file mode 100644
index 23624f837a6..00000000000
--- a/intern/cycles/device/opencl/memory_manager.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "device/device.h"
-
-#include "util/util_map.h"
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-#include "clew.h"
-
-CCL_NAMESPACE_BEGIN
-
-class OpenCLDevice;
-
-class MemoryManager {
- public:
- static const int NUM_DEVICE_BUFFERS = 8;
-
- struct BufferDescriptor {
- uint device_buffer;
- cl_ulong offset;
- };
-
- private:
- struct DeviceBuffer;
-
- struct Allocation {
- device_memory *mem;
-
- DeviceBuffer *device_buffer;
- size_t size; /* Size of actual allocation, may be larger than requested. */
-
- BufferDescriptor desc;
-
- bool needs_copy_to_device;
-
- Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
- {
- }
- };
-
- struct DeviceBuffer {
- device_only_memory<uchar> *buffer;
- vector<Allocation *> allocations;
- size_t size; /* Size of all allocations. */
-
- DeviceBuffer() : buffer(NULL), size(0)
- {
- }
-
- ~DeviceBuffer()
- {
- delete buffer;
- buffer = NULL;
- }
-
- void add_allocation(Allocation &allocation);
-
- void update_device_memory(OpenCLDevice *device);
-
- void free(OpenCLDevice *device);
- };
-
- OpenCLDevice *device;
-
- DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
-
- typedef unordered_map<string, Allocation> AllocationsMap;
- AllocationsMap allocations;
-
- bool need_update;
-
- DeviceBuffer *smallest_device_buffer();
-
- public:
- MemoryManager(OpenCLDevice *device);
-
- void free(); /* Free all memory. */
-
- void alloc(const char *name, device_memory &mem);
- bool free(device_memory &mem);
-
- BufferDescriptor get_descriptor(string name);
-
- void update_device_memory();
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-};
-
-CCL_NAMESPACE_END
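A minimal usage sketch of this class as the OpenCL device drives it (the flow mirrors the call sites in the device code above; the variable names are illustrative):

/* mem is any MEM_GLOBAL/MEM_TEXTURE device_memory; device is the owning OpenCLDevice. */
MemoryManager manager(device);
manager.alloc("some_global", mem); /* recorded; packed into a backing buffer lazily */

MemoryManager::BufferDescriptor desc = manager.get_descriptor("some_global");
/* desc.device_buffer: which of the NUM_DEVICE_BUFFERS backing buffers holds it,
 * desc.offset:        byte offset of this allocation inside that buffer. */

cl_uint narg = 0;
manager.set_kernel_arg_buffers(kernel, &narg); /* bind every backing buffer as a kernel arg */
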
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
deleted file mode 100644
index 3929cf77f15..00000000000
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ /dev/null
@@ -1,1326 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device_intern.h"
-# include "device/opencl/device_opencl.h"
-
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_semaphore.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-
-using std::cerr;
-using std::endl;
-
-CCL_NAMESPACE_BEGIN
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs)
- : program(rhs.program), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
-{
- delete mutex;
-}
-
-OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL)
-{
-}
-
-OpenCLCache::Slot::Slot(const Slot &rhs)
- : context_mutex(NULL), context(NULL), programs(rhs.programs)
-{
-}
-
-OpenCLCache::Slot::~Slot()
-{
- delete context_mutex;
-}
-
-OpenCLCache &OpenCLCache::global_instance()
-{
- static OpenCLCache instance;
- return instance;
-}
-
-cl_context OpenCLCache::get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!slot.context_mutex)
- slot.context_mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*slot.context_mutex);
-
- /* If the context is not cached yet. */
- if (slot.context == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainContext(slot.context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return slot.context;
-}
-
-cl_program OpenCLCache::get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert(
- Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
-
- Slot::ProgramEntry &entry = ins2.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!entry.mutex)
- entry.mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*entry.mutex);
-
- /* If the program is not cached yet. */
- if (entry.program == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainProgram(entry.program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return entry.program;
-}
-
-void OpenCLCache::store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(context != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- cache_lock.unlock();
-
- /* sanity check */
- assert(i != self.cache.end());
-
- Slot &slot = i->second;
- assert(slot.context == NULL);
-
- slot.context = context;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* increment reference count in OpenCL.
- * The caller is going to release the object when done with it. */
- cl_int ciErr = clRetainContext(context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-void OpenCLCache::store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(program != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- assert(i != self.cache.end());
- Slot &slot = i->second;
-
- Slot::EntryMap::iterator i2 = slot.programs.find(key);
- assert(i2 != slot.programs.end());
- Slot::ProgramEntry &entry = i2->second;
-
- assert(entry.program == NULL);
-
- cache_lock.unlock();
-
- entry.program = program;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* Increment reference count in OpenCL.
- * The caller is going to release the object when done with it.
- */
- cl_int ciErr = clRetainProgram(program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-string OpenCLCache::get_kernel_md5()
-{
- OpenCLCache &self = global_instance();
- thread_scoped_lock lock(self.kernel_md5_lock);
-
- if (self.kernel_md5.empty()) {
- self.kernel_md5 = path_files_md5_hash(path_get("source"));
- }
- return self.kernel_md5;
-}
-
-static string get_program_source(const string &kernel_file)
-{
- string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
- /* We compile kernels consisting of many files. Unfortunately, OpenCL
- * kernel caches do not seem to recognize changes in included files,
- * so we force a recompile on changes by adding the MD5 hash of all files.
- */
- source = path_source_replace_includes(source, path_get("source"));
- source += "\n// " + util_md5_string(source) + "\n";
- return source;
-}
-
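Roughly what this produces, for a hypothetical kernel file name:

/* get_program_source("kernel_base.cl") builds
 *   #include "kernel/kernels/opencl/kernel_base.cl"
 * then expands the Cycles includes in-line and appends
 *   // <MD5 of the expanded source>
 * so any change in an included file changes the trailing hash and defeats stale
 * driver-side program caches.  The file name here is hypothetical. */
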
-OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_file,
- const string &kernel_build_options,
- bool use_stdout)
- : device(device),
- program_name(program_name),
- kernel_file(kernel_file),
- kernel_build_options(kernel_build_options),
- use_stdout(use_stdout)
-{
- loaded = false;
- needs_compiling = true;
- program = NULL;
-}
-
-OpenCLDevice::OpenCLProgram::~OpenCLProgram()
-{
- release();
-}
-
-void OpenCLDevice::OpenCLProgram::release()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- if (kernel->second) {
- clReleaseKernel(kernel->second);
- kernel->second = NULL;
- }
- }
- if (program) {
- clReleaseProgram(program);
- program = NULL;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug)
-{
- if (!use_stdout) {
- log += msg + "\n";
- }
- else if (!debug) {
- printf("%s\n", msg.c_str());
- fflush(stdout);
- }
- else {
- VLOG(2) << msg;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_error(const string &msg)
-{
- if (use_stdout) {
- fprintf(stderr, "%s\n", msg.c_str());
- }
- if (error_msg == "") {
- error_msg += "\n";
- }
- error_msg += msg;
-}
-
-void OpenCLDevice::OpenCLProgram::add_kernel(ustring name)
-{
- if (!kernels.count(name)) {
- kernels[name] = NULL;
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src)
-{
- string build_options;
- build_options = device->kernel_build_options(debug_src) + kernel_build_options;
-
- VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'.";
- cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- /* show warnings even if build is successful */
- size_t ret_val_size = 0;
-
- clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) +
- ", errors in console.");
- }
-
- if (ret_val_size > 1) {
- vector<char> build_log(ret_val_size + 1);
- clGetProgramBuildInfo(
- program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
-
- build_log[ret_val_size] = '\0';
- /* Skip meaningless empty output from the NVidia compiler. */
- if (!(ret_val_size == 2 && build_log[0] == '\n')) {
- add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]),
- ciErr == CL_SUCCESS);
- }
- }
-
- return (ciErr == CL_SUCCESS);
-}
-
-bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src)
-{
- string source = get_program_source(kernel_file);
-
- if (debug_src) {
- path_write_text(*debug_src, source);
- }
-
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_int ciErr;
-
- program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
- return false;
- }
-
- double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
-
- if (!build_kernel(debug_src))
- return false;
-
- double elapsed = time_dt() - starttime;
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return true;
-}
-
-static void escape_python_string(string &str)
-{
- /* Escape string to be passed as a Python raw string with '' quotes'. */
- string_replace(str, "'", "\'");
-}
-
-static int opencl_compile_process_limit()
-{
- /* Limit the number of concurrent compile processes, with a heuristic based
- * on total physical RAM and an estimate of the memory needed when compiling
- * with all Cycles features enabled.
- *
- * This is somewhat arbitrary as we don't know the actual available RAM or
- * how much memory the kernel compilation will need depending on the features,
- * but it is better than not limiting at all. */
- static const int64_t GB = 1024LL * 1024LL * 1024LL;
- static const int64_t process_memory = 2 * GB;
- static const int64_t base_memory = 2 * GB;
- static const int64_t system_memory = system_physical_ram();
- static const int64_t process_limit = (system_memory - base_memory) / process_memory;
-
- return max((int)process_limit, 1);
-}
-
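A worked instance of the limit above (RAM sizes are illustrative):

/* 16 GB system: (16 GB - 2 GB base) / 2 GB per process = 7 concurrent compiles.
 *  4 GB system: (4 GB - 2 GB) / 2 GB = 1.
 * Anything smaller is clamped to at least one process by the max() above. */
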
-bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
-{
- /* Construct arguments. */
- vector<string> args;
- args.push_back("--background");
- args.push_back("--factory-startup");
- args.push_back("--python-expr");
-
- int device_platform_id = device->device_num;
- string device_name = device->device_name;
- string platform_name = device->platform_name;
- string build_options = device->kernel_build_options(NULL) + kernel_build_options;
- string kernel_file_escaped = kernel_file;
- string clbin_escaped = clbin;
-
- escape_python_string(device_name);
- escape_python_string(platform_name);
- escape_python_string(build_options);
- escape_python_string(kernel_file_escaped);
- escape_python_string(clbin_escaped);
-
- args.push_back(string_printf(
- "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')",
- device_platform_id,
- device_name.c_str(),
- platform_name.c_str(),
- build_options.c_str(),
- kernel_file_escaped.c_str(),
- clbin_escaped.c_str()));
-
- /* Limit number of concurrent processes compiling. */
- static thread_counting_semaphore semaphore(opencl_compile_process_limit());
- semaphore.acquire();
-
- /* Compile. */
- const double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
- const bool success = system_call_self(args);
- const double elapsed = time_dt() - starttime;
-
- semaphore.release();
-
- if (!success || !path_exists(clbin)) {
- return false;
- }
-
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return load_binary(clbin);
-}
-
-/* Compile an OpenCL kernel. This function is called from the _cycles Python
- * module to compile kernels. Parameters must match the function above. */
-bool device_opencl_compile_kernel(const vector<string> &parameters)
-{
- int device_platform_id = std::stoi(parameters[0]);
- const string &device_name = parameters[1];
- const string &platform_name = parameters[2];
- const string &build_options = parameters[3];
- const string &kernel_file = parameters[4];
- const string &binary_path = parameters[5];
-
- if (clewInit() != CLEW_SUCCESS) {
- return false;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (device_platform_id >= usable_devices.size()) {
- return false;
- }
-
- OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id];
- if (platform_device.platform_name != platform_name ||
- platform_device.device_name != device_name) {
- return false;
- }
-
- cl_platform_id platform = platform_device.platform_id;
- cl_device_id device = platform_device.device_id;
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0};
-
- cl_int err;
- cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
- if (err != CL_SUCCESS) {
- return false;
- }
-
- string source = get_program_source(kernel_file);
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
- bool result = false;
-
- if (err == CL_SUCCESS) {
- err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- if (err == CL_SUCCESS) {
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
- if (size > 0) {
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
- result = path_write_binary(binary_path, binary);
- }
- }
- clReleaseProgram(program);
- }
-
- clReleaseContext(context);
-
- return result;
-}
-
-bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src)
-{
- /* read binary into memory */
- vector<uint8_t> binary;
-
- if (!path_read_binary(clbin, binary)) {
- add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
- return false;
- }
-
- /* create program */
- cl_int status, ciErr;
- size_t size = binary.size();
- const uint8_t *bytes = &binary[0];
-
- program = clCreateProgramWithBinary(
- device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr);
-
- if (status != CL_SUCCESS || ciErr != CL_SUCCESS) {
-    add_error(string("OpenCL failed to create program from cached binary ") + clbin + ": " +
- clewErrorString(status) + " " + clewErrorString(ciErr));
- return false;
- }
-
- if (!build_kernel(debug_src))
- return false;
-
- return true;
-}
-
-bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin)
-{
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
-
- if (!size)
- return false;
-
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
-
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
-
- return path_write_binary(clbin, binary);
-}
-
-bool OpenCLDevice::OpenCLProgram::load()
-{
- loaded = false;
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
- if (!program) {
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
-    /* If the binary kernel exists already, try to use it. */
- if (path_exists(clbin) && load_binary(clbin)) {
- /* Kernel loaded from binary, nothing to do. */
- add_log(string("Loaded program from ") + clbin + ".", true);
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
- else {
- add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
- cache_locker.unlock();
- }
- }
-
- if (program) {
- create_kernels();
- loaded = true;
- needs_compiling = false;
- }
-
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::compile()
-{
- assert(device);
-
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
-
- if (!program) {
-
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* path to preprocessed source for debugging */
- string clsrc, *debug_src = NULL;
-
- if (OpenCLInfo::use_debug()) {
- clsrc = basename + ".cl";
- debug_src = &clsrc;
- }
-
- if (DebugFlags().running_inside_blender && compile_separate(clbin)) {
- add_log(string("Built and loaded program from ") + clbin + ".", true);
- loaded = true;
- }
- else {
- if (DebugFlags().running_inside_blender) {
- add_log(string("Separate-process building of ") + clbin +
- " failed, will fall back to regular building.",
- true);
- }
-
-      /* If it does not exist or loading the binary failed, compile the kernel. */
- if (!compile_kernel(debug_src)) {
- needs_compiling = false;
- return;
- }
-
- /* Save binary for reuse. */
- if (!save_binary(clbin)) {
- add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
- }
- }
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
-
- create_kernels();
- needs_compiling = false;
- loaded = true;
-}
-
-void OpenCLDevice::OpenCLProgram::create_kernels()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- assert(kernel->second == NULL);
- cl_int ciErr;
- string name = "kernel_ocl_" + kernel->first.string();
- kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
- if (device->opencl_error(ciErr)) {
- add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " +
- clewErrorString(ciErr));
- return;
- }
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::wait_for_availability()
-{
- add_log(string("Waiting for availability of ") + program_name + ".", true);
- while (needs_compiling) {
- time_sleep(0.1);
- }
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::report_error()
-{
- /* If loaded is true, there was no error. */
- if (loaded)
- return;
- /* if use_stdout is true, the error was already reported. */
- if (use_stdout)
- return;
-
- cerr << error_msg << endl;
- if (!compile_output.empty()) {
- cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
- cerr << compile_output << endl;
- }
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()()
-{
- assert(kernels.size() == 1);
- return kernels.begin()->second;
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name)
-{
- assert(kernels.count(name));
- return kernels[name];
-}
-
-cl_device_type OpenCLInfo::device_type()
-{
- switch (DebugFlags().opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- return 0;
- case DebugFlags::OpenCL::DEVICE_ALL:
- return CL_DEVICE_TYPE_ALL;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- return CL_DEVICE_TYPE_DEFAULT;
- case DebugFlags::OpenCL::DEVICE_CPU:
- return CL_DEVICE_TYPE_CPU;
- case DebugFlags::OpenCL::DEVICE_GPU:
- return CL_DEVICE_TYPE_GPU;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- return CL_DEVICE_TYPE_ACCELERATOR;
- default:
- return CL_DEVICE_TYPE_ALL;
- }
-}
-
-bool OpenCLInfo::use_debug()
-{
- return DebugFlags().opencl.debug;
-}
-
-bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return false;
- }
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return false;
- }
-
- int driver_major = 0;
- int driver_minor = 0;
- if (!get_driver_version(device_id, &driver_major, &driver_minor)) {
- return false;
- }
- VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
-
- if (getenv("CYCLES_OPENCL_TEST")) {
- return true;
- }
-
- /* Allow Intel GPUs on Intel OpenCL platform. */
- if (platform_name.find("Intel") != string::npos) {
- if (device_type != CL_DEVICE_TYPE_GPU) {
- /* OpenCL on Intel CPU is not an officially supported configuration.
- * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. */
- return false;
- }
-
-# ifdef __APPLE__
-    /* Apple uses its own framework, which can also put Iris onto the AMD framework.
-     * This isn't a supported configuration. */
- return false;
-# else
- if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) {
- return true;
- }
-# endif
- }
-
- if (platform_name == "AMD Accelerated Parallel Processing" &&
- device_type == CL_DEVICE_TYPE_GPU) {
- if (driver_major < 2236) {
- VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
- return false;
- }
- const char *blacklist[] = {/* GCN 1 */
- "Tahiti",
- "Pitcairn",
- "Capeverde",
- "Oland",
- "Hainan",
- NULL};
- for (int i = 0; blacklist[i] != NULL; i++) {
- if (device_name == blacklist[i]) {
- VLOG(1) << "AMD device " << device_name << " not supported";
- return false;
- }
- }
- return true;
- }
- if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
- return false;
- }
- return false;
-}
-
-bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- char version[256];
- clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
- }
- return false;
- }
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf(
- "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
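The same sscanf-based parsing recurs below for the OpenCL C and driver version strings; a self-contained sketch with an illustrative version string:

#include <cstdio>

int main()
{
  const char *version = "OpenCL 1.2 CUDA 11.4.112"; /* Example CL_PLATFORM_VERSION string. */
  int major = 0, minor = 0;
  if (std::sscanf(version, "OpenCL %d.%d", &major, &minor) == 2) {
    const bool supported = (major > 1) || (major == 1 && minor >= 1); /* Require 1.1 or later. */
    std::printf("parsed %d.%d, supported: %d\n", major, minor, supported);
  }
  return 0;
}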
-bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error)
-{
- char version[256];
- clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- if (!get_device_version(device, &major, &minor, error)) {
- return false;
- }
-
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id)
-{
- if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
- /* Use cl_amd_device_topology extension. */
- cl_char topology[24];
- if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS &&
- topology[0] == 1) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)topology[21],
- (unsigned int)topology[22],
- (unsigned int)topology[23]);
- }
- }
- else if (platform_name == "NVIDIA CUDA") {
- /* Use two undocumented options of the cl_nv_device_attribute_query extension. */
- cl_int bus_id, slot_id;
- if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS &&
- clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)(bus_id),
- (unsigned int)(slot_id >> 3),
- (unsigned int)(slot_id & 0x7));
- }
- }
- /* No general way to get a hardware ID from OpenCL => give up. */
- return "";
-}
-
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
-{
- const cl_device_type device_type = OpenCLInfo::device_type();
- static bool first_time = true;
-# define FIRST_VLOG(severity) \
- if (first_time) \
- VLOG(severity)
-
- usable_devices->clear();
-
- if (device_type == 0) {
- FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
- first_time = false;
- return;
- }
-
- cl_int error;
- vector<cl_device_id> device_ids;
- vector<cl_platform_id> platform_ids;
-
- /* Get platforms. */
- if (!get_platforms(&platform_ids, &error)) {
- FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error));
- first_time = false;
- return;
- }
- if (platform_ids.size() == 0) {
- FIRST_VLOG(2) << "No OpenCL platforms were found.";
- first_time = false;
- return;
- }
- /* Devices are numbered consecutively across platforms. */
- for (int platform = 0; platform < platform_ids.size(); platform++) {
- cl_platform_id platform_id = platform_ids[platform];
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
- continue;
- }
- FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << ".";
- if (!platform_version_check(platform_id)) {
-      FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << " because its compiler version is too old.";
- continue;
- }
- if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
-                    << ", failed to fetch devices: " << string(clewErrorString(error));
- continue;
- }
- if (device_ids.size() == 0) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices.";
- continue;
- }
- for (int num = 0; num < device_ids.size(); num++) {
- const cl_device_id device_id = device_ids[num];
- string device_name;
- if (!get_device_name(device_id, &device_name, &error)) {
- FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error))
- << ", ignoring.";
- continue;
- }
- if (!device_version_check(device_id)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version.";
- continue;
- }
- if (device_supported(platform_name, device_id)) {
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type, &error)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name
- << ", failed to fetch device type:" << string(clewErrorString(error));
- continue;
- }
- string readable_device_name = get_readable_device_name(device_id);
- if (readable_device_name != device_name) {
- FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name;
- }
- FIRST_VLOG(2) << "Adding new device " << readable_device_name << ".";
- string hardware_id = get_hardware_id(platform_name, device_id);
- string device_extensions = get_device_extensions(device_id);
- usable_devices->push_back(OpenCLPlatformDevice(platform_id,
- platform_name,
- device_id,
- device_type,
- readable_device_name,
- hardware_id,
- device_extensions));
- }
- else {
- FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet.";
- }
- }
- }
- first_time = false;
-}
-
-bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error)
-{
- /* Reset from possible previous state. */
- platform_ids->resize(0);
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms, error)) {
- return false;
- }
- /* Get actual platforms. */
- cl_int err;
- platform_ids->resize(num_platforms);
- if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_platform_id> OpenCLInfo::get_platforms()
-{
- vector<cl_platform_id> platform_ids;
- get_platforms(&platform_ids);
- return platform_ids;
-}
-
-bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
-{
- cl_int err;
- if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_platforms = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platforms()
-{
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms)) {
- return 0;
- }
- return num_platforms;
-}
-
-bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name)
-{
- char buffer[256];
- if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) !=
- CL_SUCCESS) {
- *platform_name = "";
- return false;
- }
- *platform_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
-{
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- return "";
- }
- return platform_name;
-}
-
-bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_devices = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices)) {
- return 0;
- }
- return num_devices;
-}
-
-bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error)
-{
- /* Reset from possible previous state. */
- device_ids->resize(0);
- /* Get number of devices to pre-allocate memory. */
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) {
- return false;
- }
- /* Get actual device list. */
- device_ids->resize(num_devices);
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- vector<cl_device_id> devices;
- get_platform_devices(platform_id, device_type, &devices);
- return devices;
-}
-
-bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_name = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_device_name(cl_device_id device_id)
-{
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return "";
- }
- return device_name;
-}
-
-bool OpenCLInfo::get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error)
-{
- size_t extension_length = 0;
- cl_int err;
- /* Determine the size of the extension string. */
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- vector<char> buffer(extension_length);
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_extensions = string(buffer.data());
- return true;
-}
-
-string OpenCLInfo::get_device_extensions(cl_device_id device_id)
-{
- string device_extensions;
- if (!get_device_extensions(device_id, &device_extensions)) {
- return "";
- }
- return device_extensions;
-}
-
-bool OpenCLInfo::get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_type = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return 0;
- }
- return device_type;
-}
-
-string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
-{
- string name = "";
- char board_name[1024];
- size_t length = 0;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) ==
- CL_SUCCESS) {
- if (length != 0 && board_name[0] != '\0') {
- name = board_name;
- }
- }
-
- /* Fallback to standard device name API. */
- if (name.empty()) {
- name = get_device_name(device_id);
- }
-
-  /* Special exception for AMD Vega: we need to be able to tell
-   * Vega 56 and Vega 64 apart. */
- if (name == "Radeon RX Vega") {
- cl_int max_compute_units = 0;
- if (clGetDeviceInfo(device_id,
- CL_DEVICE_MAX_COMPUTE_UNITS,
- sizeof(max_compute_units),
- &max_compute_units,
- NULL) == CL_SUCCESS) {
- name += " " + to_string(max_compute_units);
- }
- }
-
- /* Distinguish from our native CPU device. */
- if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
- name += " (OpenCL)";
- }
-
- return name;
-}
-
-bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- if (sscanf(buffer, "%d.%d", major, minor) < 2) {
- VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
- return false;
- }
- return true;
-}
-
-int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id)
-{
- int base_align_bits;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) ==
- CL_SUCCESS) {
- return base_align_bits / 8;
- }
- return 1;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
new file mode 100644
index 00000000000..13f23bd229a
--- /dev/null
+++ b/intern/cycles/device/optix/device.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/optix/device.h"
+
+#include "device/cuda/device.h"
+#include "device/optix/device_impl.h"
+#include "util/util_logging.h"
+
+#ifdef WITH_OPTIX
+# include <optix_function_table_definition.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+bool device_optix_init()
+{
+#ifdef WITH_OPTIX
+ if (g_optixFunctionTable.optixDeviceContextCreate != NULL) {
+ /* Already initialized function table. */
+ return true;
+ }
+
+ /* Need to initialize CUDA as well. */
+ if (!device_cuda_init()) {
+ return false;
+ }
+
+ const OptixResult result = optixInit();
+
+ if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
+ VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
+ "Please update to the latest driver first!";
+ return false;
+ }
+ else if (result != OPTIX_SUCCESS) {
+ VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ return false;
+ }
+
+ /* Loaded OptiX successfully! */
+ return true;
+#else
+ return false;
+#endif
+}
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
+{
+#ifdef WITH_OPTIX
+ devices.reserve(cuda_devices.size());
+
+ /* Simply add all supported CUDA devices as OptiX devices again. */
+ for (DeviceInfo info : cuda_devices) {
+ assert(info.type == DEVICE_CUDA);
+
+ int major;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
+ if (major < 5) {
+ /* Only Maxwell and up are supported by OptiX. */
+ continue;
+ }
+
+ info.type = DEVICE_OPTIX;
+ info.id += "_OptiX";
+ info.denoisers |= DENOISER_OPTIX;
+
+ devices.push_back(info);
+ }
+#else
+ (void)cuda_devices;
+ (void)devices;
+#endif
+}
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_OPTIX
+ return new OptiXDevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
+}
+
+CCL_NAMESPACE_END
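A hedged sketch of how these three entry points fit together from a caller's point of view (the device registry that calls them is outside this file, so the flow below is an assumption): initialize OptiX once, derive OptiX device entries from the already enumerated CUDA devices, then create a device from one of the resulting DeviceInfo entries.

/* Hypothetical caller, compiled inside Cycles where Device, DeviceInfo,
 * Stats and Profiler are available. */
void create_first_optix_device(const vector<DeviceInfo> &cuda_devices,
                               Stats &stats,
                               Profiler &profiler)
{
  if (!device_optix_init()) {
    return; /* Driver too old, or built without WITH_OPTIX. */
  }

  vector<DeviceInfo> optix_devices;
  device_optix_info(cuda_devices, optix_devices);

  if (!optix_devices.empty()) {
    Device *device = device_optix_create(optix_devices[0], stats, profiler);
    delete device;
  }
}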
diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h
new file mode 100644
index 00000000000..29fa729c2e4
--- /dev/null
+++ b/intern/cycles/device/optix/device.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_optix_init();
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
new file mode 100644
index 00000000000..cd16b8c9f01
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -0,0 +1,1573 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/device_impl.h"
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_optix.h"
+# include "integrator/pass_accessor_gpu.h"
+# include "render/buffers.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/pass.h"
+# include "render/scene.h"
+
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_progress.h"
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
+ : device(device), queue(device), state(device, "__denoiser_state")
+{
+}
+
+OptiXDevice::Denoiser::~Denoiser()
+{
+ const CUDAContextScope scope(device);
+ if (optix_denoiser != nullptr) {
+ optixDenoiserDestroy(optix_denoiser);
+ }
+}
+
+OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : CUDADevice(info, stats, profiler),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ launch_params(this, "__params"),
+ denoiser_(this)
+{
+ /* Make the CUDA context current. */
+ if (!cuContext) {
+ /* Do not initialize if CUDA context creation failed already. */
+ return;
+ }
+ const CUDAContextScope scope(this);
+
+ /* Create OptiX context for this device. */
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
+ options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ if (DebugFlags().optix.use_debug) {
+ options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+ }
+ optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ optix_assert(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+ /* Fix weird compiler bug that assigns wrong size. */
+ launch_params.data_elements = sizeof(KernelParamsOptiX);
+
+ /* Allocate launch parameter buffer memory on device. */
+ launch_params.alloc_to_device(1);
+}
+
+OptiXDevice::~OptiXDevice()
+{
+ /* Make CUDA context current. */
+ const CUDAContextScope scope(this);
+
+ free_bvh_memory_delayed();
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+
+ /* Unload modules. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ }
+ }
+
+ optixDeviceContextDestroy(context);
+}
+
+unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
+{
+ return make_unique<OptiXDeviceQueue>(this);
+}
+
+BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
+{
+ /* OptiX has its own internal acceleration structure format. */
+ return BVH_LAYOUT_OPTIX;
+}
+
+string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
+
+ /* Add OptiX SDK include directory to include paths. */
+ const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+ if (optix_sdk_path) {
+ common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+ }
+
+ /* Specialization for shader raytracing. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ common_cflags += " --keep-device-functions";
+ }
+
+ return common_cflags;
+}
+
+bool OptiXDevice::load_kernels(const uint kernel_features)
+{
+ if (have_error()) {
+ /* Abort early if context creation failed already. */
+ return false;
+ }
+
+ /* Load CUDA modules because we need some of the utility kernels. */
+ if (!CUDADevice::load_kernels(kernel_features)) {
+ return false;
+ }
+
+ /* Skip creating OptiX module if only doing denoising. */
+ if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+ return true;
+ }
+
+ const CUDAContextScope scope(this);
+
+ /* Unload existing OptiX module and pipelines first. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ optix_module = NULL;
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ builtin_modules[i] = NULL;
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ pipelines[i] = NULL;
+ }
+ }
+
+ OptixModuleCompileOptions module_options = {};
+ module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
+
+ if (DebugFlags().optix.use_debug) {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ module_options.boundValues = nullptr;
+ module_options.numBoundValues = 0;
+
+ OptixPipelineCompileOptions pipeline_options = {};
+ /* Default to no motion blur and two-level graph, since it is the fastest option. */
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
+ pipeline_options.numAttributeValues = 2; /* u, v */
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
+
+ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+ }
+ else
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+ }
+
+  /* Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
+   * This is necessary since objects may be reported to have motion if the Vector pass is
+   * active, but may still need to be rendered without motion blur if that isn't active as well. */
+ motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
+
+ if (motion_blur) {
+ pipeline_options.usesMotionBlur = true;
+ /* Motion blur can insert motion transforms into the traversal graph.
+     * It is no longer a two-level graph then, so we need to set flags to allow any configuration. */
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { /* Load and compile PTX module with OptiX kernels. */
+ string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+ "lib/kernel_optix_shader_raytrace.ptx" :
+ "lib/kernel_optix.ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ if (!getenv("OPTIX_ROOT_DIR")) {
+      set_error(
+          "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+          "the OptiX SDK to be able to compile OptiX kernels on demand).");
+ return false;
+ }
+ ptx_filename = compile_kernel(
+ kernel_features,
+ (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel",
+ "optix",
+ true);
+ }
+ if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
+ return false;
+ }
+
+ const OptixResult result = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module);
+ if (result != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
+ ptx_filename.c_str(),
+ optixGetErrorName(result)));
+ return false;
+ }
+ }
+
+ /* Create program groups. */
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_closest";
+ group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_shadow";
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_subsurface";
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_volume_stack";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Built-in thick curve intersection. */
+ OptixBuiltinISOptions builtin_options = {};
+ builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ builtin_options.usesMotionBlur = false;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+ group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+ group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+ if (motion_blur) {
+ builtin_options.usesMotionBlur = true;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+ group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+ group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+ group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ }
+ }
+ else {
+ /* Custom ribbon intersection. */
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
+ }
+
+ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
+ /* Add hit group for local intersections. */
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+ /* Shader raytracing replaces some functions with direct callables. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_surface_raytrace";
+ group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+ group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+ "__direct_callable__svm_node_bevel";
+ group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass";
+ }
+
+ optix_assert(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ /* Get program stack sizes. */
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ /* Set up SBT, which in this case is used only to select between different programs. */
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); /* Upload SBT to device. */
+
+ /* Calculate maximum trace continuation stack size. */
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
+ trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+
+ OptixPipelineLinkOptions link_options = {};
+ link_options.maxTraceDepth = 1;
+
+ if (DebugFlags().optix.use_debug) {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ /* Create shader raytracing pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+ pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADE_RAYTRACE]));
+
+ /* Combine ray generation and trace continuation stack size. */
+ const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+ link_options.maxTraceDepth * trace_css;
+ const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
+ stack_size[PG_CALL_SVM_BEVEL].dssDC);
+
+ /* Set stack size depending on pipeline options. */
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+ }
+
+ { /* Create intersection-only pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_INTERSECT]));
+
+ /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+ const unsigned int css =
+ std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+ stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+ link_options.maxTraceDepth * trace_css;
+
+ optix_assert(
+ optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+ }
+
+ /* Clean up program group objects. */
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+}
+
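To make the stack-size bookkeeping above easier to follow, here is a small self-contained sketch of the same arithmetic with invented numbers (the real values come from optixProgramGroupGetStackSize):

#include <algorithm>
#include <cstdio>

int main()
{
  /* Hypothetical per-program stack sizes in bytes. */
  const unsigned raygen_css = 512;       /* cssRG of a ray-generation program. */
  const unsigned closest_hit_css = 256;  /* cssCH of PG_HITD. */
  const unsigned any_hit_css = 64;       /* cssAH of a shadow any-hit program. */
  const unsigned intersection_css = 128; /* cssIS of a curve intersection program. */
  const unsigned max_trace_depth = 1;    /* link_options.maxTraceDepth above. */

  /* Trace continuation stack: maximum of closest-hit and intersection + any-hit usage. */
  const unsigned trace_css = std::max(closest_hit_css, intersection_css + any_hit_css);

  /* Continuation stack size passed to optixPipelineSetStackSize. */
  const unsigned css = raygen_css + max_trace_depth * trace_css;

  std::printf("trace_css = %u, css = %u\n", trace_css, css); /* Prints 256 and 768. */
  return 0;
}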
+/* --------------------------------------------------------------------
+ * Buffer denoising.
+ */
+
+class OptiXDevice::DenoiseContext {
+ public:
+ explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task)
+ : denoise_params(task.params),
+ render_buffers(task.render_buffers),
+ buffer_params(task.buffer_params),
+ guiding_buffer(device, "denoiser guiding passes buffer"),
+ num_samples(task.num_samples)
+ {
+ num_input_passes = 1;
+ if (denoise_params.use_pass_albedo) {
+ num_input_passes += 1;
+ use_pass_albedo = true;
+ pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+ if (denoise_params.use_pass_normal) {
+ num_input_passes += 1;
+ use_pass_normal = true;
+ pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+ }
+ }
+
+ const int num_guiding_passes = num_input_passes - 1;
+
+ if (num_guiding_passes) {
+ if (task.allow_inplace_modification) {
+ guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+ guiding_params.pass_albedo = pass_denoising_albedo;
+ guiding_params.pass_normal = pass_denoising_normal;
+
+ guiding_params.stride = buffer_params.stride;
+ guiding_params.pass_stride = buffer_params.pass_stride;
+ }
+ else {
+ guiding_params.pass_stride = 0;
+ if (use_pass_albedo) {
+ guiding_params.pass_albedo = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+ if (use_pass_normal) {
+ guiding_params.pass_normal = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+
+ guiding_params.stride = buffer_params.width;
+
+ guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height *
+ guiding_params.pass_stride);
+ guiding_params.device_pointer = guiding_buffer.device_pointer;
+ }
+ }
+
+ pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ }
+
+ const DenoiseParams &denoise_params;
+
+ RenderBuffers *render_buffers = nullptr;
+ const BufferParams &buffer_params;
+
+ /* Device-side storage of the guiding passes. */
+ device_only_memory<float> guiding_buffer;
+
+ struct {
+ device_ptr device_pointer = 0;
+
+    /* NOTE: These are only initialized when the corresponding guiding pass is enabled. */
+ int pass_albedo = PASS_UNUSED;
+ int pass_normal = PASS_UNUSED;
+
+ int stride = -1;
+ int pass_stride = -1;
+ } guiding_params;
+
+  /* Number of input passes, including the color and extra auxiliary passes. */
+ int num_input_passes = 0;
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+
+ int num_samples = 0;
+
+ int pass_sample_count = PASS_UNUSED;
+
+  /* NOTE: These are only initialized when the corresponding guiding pass is enabled. */
+ int pass_denoising_albedo = PASS_UNUSED;
+ int pass_denoising_normal = PASS_UNUSED;
+
+  /* For passes which don't need the albedo channel for denoising, we replace the actual albedo
+   * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+   * fake values and denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake = false;
+};
+
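When the guiding passes cannot be taken from the render buffer in place, the else-branch above packs them into an interleaved buffer: with both albedo and normal enabled, each pixel holds 6 floats, albedo at offset 0 and normal at offset 3. A small illustrative sketch of that indexing (not code from the patch):

#include <cstddef>
#include <vector>

struct GuidingLayout {
  int pass_albedo = 0; /* Float offset of albedo within one pixel. */
  int pass_normal = 3; /* Float offset of normal within one pixel. */
  int pass_stride = 6; /* Floats per pixel: 3 for albedo + 3 for normal. */
  int stride = 0;      /* Pixels per row (buffer_params.width). */
};

/* Index of the first albedo float of pixel (x, y). */
static size_t albedo_offset(const GuidingLayout &l, int x, int y)
{
  return (size_t(y) * l.stride + x) * l.pass_stride + l.pass_albedo;
}

int main()
{
  GuidingLayout layout;
  layout.stride = 1920;
  std::vector<float> guiding(size_t(1920) * 1080 * layout.pass_stride);
  guiding[albedo_offset(layout, 10, 5)] = 0.5f; /* Red channel of the albedo of pixel (10, 5). */
  return 0;
}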
+class OptiXDevice::DenoisePass {
+ public:
+ DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type)
+ {
+ noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY);
+ denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ PassType type;
+
+ int noisy_offset;
+ int denoised_offset;
+
+ int num_components;
+ bool use_compositing;
+ bool use_denoising_albedo;
+};
+
+bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task)
+{
+ const CUDAContextScope scope(this);
+
+ DenoiseContext context(this, task);
+
+ if (!denoise_ensure(context)) {
+ return false;
+ }
+
+ if (!denoise_filter_guiding_preprocess(context)) {
+ LOG(ERROR) << "Error preprocessing guiding passes.";
+ return false;
+ }
+
+ /* Passes which will use real albedo when it is available. */
+ denoise_pass(context, PASS_COMBINED);
+ denoise_pass(context, PASS_SHADOW_CATCHER_MATTE);
+
+  /* Passes which do not need albedo; if the real albedo is present, it needs to become the fake one. */
+ denoise_pass(context, PASS_SHADOW_CATCHER);
+
+ return true;
+}
+
+DeviceQueue *OptiXDevice::get_denoise_queue()
+{
+ return &denoiser_.queue;
+}
+
+bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&context.guiding_params.pass_normal),
+ &context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&context.pass_denoising_albedo),
+ const_cast<int *>(&context.pass_denoising_normal),
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&context.num_samples)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const DenoisePass pass(pass_type, buffer_params);
+
+ if (pass.noisy_offset == PASS_UNUSED) {
+ return;
+ }
+ if (pass.denoised_offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ if (pass.use_denoising_albedo) {
+ if (context.albedo_replaced_with_fake) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+ else if (!context.albedo_replaced_with_fake) {
+ context.albedo_replaced_with_fake = true;
+ if (!denoise_filter_guiding_set_fake_albedo(context)) {
+ LOG(ERROR) << "Error replacing real albedo with the fake one.";
+ return;
+ }
+ }
+
+ /* Read and preprocess noisy color input pass. */
+ denoise_color_read(context, pass);
+ if (!denoise_filter_color_preprocess(context, pass)) {
+    LOG(ERROR) << "Error converting denoising passes to RGB buffer.";
+ return;
+ }
+
+ if (!denoise_run(context, pass)) {
+ LOG(ERROR) << "Error running OptiX denoiser.";
+ return;
+ }
+
+ /* Store result in the combined pass of the render buffer.
+ *
+ * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. */
+ if (!denoise_filter_color_postprocess(context, pass)) {
+ LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+ return;
+ }
+
+ denoiser_.queue.synchronize();
+}
+
+void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass)
+{
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = pass.type;
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = pass.noisy_offset;
+
+  /* The denoiser operates on passes which are used to calculate the approximation, and is never
+   * run on the approximation itself. The latter is not even possible because OptiX does not
+   * support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases.
+ */
+ const PassAccessorGPU pass_accessor(
+ &denoiser_.queue, pass_access_info, 1.0f, context.num_samples);
+
+ PassAccessor::Destination destination(pass_access_info.type);
+ destination.d_pixels = context.render_buffers->buffer.device_pointer +
+ pass.denoised_offset * sizeof(float);
+ destination.num_components = 3;
+ destination.pixel_stride = context.buffer_params.pass_stride;
+
+ pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination);
+}
+
+bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&pass.denoised_offset)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
+ const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.num_samples),
+ const_cast<int *>(&pass.noisy_offset),
+ const_cast<int *>(&pass.denoised_offset),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&pass.num_components),
+ const_cast<bool *>(&pass.use_compositing)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_ensure(DenoiseContext &context)
+{
+ if (!denoise_create_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser creation has failed.";
+ return false;
+ }
+
+ if (!denoise_configure_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser configuration has failed.";
+ return false;
+ }
+
+ return true;
+}
+
+bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
+{
+ const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
+ (denoiser_.use_pass_albedo != context.use_pass_albedo) ||
+ (denoiser_.use_pass_normal != context.use_pass_normal);
+ if (!recreate_denoiser) {
+ return true;
+ }
+
+ /* Destroy existing handle before creating new one. */
+ if (denoiser_.optix_denoiser) {
+ optixDenoiserDestroy(denoiser_.optix_denoiser);
+ }
+
+ /* Create OptiX denoiser handle on demand when it is first used. */
+ OptixDenoiserOptions denoiser_options = {};
+ denoiser_options.guideAlbedo = context.use_pass_albedo;
+ denoiser_options.guideNormal = context.use_pass_normal;
+ const OptixResult result = optixDenoiserCreate(
+ this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
+
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to create OptiX denoiser");
+ return false;
+ }
+
+ /* OptiX denoiser handle was created with the requested number of input passes. */
+ denoiser_.use_pass_albedo = context.use_pass_albedo;
+ denoiser_.use_pass_normal = context.use_pass_normal;
+
+ /* OptiX denoiser has been created, but it needs configuration. */
+ denoiser_.is_configured = false;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
+{
+ if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width &&
+ denoiser_.configured_size.y == context.buffer_params.height)) {
+ return true;
+ }
+
+ const BufferParams &buffer_params = context.buffer_params;
+
+ OptixDenoiserSizes sizes = {};
+ optix_assert(optixDenoiserComputeMemoryResources(
+ denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
+
+ denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
+ denoiser_.scratch_offset = sizes.stateSizeInBytes;
+
+ /* Allocate denoiser state if tile size has changed since last setup. */
+ denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
+
+ /* Initialize denoiser state for the current tile size. */
+ const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ buffer_params.width,
+ buffer_params.height,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ denoiser_.state.device_pointer +
+ denoiser_.scratch_offset,
+ denoiser_.scratch_size);
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to set up OptiX denoiser");
+ return false;
+ }
+
+ denoiser_.is_configured = true;
+ denoiser_.configured_size.x = buffer_params.width;
+ denoiser_.configured_size.y = buffer_params.height;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+ const int width = buffer_params.width;
+ const int height = buffer_params.height;
+
+ /* Set up input and output layer information. */
+ OptixImage2D color_layer = {0};
+ OptixImage2D albedo_layer = {0};
+ OptixImage2D normal_layer = {0};
+
+ OptixImage2D output_layer = {0};
+
+ /* Color pass. */
+ {
+ const int pass_denoised = pass.denoised_offset;
+ const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+ color_layer.data = context.render_buffers->buffer.device_pointer +
+ pass_denoised * sizeof(float);
+ color_layer.width = width;
+ color_layer.height = height;
+ color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+ color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+ color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
+
+  /* Optional albedo and normal passes. */
+ if (context.num_input_passes > 1) {
+ const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+ const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+ const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+ if (context.use_pass_albedo) {
+ albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+ albedo_layer.width = width;
+ albedo_layer.height = height;
+ albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+ albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ if (context.use_pass_normal) {
+ normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+ normal_layer.width = width;
+ normal_layer.height = height;
+ normal_layer.rowStrideInBytes = row_stride_in_bytes;
+ normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+ }
+
+ /* Denoise in-place of the noisy input in the render buffers. */
+ output_layer = color_layer;
+
+  /* Finally run denoising. */
+ OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+ OptixDenoiserLayer image_layers = {};
+ image_layers.input = color_layer;
+ image_layers.output = output_layer;
+
+ OptixDenoiserGuideLayer guide_layers = {};
+ guide_layers.albedo = albedo_layer;
+ guide_layers.normal = normal_layer;
+
+ optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ &params,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ &guide_layers,
+ &image_layers,
+ 1,
+ 0,
+ 0,
+ denoiser_.state.device_pointer + denoiser_.scratch_offset,
+ denoiser_.scratch_size));
+
+ return true;
+}
+
+bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
+{
+ const CUDAContextScope scope(this);
+
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ /* Compute memory usage. */
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options = {};
+ options.operation = operation;
+ if (use_fast_trace_bvh) {
+ VLOG(2) << "Using fast to trace OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+ VLOG(2) << "Using fast to update OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
+ }
+
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
+
+ optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ /* Allocate required output buffers. */
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+ if (!temp_mem.device_pointer) {
+ /* Make sure temporary memory allocation succeeded. */
+ return false;
+ }
+
+ device_only_memory<char> &out_data = bvh->as_data;
+ if (operation == OPTIX_BUILD_OPERATION_BUILD) {
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer) {
+ return false;
+ }
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
+ }
+
+ /* Finally build the acceleration structure. */
+ OptixAccelEmitDesc compacted_size_prop = {};
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ /* A tiny space was allocated for this property at the end of the temporary buffer above.
+ * Make sure this pointer is 8-byte aligned. */
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
+ OptixTraversableHandle out_handle = 0;
+ optix_assert(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data.device_pointer,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ use_fast_trace_bvh ? &compacted_size_prop : NULL,
+ use_fast_trace_bvh ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for all operations to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
+ */
+ if (use_fast_trace_bvh) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ /* Temporary memory is no longer needed, so free it now to make space. */
+ temp_mem.free();
+
+ /* There is no point compacting if the size does not change. */
+ if (compacted_size < sizes.outputSizeInBytes) {
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+ if (!compacted_data.device_pointer)
+ /* Do not compact if memory allocation for compacted acceleration structure fails.
+ * Can just use the uncompacted one then, so succeed here regardless. */
+ return !have_error();
+
+ optix_assert(optixAccelCompact(
+ context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for compaction to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
+ }
+ }
+
+ return !have_error();
+}
+
+void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ free_bvh_memory_delayed();
+
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ progress.set_substatus("Building OptiX acceleration structure");
+
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
+
+ /* Refit is only possible in viewport for now (because AS is built with
+ * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !use_fast_trace_bvh) {
+ assert(bvh_optix->traversable_handle != 0);
+ operation = OPTIX_BUILD_OPERATION_UPDATE;
+ }
+ else {
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ }
+
+ /* Build bottom level acceleration structures (BLAS). */
+ Geometry *const geom = bvh->geometry[0];
+ if (geom->geometry_type == Geometry::HAIR) {
+ /* Build BLAS for curve primitives. */
+ Hair *const hair = static_cast<Hair *const>(geom);
+ if (hair->num_curves() == 0) {
+ return;
+ }
+
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = hair->get_motion_steps();
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ /* Four control points for each curve segment. */
+ const size_t num_vertices = num_segments * 4;
+ if (hair->curve_shape == CURVE_THICK) {
+ index_data.alloc(num_segments);
+ vertex_data.alloc(num_vertices * num_motion_steps);
+ }
+ else
+ aabb_data.alloc(num_segments * num_motion_steps);
+
+ /* Get AABBs for each motion step. */
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ /* The center step for motion vertices is not stored in the attribute. */
+ const float3 *keys = hair->get_curve_keys().data();
+ size_t center_step = (num_motion_steps - 1) / 2;
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+ keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+ }
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+ const array<float> &curve_radius = hair->get_curve_radius();
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+ if (hair->curve_shape == CURVE_THICK) {
+ int k0 = curve.first_key + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, curve.first_key);
+ int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+ const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+ const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+ const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+ const float4 pw = make_float4(
+ curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
+
+              /* Convert Catmull-Rom data to B-spline. */
+ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+ static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+ static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+ static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
+
+ index_data[i] = i * 4;
+ float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+ v[0] = make_float4(
+ dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+ v[1] = make_float4(
+ dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+ v[2] = make_float4(
+ dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+ v[3] = make_float4(
+ dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+ }
+ else {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].minX = bounds.min.x;
+ aabb_data[index].minY = bounds.min.y;
+ aabb_data[index].minZ = bounds.min.z;
+ aabb_data[index].maxX = bounds.max.x;
+ aabb_data[index].maxY = bounds.max.y;
+ aabb_data[index].maxZ = bounds.max.z;
+ }
+ }
+ }
+ }
+
+ /* Upload AABB data to GPU. */
+ aabb_data.copy_to_device();
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ vector<device_ptr> width_ptrs;
+ vector<device_ptr> vertex_ptrs;
+ width_ptrs.reserve(num_motion_steps);
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+ const device_ptr base_ptr = vertex_data.device_pointer +
+ step * num_vertices * sizeof(float4);
+ width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */
+ vertex_ptrs.push_back(base_ptr);
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ if (hair->curve_shape == CURVE_THICK) {
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+ build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ build_input.curveArray.numPrimitives = num_segments;
+ build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.curveArray.numVertices = num_vertices;
+ build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+ build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+ build_input.curveArray.widthStrideInBytes = sizeof(float4);
+ build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+ build_input.curveArray.indexStrideInBytes = sizeof(int);
+ build_input.curveArray.flag = build_flags;
+ build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+ else {
+ /* Disable visibility test any-hit program, since it is already checked during
+ * intersection. Those trace calls that require anyhit can force it with a ray flag. */
+ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+ build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.customPrimitiveArray.numPrimitives = num_segments;
+ build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+ build_input.customPrimitiveArray.flags = &build_flags;
+ build_input.customPrimitiveArray.numSbtRecords = 1;
+ build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+ /* Build BLAS for triangle primitives. */
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() == 0) {
+ return;
+ }
+
+ const size_t num_verts = mesh->get_verts().size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = mesh->get_motion_steps();
+ }
+
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ index_data.alloc(mesh->get_triangles().size());
+ memcpy(index_data.data(),
+ mesh->get_triangles().data(),
+ mesh->get_triangles().size() * sizeof(int));
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->get_verts().data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ /* The center step for motion vertices is not stored in the attribute. */
+ if (step != center_step) {
+ verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+ }
+
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ /* Upload triangle data to GPU. */
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+ /* The SBT does not store per primitive data since Cycles already allocates separate
+ * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
+ * one and rely on that having the same meaning in this case. */
+ build_input.triangleArray.numSbtRecords = 1;
+ build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ }
+ else {
+ unsigned int num_instances = 0;
+ unsigned int max_num_instances = 0xFFFFFFFF;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
+
+ optixDeviceContextGetProperty(context,
+ OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
+ &max_num_instances,
+ sizeof(max_num_instances));
+ /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */
+ max_num_instances >>= 1;
+ if (bvh->objects.size() > max_num_instances) {
+ progress.set_error(
+ "Failed to build OptiX acceleration structure because there are too many instances");
+ return;
+ }
+
+ /* Fill instance descriptions. */
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ /* Calculate total motion transform size and allocate memory for them. */
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+ }
+
+ for (Object *ob : bvh->objects) {
+ /* Skip non-traceable objects. */
+ if (!ob->is_traceable()) {
+ continue;
+ }
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ /* Clear transform to identity matrix. */
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+ /* Set user instance ID to object index (but leave low bit blank). */
+ instance.instanceId = ob->get_device_index() << 1;
+
+ /* Have to have at least one bit in the mask, or else instance would always be culled. */
+ instance.visibilityMask = 1;
+
+ if (ob->get_geometry()->has_volume) {
+ /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
+ */
+ instance.visibilityMask |= 2;
+ }
+
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ /* Same applies to curves (so they can be skipped in local trace calls). */
+ instance.visibilityMask |= 4;
+
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ /* Select between motion blur and non-motion blur built-in intersection module. */
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
+ }
+
+ /* Insert motion traversable if object has motion. */
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(this);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ /* Allocate host side memory for motion transform and fill it with transform data. */
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ /* Scale. */
+ srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
+ srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
+ srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
+
+ /* Shear. */
+ srt_data[i].a = decomp[i].z.x; /* scale.x.y */
+ srt_data[i].b = decomp[i].z.y; /* scale.x.z */
+ srt_data[i].c = decomp[i].w.x; /* scale.y.z */
+ assert(decomp[i].z.z == 0.0f); /* scale.y.x */
+ assert(decomp[i].w.y == 0.0f); /* scale.z.x */
+ assert(decomp[i].w.z == 0.0f); /* scale.z.y */
+
+ /* Pivot point. */
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ /* Rotation. */
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ /* Translation. */
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ /* Upload motion transform to GPU. */
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+
+ /* Disable instance transform if object uses motion transform already. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ /* Get traversable handle to motion transform. */
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ /* Set transform matrix. */
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ /* Disable instance transform if geometry already has it applied to vertex data. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ /* Non-instanced objects read ID from 'prim_object', so distinguish
+ * them from instanced objects with the low bit set. */
+ instance.instanceId |= 1;
+ }
+ }
+ }
+
+ /* Upload instance descriptions. */
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+ /* Build top-level acceleration structure (TLAS) */
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
+}
+
+void OptiXDevice::release_optix_bvh(BVH *bvh)
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
+ * while GPU is still rendering. */
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
+ bvh_optix->traversable_handle = 0;
+}
+
+void OptiXDevice::free_bvh_memory_delayed()
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ delayed_free_bvh_memory.free_memory();
+}
+
+void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ /* Set constant memory for CUDA module. */
+ CUDADevice::const_copy_to(name, host, size);
+
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ /* Update traversable handle (since it is different for each device on multi devices). */
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+
+ update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
+ return;
+ }
+
+ /* Update data storage pointers in launch parameters. */
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
+ return; \
+ }
+ KERNEL_TEX(IntegratorStateGPU, __integrator_state)
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+}
+
+void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
+{
+ const CUDAContextScope scope(this);
+
+ cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
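
The cr2bsp0..cr2bsp3 weights in OptiXDevice::build_bvh() above convert the four Catmull-Rom control points of a hair segment into the four B-spline control points consumed by OptiX's built-in cubic B-spline curve primitive. A standalone illustrative sanity check of those weights (the 1D control values are arbitrary):

#include <cassert>
#include <cmath>

int main()
{
  const double P[4] = {0.0, 1.0, 4.0, 9.0}; /* Arbitrary 1D Catmull-Rom control values. */

  /* Same weights as cr2bsp0..cr2bsp3 (rows of the conversion matrix, divided by 6). */
  const double cr2bsp[4][4] = {
      {+7, -4, +5, -2}, {-2, 11, -4, +1}, {+1, -4, 11, -2}, {-2, +5, -4, +7}};

  double B[4] = {0.0, 0.0, 0.0, 0.0};
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      B[i] += cr2bsp[i][j] * P[j] / 6.0;
    }
  }

  /* A uniform cubic B-spline evaluates to (B0 + 4*B1 + B2) / 6 at t = 0 and to
   * (B1 + 4*B2 + B3) / 6 at t = 1; both must match the Catmull-Rom endpoints P1 and P2. */
  assert(std::fabs((B[0] + 4.0 * B[1] + B[2]) / 6.0 - P[1]) < 1e-9);
  assert(std::fabs((B[1] + 4.0 * B[2] + B[3]) / 6.0 - P[2]) < 1e-9);

  return 0;
}
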
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
new file mode 100644
index 00000000000..742ae0f1bab
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/device_impl.h"
+# include "device/optix/queue.h"
+# include "device/optix/util.h"
+# include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHOptiX;
+struct KernelParamsOptiX;
+
+/* List of OptiX program groups. */
+enum {
+ PG_RGEN_INTERSECT_CLOSEST,
+ PG_RGEN_INTERSECT_SHADOW,
+ PG_RGEN_INTERSECT_SUBSURFACE,
+ PG_RGEN_INTERSECT_VOLUME_STACK,
+ PG_RGEN_SHADE_SURFACE_RAYTRACE,
+ PG_MISS,
+ PG_HITD, /* Default hit group. */
+ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
+ PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
+ PG_HITD_MOTION,
+ PG_HITS_MOTION,
+ PG_CALL_SVM_AO,
+ PG_CALL_SVM_BEVEL,
+ PG_CALL_AO_PASS,
+ NUM_PROGRAM_GROUPS
+};
+
+static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
+static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
+static const int NUM_HIT_PROGRAM_GROUPS = 5;
+static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
+static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
+
+/* List of OptiX pipelines. */
+enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES };
+
+/* A single shader binding table entry. */
+struct SbtRecord {
+ char header[OPTIX_SBT_RECORD_HEADER_SIZE];
+};
+
+class OptiXDevice : public CUDADevice {
+ public:
+ OptixDeviceContext context = NULL;
+
+ OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
+ OptixModule builtin_modules[2] = {};
+ OptixPipeline pipelines[NUM_PIPELINES] = {};
+
+ bool motion_blur = false;
+ device_vector<SbtRecord> sbt_data;
+ device_only_memory<KernelParamsOptiX> launch_params;
+ OptixTraversableHandle tlas_handle = 0;
+
+ vector<device_only_memory<char>> delayed_free_bvh_memory;
+ thread_mutex delayed_free_bvh_mutex;
+
+ class Denoiser {
+ public:
+ explicit Denoiser(OptiXDevice *device);
+ ~Denoiser();
+
+ OptiXDevice *device;
+ OptiXDeviceQueue queue;
+
+ OptixDenoiser optix_denoiser = nullptr;
+
+ /* Configuration size, as provided to `optixDenoiserSetup`.
+     * If `optixDenoiserSetup()` was never called on the current `optix_denoiser`,
+     * `is_configured` will be false. */
+ bool is_configured = false;
+ int2 configured_size = make_int2(0, 0);
+
+ /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
+     * The memory layout is as follows: [denoiser state][scratch buffer]. */
+ device_only_memory<unsigned char> state;
+ size_t scratch_offset = 0;
+ size_t scratch_size = 0;
+
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+ };
+ Denoiser denoiser_;
+
+ public:
+ OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+ ~OptiXDevice();
+
+ private:
+ BVHLayoutMask get_bvh_layout_mask() const override;
+
+ string compile_kernel_get_common_cflags(const uint kernel_features) override;
+
+ bool load_kernels(const uint kernel_features) override;
+
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ void release_optix_bvh(BVH *bvh) override;
+ void free_bvh_memory_delayed();
+
+ void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void update_launch_params(size_t offset, void *data, size_t data_size);
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ /* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ class DenoiseContext;
+ class DenoisePass;
+
+ virtual bool denoise_buffer(const DeviceDenoiseTask &task) override;
+ virtual DeviceQueue *get_denoise_queue() override;
+
+  /* Read guiding passes from the render buffers, preprocess them in a way which is expected by
+   * OptiX, and store them in the guiding passes memory within the given context.
+   *
+   * Preprocessing of the guiding passes is to only happen once per context lifetime. Do not
+   * preprocess them for every pass which is being denoised. */
+ bool denoise_filter_guiding_preprocess(DenoiseContext &context);
+
+ /* Set fake albedo pixels in the albedo guiding pass storage.
+ * After this point only passes which do not need albedo for denoising can be processed. */
+ bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context);
+
+ void denoise_pass(DenoiseContext &context, PassType pass_type);
+
+ /* Read input color pass from the render buffer into the memory which corresponds to the noisy
+ * input within the given context. Pixels are scaled to the number of samples, but are not
+ * preprocessed yet. */
+ void denoise_color_read(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
+ * denoiser result to the render buffer. */
+ bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass);
+ bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Make sure the OptiX denoiser is created and configured. */
+ bool denoise_ensure(DenoiseContext &context);
+
+ /* Create OptiX denoiser descriptor if needed.
+ * Will do nothing if the current OptiX descriptor is usable for the given parameters.
+   * If the OptiX denoiser descriptor was re-created here, it is left unconfigured. */
+ bool denoise_create_if_needed(DenoiseContext &context);
+
+  /* Configure the existing OptiX denoiser descriptor for use with the given task. */
+ bool denoise_configure_if_needed(DenoiseContext &context);
+
+ /* Run configured denoiser. */
+ bool denoise_run(DenoiseContext &context, const DenoisePass &pass);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
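
The Denoiser struct above keeps the OptiX denoiser state and scratch buffers in a single device allocation: denoise_configure_if_needed() sizes it from OptixDenoiserSizes, and denoise_run() passes the two sub-ranges to optixDenoiserSetup()/optixDenoiserInvoke(). A small standalone sketch of that pointer arithmetic (all sizes and the base pointer are hypothetical):

#include <cstdint>
#include <cstdio>

int main()
{
  /* Hypothetical values standing in for OptixDenoiserSizes. */
  const uint64_t state_size = 16 << 20;   /* stateSizeInBytes -> denoiser_.scratch_offset */
  const uint64_t scratch_size = 48 << 20; /* withOverlapScratchSizeInBytes -> denoiser_.scratch_size */

  /* One allocation, laid out as [denoiser state][scratch buffer]. */
  const uint64_t total_size = state_size + scratch_size;

  const uint64_t base = 0x200000000ull;           /* hypothetical state.device_pointer */
  const uint64_t state_ptr = base;                /* passed as the denoiser state */
  const uint64_t scratch_ptr = base + state_size; /* passed as the scratch buffer */

  printf("alloc %llu bytes, state at 0x%llx, scratch at 0x%llx\n",
         (unsigned long long)total_size,
         (unsigned long long)state_ptr,
         (unsigned long long)scratch_ptr);
  return 0;
}
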
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
new file mode 100644
index 00000000000..458ed70baa8
--- /dev/null
+++ b/intern/cycles/device/optix/queue.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/queue.h"
+# include "device/optix/device_impl.h"
+
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* OptiXDeviceQueue */
+
+OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device)
+{
+}
+
+void OptiXDeviceQueue::init_execution()
+{
+ CUDADeviceQueue::init_execution();
+}
+
+static bool is_optix_specific_kernel(DeviceKernel kernel)
+{
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+}
+
+bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (!is_optix_specific_kernel(kernel)) {
+ return CUDADeviceQueue::enqueue(kernel, work_size, args);
+ }
+
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+
+ OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+ const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
+ const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
+
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
+ args[0], // &d_path_index
+ sizeof(device_ptr),
+ cuda_stream_));
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
+ args[1], // &d_render_buffer
+ sizeof(device_ptr),
+ cuda_stream_));
+ }
+
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+
+ OptixPipeline pipeline = nullptr;
+ OptixShaderBindingTable sbt_params = {};
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
+ break;
+
+ default:
+ LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
+ << " is attempted to be enqueued.";
+ return false;
+ }
+
+ sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+ sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
+ sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord);
+ sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
+ sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
+
+ /* Launch the ray generation program. */
+ optix_device_assert(optix_device,
+ optixLaunch(pipeline,
+ cuda_stream_,
+ launch_params_ptr,
+ optix_device->launch_params.data_elements,
+ &sbt_params,
+ work_size,
+ 1,
+ 1));
+
+ return !(optix_device->have_error());
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
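
OptiXDeviceQueue::enqueue() above selects a pipeline and then points the shader binding table at per-program-group records stored back to back in OptiXDevice::sbt_data. A standalone sketch of that offset arithmetic, reusing the program group ordering from device_impl.h (the record size and base address are assumptions made for illustration):

#include <cstdio>

int main()
{
  /* Mirrors the program group enum from device_impl.h. */
  enum {
    PG_RGEN_INTERSECT_CLOSEST,
    PG_RGEN_INTERSECT_SHADOW,
    PG_RGEN_INTERSECT_SUBSURFACE,
    PG_RGEN_INTERSECT_VOLUME_STACK,
    PG_RGEN_SHADE_SURFACE_RAYTRACE,
    PG_MISS,
    PG_HITD,
    PG_HITS,
    PG_HITL,
    PG_HITD_MOTION,
    PG_HITS_MOTION,
    PG_CALL_SVM_AO,
    PG_CALL_SVM_BEVEL,
    PG_CALL_AO_PASS,
    NUM_PROGRAM_GROUPS
  };

  const unsigned long long record_size = 32;       /* stand-in for sizeof(SbtRecord) */
  const unsigned long long sbt_data_ptr = 0x10000; /* hypothetical device pointer */

  /* Same arithmetic as in enqueue() for the "intersect closest" kernel. */
  printf("raygen record:    0x%llx\n", sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * record_size);
  printf("miss records:     0x%llx (1 record)\n", sbt_data_ptr + PG_MISS * record_size);
  printf("hitgroup records: 0x%llx (5 records)\n", sbt_data_ptr + PG_HITD * record_size);
  printf("callable records: 0x%llx (3 records)\n", sbt_data_ptr + PG_CALL_SVM_AO * record_size);
  return 0;
}
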
diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h
new file mode 100644
index 00000000000..0de422ccc71
--- /dev/null
+++ b/intern/cycles/device/optix/queue.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/queue.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OptiXDevice;
+
+/* Device queue implementation for OptiX, derived from the CUDA device queue. */
+class OptiXDeviceQueue : public CUDADeviceQueue {
+ public:
+ OptiXDeviceQueue(OptiXDevice *device);
+
+ virtual void init_execution() override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h
new file mode 100644
index 00000000000..34ae5bb5609
--- /dev/null
+++ b/intern/cycles/device/optix/util.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/util.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include <cuew.h>
+// Do not use CUDA SDK headers when using CUEW
+# define OPTIX_DONT_INCLUDE_CUDA
+# endif
+
+# include <optix_stubs.h>
+
+/* Utility for checking return values of OptiX function calls. */
+# define optix_device_assert(optix_device, stmt) \
+ { \
+ OptixResult result = stmt; \
+ if (result != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(result); \
+ optix_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define optix_assert(stmt) optix_device_assert(this, stmt)
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index 57f25283f85..8294e716ebe 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -814,7 +814,7 @@ bool Node::socket_is_modified(const SocketType &input) const
return (socket_modified & input.modified_flag_bit) != 0;
}
-bool Node::is_modified()
+bool Node::is_modified() const
{
return socket_modified != 0;
}
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index aa365baeccd..8f27a82d37b 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -16,6 +16,8 @@
#pragma once
+#include <type_traits>
+
#include "graph/node_type.h"
#include "util/util_array.h"
@@ -34,7 +36,10 @@ struct Transform;
#define NODE_SOCKET_API_BASE_METHODS(type_, name, string_name) \
const SocketType *get_##name##_socket() const \
{ \
- static const SocketType *socket = type->find_input(ustring(string_name)); \
+ /* Explicitly cast to base class to use `Node::type` even if the derived class defines \
+ * `type`. */ \
+ const Node *self_node = this; \
+ static const SocketType *socket = self_node->type->find_input(ustring(string_name)); \
return socket; \
} \
bool name##_is_modified() const \
@@ -111,6 +116,15 @@ struct Node {
void set(const SocketType &input, const Transform &value);
void set(const SocketType &input, Node *value);
+ /* Implicitly cast enums and enum classes to integer, which matches an internal way of how
+ * enumerator values are stored and accessed in a generic API. */
+ template<class ValueType, typename std::enable_if_t<std::is_enum_v<ValueType>> * = nullptr>
+ void set(const SocketType &input, const ValueType &value)
+ {
+ static_assert(sizeof(ValueType) <= sizeof(int), "Enumerator type should fit int");
+ set(input, static_cast<int>(value));
+ }
+
/* set array values. the memory from the input array will taken over
* by the node and the input array will be empty after return */
void set(const SocketType &input, array<bool> &value);
@@ -164,7 +178,7 @@ struct Node {
bool socket_is_modified(const SocketType &input) const;
- bool is_modified();
+ bool is_modified() const;
void tag_modified();
void clear_modified();
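
The templated Node::set() overload added above lets callers pass enums and enum classes directly to the socket API, forwarding them as int. A standalone sketch of the same pattern with made-up types (only the enable_if/static_cast mechanics mirror the change):

#include <iostream>
#include <type_traits>

struct FakeNode {
  int stored = 0;

  void set(int value)
  {
    stored = value;
  }

  /* Same shape as the new Node::set() overload: accept any enum and forward it as int. */
  template<class ValueType, typename std::enable_if_t<std::is_enum_v<ValueType>> * = nullptr>
  void set(const ValueType &value)
  {
    static_assert(sizeof(ValueType) <= sizeof(int), "Enumerator type should fit int");
    set(static_cast<int>(value));
  }
};

enum class HypotheticalInterpolation { LINEAR = 0, CUBIC = 1 };

int main()
{
  FakeNode node;
  node.set(HypotheticalInterpolation::CUBIC); /* Resolves to the enum overload, stores 1. */
  std::cout << node.stored << "\n";
  return 0;
}
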
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt
new file mode 100644
index 00000000000..bfabd35d7c3
--- /dev/null
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright 2011-2021 Blender Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(INC
+ ..
+)
+
+set(SRC
+ adaptive_sampling.cpp
+ denoiser.cpp
+ denoiser_device.cpp
+ denoiser_oidn.cpp
+ denoiser_optix.cpp
+ path_trace.cpp
+ tile.cpp
+ pass_accessor.cpp
+ pass_accessor_cpu.cpp
+ pass_accessor_gpu.cpp
+ path_trace_work.cpp
+ path_trace_work_cpu.cpp
+ path_trace_work_gpu.cpp
+ render_scheduler.cpp
+ shader_eval.cpp
+ work_balancer.cpp
+ work_tile_scheduler.cpp
+)
+
+set(SRC_HEADERS
+ adaptive_sampling.h
+ denoiser.h
+ denoiser_device.h
+ denoiser_oidn.h
+ denoiser_optix.h
+ path_trace.h
+ tile.h
+ pass_accessor.h
+ pass_accessor_cpu.h
+ pass_accessor_gpu.h
+ path_trace_work.h
+ path_trace_work_cpu.h
+ path_trace_work_gpu.h
+ render_scheduler.h
+ shader_eval.h
+ work_balancer.h
+ work_tile_scheduler.h
+)
+
+set(LIB
+  # NOTE: This is required for RenderBuffers access. Might consider moving files around a bit to
+  # avoid such a cyclic dependency.
+ cycles_render
+
+ cycles_util
+)
+
+if(WITH_OPENIMAGEDENOISE)
+ list(APPEND LIB
+ ${OPENIMAGEDENOISE_LIBRARIES}
+ )
+endif()
+
+include_directories(${INC})
+include_directories(SYSTEM ${INC_SYS})
+
+cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp
new file mode 100644
index 00000000000..23fbcfea5c2
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/adaptive_sampling.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+AdaptiveSampling::AdaptiveSampling()
+{
+}
+
+int AdaptiveSampling::align_samples(int start_sample, int num_samples) const
+{
+ if (!use) {
+ return num_samples;
+ }
+
+ /*
+   * The naive implementation goes as follows:
+ *
+ * int count = 1;
+ * while (!need_filter(start_sample + count - 1) && count < num_samples) {
+ * ++count;
+ * }
+ * return count;
+ */
+
+ /* 0-based sample index at which first filtering will happen. */
+ const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1);
+
+ /* Allow as many samples as possible until the first filter sample. */
+ if (start_sample + num_samples <= first_filter_sample) {
+ return num_samples;
+ }
+
+ const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1));
+
+ const int num_samples_until_filter = next_filter_sample - start_sample + 1;
+
+ return min(num_samples_until_filter, num_samples);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (!use) {
+ return false;
+ }
+
+ if (sample <= min_samples) {
+ return false;
+ }
+
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+}
+
+CCL_NAMESPACE_END
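
A worked example of the arithmetic above, assuming adaptive_step = 4 and min_samples = 0 (both values are purely illustrative): the first filtering happens at 0-based sample 3, and align_samples() clamps each requested batch so it never skips over a filtering sample. A standalone restatement of the formulas:

#include <algorithm>
#include <cassert>

int main()
{
  const int adaptive_step = 4; /* assumed power of two */
  const int min_samples = 0;

  /* 0-based sample index at which the first filtering happens: (0 + 1) | 3 == 3. */
  const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1);
  assert(first_filter_sample == 3);

  /* align_samples(0, 16): the next filter sample is 3, so the batch is clamped to 4 samples. */
  {
    const int start_sample = 0, num_samples = 16;
    const int next_filter_sample = std::max(first_filter_sample,
                                            start_sample | (adaptive_step - 1));
    assert(std::min(next_filter_sample - start_sample + 1, num_samples) == 4);
  }

  /* align_samples(4, 16): the next filter sample is 7, so again only 4 samples are allowed. */
  {
    const int start_sample = 4, num_samples = 16;
    const int next_filter_sample = std::max(first_filter_sample,
                                            start_sample | (adaptive_step - 1));
    assert(std::min(next_filter_sample - start_sample + 1, num_samples) == 4);
  }

  /* need_filter() is true exactly on samples 3, 7, 11, ... */
  assert((3 & (adaptive_step - 1)) == adaptive_step - 1);
  assert((4 & (adaptive_step - 1)) != adaptive_step - 1);

  return 0;
}
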
diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h
new file mode 100644
index 00000000000..d98edd9894c
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling {
+ public:
+ AdaptiveSampling();
+
+ /* Align number of samples so that they align with the adaptive filtering.
+ *
+   * Returns the new value for `num_samples` so that, after rendering that many samples on top
+   * of `start_sample`, filtering is required.
+   *
+   * The alignment happens in a way that allows rendering as many samples as possible without
+   * missing any filtering point. This means that the result is "clamped" by the nearest sample
+   * at which filtering is needed. This is part of a mechanism which ensures that all devices will
+   * perform the same exact filtering and adaptive sampling, regardless of their performance.
+   *
+   * `start_sample` is the 0-based index of the sample.
+   *
+   * NOTE: The start sample is included in the number of samples to render. This means that
+   * if the number of samples is 1, then the path tracer will render sample [start_sample];
+   * if the number of samples is 2, then the path tracer will render samples [start_sample,
+   * start_sample + 1], and so on. */
+ int align_samples(int start_sample, int num_samples) const;
+
+ /* Check whether adaptive sampling filter should happen at this sample.
+   * Returns false if adaptive sampling is not used.
+   *
+   * `sample` is the 0-based index of the sample. */
+ bool need_filter(int sample) const;
+
+ bool use = false;
+ int adaptive_step = 0;
+ int min_samples = 0;
+ float threshold = 0.0f;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
new file mode 100644
index 00000000000..598bbd497a5
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser.h"
+
+#include "device/device.h"
+#include "integrator/denoiser_oidn.h"
+#include "integrator/denoiser_optix.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams &params)
+{
+ DCHECK(params.use);
+
+ switch (params.type) {
+ case DENOISER_OPTIX:
+ return make_unique<OptiXDenoiser>(path_trace_device, params);
+
+ case DENOISER_OPENIMAGEDENOISE:
+ return make_unique<OIDNDenoiser>(path_trace_device, params);
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ /* pass */
+ break;
+ }
+
+ LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen.";
+
+ return nullptr;
+}
+
+Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)
+ : path_trace_device_(path_trace_device), params_(params)
+{
+ DCHECK(params.use);
+}
+
+void Denoiser::set_params(const DenoiseParams &params)
+{
+ DCHECK_EQ(params.type, params_.type);
+
+ if (params.type == params_.type) {
+ params_ = params;
+ }
+ else {
+ LOG(ERROR) << "Attempt to change denoiser type.";
+ }
+}
+
+const DenoiseParams &Denoiser::get_params() const
+{
+ return params_;
+}
+
+bool Denoiser::load_kernels(Progress *progress)
+{
+ const Device *denoiser_device = ensure_denoiser_device(progress);
+
+ if (!denoiser_device) {
+ path_trace_device_->set_error("No device available to denoise on");
+ return false;
+ }
+
+ VLOG(3) << "Will denoise on " << denoiser_device->info.description << " ("
+ << denoiser_device->info.id << ")";
+
+ return true;
+}
+
+Device *Denoiser::get_denoiser_device() const
+{
+ return denoiser_device_;
+}
+
+/* Check whether given device is single (not a MultiDevice) and supports requested denoiser. */
+static bool is_single_supported_device(Device *device, DenoiserType type)
+{
+ if (device->info.type == DEVICE_MULTI) {
+ /* Assume multi-device is never created with a single sub-device.
+     * If one requests such a configuration, it should be checked at the session level. */
+ return false;
+ }
+
+ if (!device->info.multi_devices.empty()) {
+ /* Some configurations will use multi_devices, but keep the type of an individual device.
+     * This does simplify checks for homogeneous setups, but here we really need a single device. */
+ return false;
+ }
+
+ /* Check the denoiser type is supported. */
+ return (device->info.denoisers & type);
+}
+
+/* Find the most suitable device to perform denoising on. Will iterate over possible sub-devices
+ * of a multi-device.
+ *
+ * If there is no device available which supports the given denoiser type, nullptr is returned. */
+static Device *find_best_device(Device *device, DenoiserType type)
+{
+ Device *best_device = nullptr;
+
+ device->foreach_device([&](Device *sub_device) {
+ if ((sub_device->info.denoisers & type) == 0) {
+ return;
+ }
+ if (!best_device) {
+ best_device = sub_device;
+ }
+ else {
+      /* TODO(sergey): Choose the fastest device from the available ones, taking into account
+       * performance of the device and the data transfer cost. */
+ }
+ });
+
+ return best_device;
+}
+
+static unique_ptr<Device> create_denoiser_device(Device *path_trace_device,
+ const uint device_type_mask)
+{
+ const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask);
+ if (device_infos.empty()) {
+ return nullptr;
+ }
+
+ /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on
+ * a physical CUDA device which is already used for rendering. */
+
+ /* TODO(sergey): Choose fastest device for denoising. */
+
+ const DeviceInfo denoiser_device_info = device_infos.front();
+
+ unique_ptr<Device> denoiser_device(
+ Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler));
+
+ if (!denoiser_device) {
+ return nullptr;
+ }
+
+ if (denoiser_device->have_error()) {
+ return nullptr;
+ }
+
+ /* Only need denoising feature, everything else is unused. */
+ if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) {
+ return nullptr;
+ }
+
+ return denoiser_device;
+}
+
+Device *Denoiser::ensure_denoiser_device(Progress *progress)
+{
+ /* The best device has been found already, avoid sequential lookups.
+ * Additionally, avoid device re-creation if it has failed once. */
+ if (denoiser_device_ || device_creation_attempted_) {
+ return denoiser_device_;
+ }
+
+ /* Simple case: rendering happens on a single device which also supports denoiser. */
+ if (is_single_supported_device(path_trace_device_, params_.type)) {
+ denoiser_device_ = path_trace_device_;
+ return denoiser_device_;
+ }
+
+ /* Find best device from the ones which are already used for rendering. */
+ denoiser_device_ = find_best_device(path_trace_device_, params_.type);
+ if (denoiser_device_) {
+ return denoiser_device_;
+ }
+
+ if (progress) {
+ progress->set_status("Loading denoising kernels (may take a few minutes the first time)");
+ }
+
+ device_creation_attempted_ = true;
+
+ const uint device_type_mask = get_device_type_mask();
+ local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask);
+ denoiser_device_ = local_denoiser_device_.get();
+
+ return denoiser_device_;
+}
+
+CCL_NAMESPACE_END
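
Based only on the interface above, a minimal sketch of how a caller drives this class (assuming `device`, `progress`, `buffer_params`, `buffers` and `num_samples` already exist in the calling code, e.g. in the path tracing session):

/* Minimal usage sketch; error handling and parameter setup are intentionally reduced. */
DenoiseParams params;
params.use = true;
params.type = DENOISER_OPENIMAGEDENOISE;

unique_ptr<Denoiser> denoiser = Denoiser::create(device, params);

/* Lazily picks (or creates) a denoiser device and loads its kernels. */
if (denoiser->load_kernels(progress)) {
  /* Denoise the whole buffer at the current sample count, allowing in-place scaling. */
  denoiser->denoise_buffer(buffer_params, buffers, num_samples, true);
}
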
diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h
new file mode 100644
index 00000000000..3101b45e31b
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* TODO(sergey): The integrator folder might not be the best place. It is easy to move files
+ * around if a better place is figured out. */
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "util/util_function.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class RenderBuffers;
+class Progress;
+
+/* Implementation of a specific denoising algorithm.
+ *
+ * This class takes care of breaking down the denoising algorithm into a series of device calls
+ * or calls to an external API to denoise the given input.
+ *
+ * TODO(sergey): Are we better with device or a queue here? */
+class Denoiser {
+ public:
+ /* Create denoiser for the given path trace device.
+ *
+ * Notes:
+ * - The denoiser must be configured. This means that `params.use` must be true.
+ * This is checked in debug builds.
+ * - The device might be MultiDevice. */
+ static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual ~Denoiser() = default;
+
+ void set_params(const DenoiseParams &params);
+ const DenoiseParams &get_params() const;
+
+ /* Create devices and load kernels needed for denoising.
+ * The progress is used to communicate state when kernels actually need to be loaded.
+ *
+ * NOTE: The `progress` is an optional argument, can be nullptr. */
+ virtual bool load_kernels(Progress *progress);
+
+ /* Denoise the entire buffer.
+ *
+ * The buffer parameters denote the effective parameters used during rendering. They could
+ * describe a lower-resolution render into a bigger allocated buffer, which is used in the
+ * viewport during navigation with a non-unit pixel size. Use that instead of
+ * render_buffers->params.
+ *
+ * The buffer might be coming from a "foreign" device compared to the one this denoiser was
+ * created for. This means that in the general case the denoiser will make sure the input data
+ * is available on the denoiser device, perform denoising, and put the data back to the device
+ * where the buffer came from.
+ *
+ * The `num_samples` corresponds to the number of samples in the render buffers. It is used
+ * to scale buffers down to the "final" value in algorithms which don't do automatic exposure,
+ * or which need the "final" value for data passes.
+ *
+ * The `allow_inplace_modification` means that the denoiser is allowed to do in-place
+ * modification of the input passes (e.g. scaling them down). This will lower the memory
+ * footprint of the denoiser but will make the input passes "invalid" from the path tracer's
+ * point of view.
+ *
+ * Returns true when all passes are denoised. Returns false if there is a denoiser error (for
+ * example, caused by a misconfigured denoiser) or when the user requested to cancel rendering. */
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) = 0;
+
+ /* Get a device which is used to perform actual denoising.
+ *
+ * Notes:
+ *
+ * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then,
+ *
+ * - The device can be different from the path tracing device. This happens, for example, when
+ * using OptiX denoiser and rendering on CPU.
+ *
+ * - No thread safety is ensured in this call. This means that it is up to the caller to ensure
+ * that there is no threading conflict between the denoising task lazily initializing the device
+ * and access to this device. */
+ Device *get_denoiser_device() const;
+
+ function<bool(void)> is_cancelled_cb;
+
+ bool is_cancelled() const
+ {
+ if (!is_cancelled_cb) {
+ return false;
+ }
+ return is_cancelled_cb();
+ }
+
+ protected:
+ Denoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ /* Make sure denoising device is initialized. */
+ virtual Device *ensure_denoiser_device(Progress *progress);
+
+ /* Get device type mask which is used to filter available devices when new device needs to be
+ * created. */
+ virtual uint get_device_type_mask() const = 0;
+
+ Device *path_trace_device_;
+ DenoiseParams params_;
+
+ /* Cached pointer to the device on which denoising will happen.
+ * Used to avoid lookup of a device for every denoising request. */
+ Device *denoiser_device_ = nullptr;
+
+ /* Denoiser device which was created to perform denoising in the case none of the rendering
+ * devices are capable of denoising. */
+ unique_ptr<Device> local_denoiser_device_;
+ bool device_creation_attempted_ = false;
+};
+
+CCL_NAMESPACE_END
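
For orientation, here is a minimal sketch of how a caller might drive the Denoiser API declared above. It is illustrative only and not part of this commit; the path_trace_device, progress, buffer_params, render_buffers and num_samples objects are assumed to exist on the caller's side, and progress.get_cancel() is assumed to be the usual Cycles cancellation query.

    /* Hypothetical caller-side usage sketch of the Denoiser API. */
    DenoiseParams params;
    params.use = true;                       /* Must be configured before create(). */
    params.type = DENOISER_OPENIMAGEDENOISE; /* Or DENOISER_OPTIX. */

    unique_ptr<Denoiser> denoiser = Denoiser::create(path_trace_device, params);
    denoiser->is_cancelled_cb = [&]() { return progress.get_cancel(); };

    /* Lazily creates the denoiser device and loads its kernels; may take a while the first time. */
    if (denoiser->load_kernels(&progress)) {
      denoiser->denoise_buffer(
          buffer_params, render_buffers, num_samples, false /* allow_inplace_modification */);
    }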
diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp
new file mode 100644
index 00000000000..8088cfd7800
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_device.h"
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+}
+
+DeviceDenoiser::~DeviceDenoiser()
+{
+ /* Explicit implementation, to allow forward declaration of Device in the header. */
+}
+
+bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ Device *denoiser_device = get_denoiser_device();
+ if (!denoiser_device) {
+ return false;
+ }
+
+ DeviceDenoiseTask task;
+ task.params = params_;
+ task.num_samples = num_samples;
+ task.buffer_params = buffer_params;
+ task.allow_inplace_modification = allow_inplace_modification;
+
+ RenderBuffers local_render_buffers(denoiser_device);
+ bool local_buffer_used = false;
+
+ if (denoiser_device == render_buffers->buffer.device) {
+ /* The device can access an existing buffer pointer. */
+ local_buffer_used = false;
+ task.render_buffers = render_buffers;
+ }
+ else {
+ VLOG(3) << "Creating temporary buffer on denoiser device.";
+
+ DeviceQueue *queue = denoiser_device->get_denoise_queue();
+
+ /* Create a buffer which is accessible by the device used by the denoiser. */
+
+ /* TODO(sergey): Optimize data transfers. For example, only copy denoising-related passes,
+ * ignoring other light and data passes. */
+
+ local_buffer_used = true;
+
+ render_buffers->copy_from_device();
+
+ local_render_buffers.reset(buffer_params);
+
+ /* NOTE: The local buffer is allocated for the exact size of the effective render size, while
+ * the input render buffer is allocated for the lowest resolution divider possible. So it is
+ * important to only copy the actually needed part of the input buffer. */
+ memcpy(local_render_buffers.buffer.data(),
+ render_buffers->buffer.data(),
+ sizeof(float) * local_render_buffers.buffer.size());
+
+ queue->copy_to_device(local_render_buffers.buffer);
+
+ task.render_buffers = &local_render_buffers;
+ task.allow_inplace_modification = true;
+ }
+
+ const bool denoise_result = denoiser_device->denoise_buffer(task);
+
+ if (local_buffer_used) {
+ local_render_buffers.copy_from_device();
+
+ render_buffers_host_copy_denoised(
+ render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params);
+
+ render_buffers->copy_to_device();
+ }
+
+ return denoise_result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h
new file mode 100644
index 00000000000..0fd934dba79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Denoiser which uses a device-specific denoising implementation, such as the OptiX denoiser,
+ * which is implemented as part of the driver of a specific device.
+ *
+ * This implementation makes sure the to-be-denoised buffer is available on the denoising device
+ * and invokes the denoising kernel via the device API. */
+class DeviceDenoiser : public Denoiser {
+ public:
+ DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params);
+ ~DeviceDenoiser();
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
new file mode 100644
index 00000000000..1b5a012ec87
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_oidn.h"
+
+#include <array>
+
+#include "device/device.h"
+#include "device/device_queue.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "render/buffers.h"
+#include "util/util_array.h"
+#include "util/util_logging.h"
+#include "util/util_openimagedenoise.h"
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+thread_mutex OIDNDenoiser::mutex_;
+
+OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+ DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE);
+
+ DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform.";
+}
+
+#ifdef WITH_OPENIMAGEDENOISE
+static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/)
+{
+ OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr);
+ return !oidn_denoiser->is_cancelled();
+}
+#endif
+
+#ifdef WITH_OPENIMAGEDENOISE
+
+class OIDNPass {
+ public:
+ OIDNPass() = default;
+
+ OIDNPass(const BufferParams &buffer_params,
+ const char *name,
+ PassType type,
+ PassMode mode = PassMode::NOISY)
+ : name(name), type(type), mode(mode)
+ {
+ offset = buffer_params.get_pass_offset(type, mode);
+ need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ inline operator bool() const
+ {
+ return name[0] != '\0';
+ }
+
+ /* Name of an image which will be passed to the OIDN library.
+ * Should be one of the following: color, albedo, normal, output.
+ * The albedo and normal images are optional. */
+ const char *name = "";
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ int num_components = -1;
+ bool use_compositing = false;
+ bool use_denoising_albedo = true;
+
+ /* Offset of beginning of this pass in the render buffers. */
+ int offset = -1;
+
+ /* Denotes whether the data is to be scaled down with the number of samples.
+ * This is required for albedo and normal passes. For the color pass OIDN will perform
+ * auto-exposure, so scaling is not needed for the color pass unless adaptive sampling is used.
+ *
+ * NOTE: Do not scale the output pass, as that is required to be a pointer into the original
+ * buffer. All the scaling on the output needed for integration with adaptive sampling will
+ * happen outside of generic pass handling. */
+ bool need_scale = false;
+
+ /* The content of the pass has been pre-filtered. */
+ bool is_filtered = false;
+
+ /* For the scaled passes, the data which holds values of scaled pixels. */
+ array<float> scaled_buffer;
+};
+
+class OIDNDenoiseContext {
+ public:
+ OIDNDenoiseContext(OIDNDenoiser *denoiser,
+ const DenoiseParams &denoise_params,
+ const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ const bool allow_inplace_modification)
+ : denoiser_(denoiser),
+ denoise_params_(denoise_params),
+ buffer_params_(buffer_params),
+ render_buffers_(render_buffers),
+ num_samples_(num_samples),
+ allow_inplace_modification_(allow_inplace_modification),
+ pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT))
+ {
+ if (denoise_params_.use_pass_albedo) {
+ oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO);
+ }
+
+ if (denoise_params_.use_pass_normal) {
+ oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL);
+ }
+ }
+
+ bool need_denoising() const
+ {
+ if (buffer_params_.width == 0 && buffer_params_.height == 0) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /* Make the guiding passes available for the sequential denoising of various passes. */
+ void read_guiding_passes()
+ {
+ read_guiding_pass(oidn_albedo_pass_);
+ read_guiding_pass(oidn_normal_pass_);
+ }
+
+ void denoise_pass(const PassType pass_type)
+ {
+ OIDNPass oidn_color_pass(buffer_params_, "color", pass_type);
+ if (oidn_color_pass.offset == PASS_UNUSED) {
+ return;
+ }
+
+ if (oidn_color_pass.use_denoising_albedo) {
+ if (albedo_replaced_with_fake_) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+
+ OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED);
+ if (oidn_output_pass.offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass);
+
+ oidn::DeviceRef oidn_device = oidn::newDevice();
+ oidn_device.commit();
+
+ /* Create a filter for denoising a beauty (color) image using prefiltered auxiliary images too.
+ */
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_input_pass(oidn_filter, oidn_color_access_pass);
+ set_guiding_passes(oidn_filter, oidn_color_pass);
+ set_output_pass(oidn_filter, oidn_output_pass);
+ oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_);
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
+ if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE ||
+ denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) {
+ oidn_filter.set("cleanAux", true);
+ }
+ oidn_filter.commit();
+
+ filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_);
+ filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_);
+
+ /* Filter the beauty image. */
+ oidn_filter.execute();
+
+ /* Check for errors. */
+ const char *error_message;
+ const oidn::Error error = oidn_device.getError(error_message);
+ if (error != oidn::Error::None && error != oidn::Error::Cancelled) {
+ LOG(ERROR) << "OpenImageDenoise error: " << error_message;
+ }
+
+ postprocess_output(oidn_color_pass, oidn_output_pass);
+ }
+
+ protected:
+ void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass)
+ {
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass ||
+ oidn_pass.is_filtered) {
+ return;
+ }
+
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_pass(oidn_filter, oidn_pass);
+ set_output_pass(oidn_filter, oidn_pass);
+ oidn_filter.commit();
+ oidn_filter.execute();
+
+ oidn_pass.is_filtered = true;
+ }
+
+ /* Make pixels of a guiding pass available to the denoiser. */
+ void read_guiding_pass(OIDNPass &oidn_pass)
+ {
+ if (!oidn_pass) {
+ return;
+ }
+
+ DCHECK(!oidn_pass.use_compositing);
+
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE &&
+ !is_pass_scale_needed(oidn_pass)) {
+ /* Pass data is available as-is from the render buffers. */
+ return;
+ }
+
+ if (allow_inplace_modification_) {
+ scale_pass_in_render_buffers(oidn_pass);
+ return;
+ }
+
+ read_pass_pixels_into_buffer(oidn_pass);
+ }
+
+ /* Special reader of the input pass.
+ * To save memory it will read pixels into the output, and let the denoiser perform an
+ * in-place operation. */
+ OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ const bool use_compositing = oidn_input_pass.use_compositing;
+
+ /* Simple case: no compositing is involved, no scaling is needed.
+ * The pass pixels will be referenced as-is, without extra processing. */
+ if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) {
+ return oidn_input_pass;
+ }
+
+ float *buffer_data = render_buffers_->buffer.data();
+ float *pass_data = buffer_data + oidn_output_pass.offset;
+
+ PassAccessor::Destination destination(pass_data, 3);
+ destination.pixel_stride = buffer_params_.pass_stride;
+
+ read_pass_pixels(oidn_input_pass, destination);
+
+ OIDNPass oidn_input_pass_at_output = oidn_input_pass;
+ oidn_input_pass_at_output.offset = oidn_output_pass.offset;
+
+ return oidn_input_pass_at_output;
+ }
+
+ /* Read pass pixels using PassAccessor into the given destination. */
+ void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination)
+ {
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = oidn_pass.type;
+ pass_access_info.mode = oidn_pass.mode;
+ pass_access_info.offset = oidn_pass.offset;
+
+ /* The denoiser operates on passes which are used to calculate the approximation, and is never
+ * used on the approximation itself. The latter is not even possible because OIDN does not
+ * support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* OIDN will perform auto-exposure, so it is not required to know the exact exposure configured
+ * by the user. What is important is to use the same exposure for read and write access of the
+ * pass pixels. */
+ const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_);
+
+ pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination);
+ }
+
+ /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass. */
+ void read_pass_pixels_into_buffer(OIDNPass &oidn_pass)
+ {
+ VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " ("
+ << pass_type_as_string(oidn_pass.type) << ")";
+
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ array<float> &scaled_buffer = oidn_pass.scaled_buffer;
+ scaled_buffer.resize(width * height * 3);
+
+ const PassAccessor::Destination destination(scaled_buffer.data(), 3);
+
+ read_pass_pixels(oidn_pass, destination);
+ }
+
+ /* Set OIDN image to reference pixels from the given render buffer pass.
+ * No transform to the pixels is done, no additional memory is used. */
+ void set_pass_referenced(oidn::FilterRef &oidn_filter,
+ const char *name,
+ const OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+
+ const int64_t pixel_index = offset + x + y * stride;
+ const int64_t buffer_offset = pixel_index * pass_stride;
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ oidn_filter.setImage(name,
+ buffer_data + buffer_offset + oidn_pass.offset,
+ oidn::Format::Float3,
+ width,
+ height,
+ 0,
+ pass_stride * sizeof(float),
+ stride * pass_stride * sizeof(float));
+ }
+
+ void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ oidn_filter.setImage(
+ name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0);
+ }
+
+ void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+ void set_pass(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ if (oidn_pass.scaled_buffer.empty()) {
+ set_pass_referenced(oidn_filter, name, oidn_pass);
+ }
+ else {
+ set_pass_from_buffer(oidn_filter, name, oidn_pass);
+ }
+ }
+
+ void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+
+ void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ if (oidn_albedo_pass_) {
+ if (oidn_pass.use_denoising_albedo) {
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+ else {
+ /* NOTE: The OpenImageDenoise library implicitly expects an albedo pass when a normal pass has
+ * been provided. */
+ set_fake_albedo_pass(oidn_filter);
+ }
+ }
+
+ if (oidn_normal_pass_) {
+ set_pass(oidn_filter, oidn_normal_pass_);
+ }
+ }
+
+ void set_fake_albedo_pass(oidn::FilterRef &oidn_filter)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ if (!albedo_replaced_with_fake_) {
+ const int64_t num_pixel_components = width * height * 3;
+ oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components);
+
+ for (int i = 0; i < num_pixel_components; ++i) {
+ oidn_albedo_pass_.scaled_buffer[i] = 0.5f;
+ }
+
+ albedo_replaced_with_fake_ = true;
+ }
+
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+
+ void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, "output", oidn_pass);
+ }
+
+ /* Scale output pass to match adaptive sampling per-pixel scale, as well as bring alpha channel
+ * back. */
+ void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components);
+
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+ const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing;
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *denoised_pixel = buffer_pixel + oidn_output_pass.offset;
+
+ if (need_scale) {
+ const float pixel_scale = has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_;
+
+ denoised_pixel[0] = denoised_pixel[0] * pixel_scale;
+ denoised_pixel[1] = denoised_pixel[1] * pixel_scale;
+ denoised_pixel[2] = denoised_pixel[2] * pixel_scale;
+ }
+
+ if (oidn_output_pass.num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!oidn_input_pass.use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes)
+ * or do not have transparency (shadow catcher). Implicitly rely on this, as it
+ * simplifies the logic and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assign zero since this is the default alpha value for 3-component passes, and it
+ * is an opaque pixel for 4-component passes. */
+ denoised_pixel[3] = 0;
+ }
+ }
+ }
+ }
+
+ bool is_pass_scale_needed(OIDNPass &oidn_pass) const
+ {
+ if (pass_sample_count_ != PASS_UNUSED) {
+ /* With adaptive sampling pixels will have a different number of samples in them, so the pass
+ * always needs to be scaled to make pixels uniformly sampled. */
+ return true;
+ }
+
+ if (!oidn_pass.need_scale) {
+ return false;
+ }
+
+ if (num_samples_ == 1) {
+ /* Avoid scaling if there is only one sample, to save time (so we don't divide the
+ * buffer by 1). */
+ return false;
+ }
+
+ return true;
+ }
+
+ void scale_pass_in_render_buffers(OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *pass_pixel = buffer_pixel + oidn_pass.offset;
+
+ const float pixel_scale = 1.0f / (has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_);
+
+ pass_pixel[0] = pass_pixel[0] * pixel_scale;
+ pass_pixel[1] = pass_pixel[1] * pixel_scale;
+ pass_pixel[2] = pass_pixel[2] * pixel_scale;
+ }
+ }
+ }
+
+ OIDNDenoiser *denoiser_ = nullptr;
+
+ const DenoiseParams &denoise_params_;
+ const BufferParams &buffer_params_;
+ RenderBuffers *render_buffers_ = nullptr;
+ int num_samples_ = 0;
+ bool allow_inplace_modification_ = false;
+ int pass_sample_count_ = PASS_UNUSED;
+
+ /* Optional albedo and normal passes, reused by denoising of different pass types. */
+ OIDNPass oidn_albedo_pass_;
+ OIDNPass oidn_normal_pass_;
+
+ /* For passes which don't need the albedo channel for denoising we replace the actual albedo
+ * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+ * the fake values, and denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake_ = false;
+};
+#endif
+
+static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers)
+{
+ Device *device = render_buffers->buffer.device;
+ if (device->info.has_gpu_queue) {
+ return device->gpu_queue_create();
+ }
+ return nullptr;
+}
+
+static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_from_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_from_device();
+ }
+}
+
+static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_to_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_to_device();
+ }
+}
+
+bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ thread_scoped_lock lock(mutex_);
+
+ /* Make sure the host-side data is available for denoising. */
+ unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers);
+ copy_render_buffers_from_device(queue, render_buffers);
+
+#ifdef WITH_OPENIMAGEDENOISE
+ OIDNDenoiseContext context(
+ this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification);
+
+ if (context.need_denoising()) {
+ context.read_guiding_passes();
+
+ const std::array<PassType, 3> passes = {
+ {/* Passes which will use real albedo when it is available. */
+ PASS_COMBINED,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ /* Passes which do not need albedo and hence if real is present it needs to become fake.
+ */
+ PASS_SHADOW_CATCHER}};
+
+ for (const PassType pass_type : passes) {
+ context.denoise_pass(pass_type);
+ if (is_cancelled()) {
+ return false;
+ }
+ }
+
+ /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code
+ * copies data from the device it doesn't overwrite the denoiser buffers. */
+ copy_render_buffers_to_device(queue, render_buffers);
+ }
+#endif
+
+ /* This code is not supposed to run when compiled without OIDN support, so we can assume that if
+ * we made it here all passes are properly denoised. */
+ return true;
+}
+
+uint OIDNDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_CPU;
+}
+
+CCL_NAMESPACE_END
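
The OIDN-specific part of denoise_pass() above boils down to the standard OpenImageDenoise call sequence: create a device, create an "RT" filter, attach the color/albedo/normal/output images, set the hdr/srgb/cleanAux options, commit and execute, then query errors. A self-contained sketch of that sequence with plain float buffers (illustrative only, not part of this commit):

    #include <OpenImageDenoise/oidn.hpp>
    #include <vector>

    /* Denoise a tightly packed linear RGB float image in place. */
    static void denoise_rgb_in_place(std::vector<float> &color, int width, int height)
    {
      oidn::DeviceRef device = oidn::newDevice();
      device.commit();

      oidn::FilterRef filter = device.newFilter("RT");
      /* Input and output reference the same memory: OIDN supports in-place filtering. */
      filter.setImage("color", color.data(), oidn::Format::Float3, width, height);
      filter.setImage("output", color.data(), oidn::Format::Float3, width, height);
      filter.set("hdr", true);   /* The beauty image is HDR. */
      filter.set("srgb", false); /* The data is linear, not sRGB encoded. */
      filter.commit();
      filter.execute();

      const char *error_message;
      if (device.getError(error_message) != oidn::Error::None) {
        /* Report error_message through the caller's logging of choice. */
      }
    }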
diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h
new file mode 100644
index 00000000000..566e761ae79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Implementation of denoising API which uses OpenImageDenoise library. */
+class OIDNDenoiser : public Denoiser {
+ public:
+ /* Forward-declared state which might be using compile-flag-specific fields, such as
+ * OpenImageDenoise device and filter handles. */
+ class State;
+
+ OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+
+ /* We only perform one denoising operation at a time, since OpenImageDenoise itself is
+ * multithreaded. Use this mutex whenever images are passed to OIDN and need to be denoised. */
+ static thread_mutex mutex_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/integrator/denoiser_optix.cpp
index ed64ae01aae..5f9de23bfe6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/integrator/denoiser_optix.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,21 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_direct_lighting.h"
+#include "integrator/denoiser_optix.h"
-#define KERNEL_NAME direct_lighting
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "device/device.h"
+#include "device/device_denoise.h"
+CCL_NAMESPACE_BEGIN
+
+OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : DeviceDenoiser(path_trace_device, params)
+{
+}
+
+uint OptiXDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_OPTIX;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/integrator/denoiser_optix.h
index c314dc96c33..a8df770ecf7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/integrator/denoiser_optix.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_lamp_emission.h"
+#pragma once
-#define KERNEL_NAME lamp_emission
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "integrator/denoiser_device.h"
+CCL_NAMESPACE_BEGIN
+
+class OptiXDenoiser : public DeviceDenoiser {
+ public:
+ OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
new file mode 100644
index 00000000000..87c048b1fa5
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/kernel_types.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Pass input information.
+ */
+
+PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass)
+ : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass destination.
+ */
+
+PassAccessor::Destination::Destination(float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels)
+ : Destination(pass_type)
+{
+ pixels_half_rgba = pixels;
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type)
+{
+ const PassInfo pass_info = Pass::get_info(pass_type);
+ num_components = pass_info.num_components;
+}
+
+/* --------------------------------------------------------------------
+ * Pass source.
+ */
+
+PassAccessor::Source::Source(const float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessor.
+ */
+
+PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples)
+ : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples)
+{
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ return get_render_tile_pixels(render_buffers, render_buffers->params, destination);
+}
+
+static void pad_pixels(const BufferParams &buffer_params,
+ const PassAccessor::Destination &destination,
+ const int src_num_components)
+{
+ /* When requesting a single channel pass as RGBA, or RGB pass as RGBA,
+ * fill in the additional components for convenience. */
+ const int dest_num_components = destination.num_components;
+
+ if (src_num_components >= dest_num_components) {
+ return;
+ }
+
+ const size_t size = buffer_params.width * buffer_params.height;
+ if (destination.pixels) {
+ float *pixel = destination.pixels;
+
+ for (size_t i = 0; i < size; i++, pixel += dest_num_components) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[1] = pixel[0];
+ pixel[2] = pixel[0];
+ }
+ if (dest_num_components >= 4) {
+ pixel[3] = 1.0f;
+ }
+ }
+ }
+
+ if (destination.pixels_half_rgba) {
+ const half one = float_to_half(1.0f);
+ half4 *pixel = destination.pixels_half_rgba;
+
+ for (size_t i = 0; i < size; i++, pixel++) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[0].y = pixel[0].x;
+ pixel[0].z = pixel[0].x;
+ }
+ if (dest_num_components >= 4) {
+ pixel[0].w = one;
+ }
+ }
+ }
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ if (pass_access_info_.offset == PASS_UNUSED) {
+ return false;
+ }
+
+ const PassType type = pass_access_info_.type;
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo);
+
+ if (pass_info.num_components == 1) {
+ /* Single channel passes. */
+ if (mode == PassMode::DENOISED) {
+ /* Denoised passes store their final pixels, no need for a special calculation. */
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_RENDER_TIME) {
+ /* TODO(sergey): Needs implementation. */
+ }
+ else if (type == PASS_DEPTH) {
+ get_pass_depth(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_MIST) {
+ get_pass_mist(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SAMPLE_COUNT) {
+ get_pass_sample_count(render_buffers, buffer_params, destination);
+ }
+ else {
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ }
+ else if (type == PASS_MOTION) {
+ /* Motion pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components";
+ get_pass_motion(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_CRYPTOMATTE) {
+ /* Cryptomatte pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components";
+ get_pass_cryptomatte(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* RGB, RGBA and vector passes. */
+ DCHECK(destination.num_components == 3 || destination.num_components == 4)
+ << pass_type_as_string(type) << " pass must have 3 or 4 components";
+
+ if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) {
+ /* Denoised matte with shadow needs to do a calculation (it will use the denoised shadow catcher
+ * pass to approximate the shadow with). */
+ get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) {
+ /* Shadow catcher pass. */
+ get_pass_shadow_catcher(render_buffers, buffer_params, destination);
+ }
+ else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE ||
+ pass_info.indirect_type != PASS_NONE) &&
+ mode != PassMode::DENOISED) {
+ /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */
+ get_pass_light_path(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes that need no special computation, or denoised passes that already
+ * had the computation done. */
+ if (pass_info.num_components == 3) {
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (pass_info.num_components == 4) {
+ if (destination.num_components == 3) {
+ /* Special case for denoiser access of RGBA passes ignoring alpha channel. */
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER ||
+ type == PASS_SHADOW_CATCHER_MATTE) {
+ /* Passes with transparency as 4th component. */
+ get_pass_combined(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes with alpha as 4th component. */
+ get_pass_float4(render_buffers, buffer_params, destination);
+ }
+ }
+ }
+ }
+
+ pad_pixels(buffer_params, destination, pass_info.num_components);
+
+ return true;
+}
+
+void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ kfilm_convert->pass_offset = pass_access_info_.offset;
+ kfilm_convert->pass_stride = buffer_params.pass_stride;
+
+ kfilm_convert->pass_use_exposure = pass_info.use_exposure;
+ kfilm_convert->pass_use_filter = pass_info.use_filter;
+
+ /* TODO(sergey): Some of the passes need to become denoised when the denoised pass is accessed. */
+ if (pass_info.direct_type != PASS_NONE) {
+ kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type);
+ }
+ kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type);
+ kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type);
+
+ kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED);
+ kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset(
+ PASS_ADAPTIVE_AUX_BUFFER);
+ kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT);
+ kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode);
+ kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_MATTE, mode);
+
+ /* Background is not denoised, so always use noisy pass. */
+ kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND);
+
+ if (pass_info.use_filter) {
+ kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f;
+ }
+ else {
+ kfilm_convert->scale = 1.0f;
+ }
+
+ if (pass_info.use_exposure) {
+ kfilm_convert->exposure = exposure_;
+ }
+ else {
+ kfilm_convert->exposure = 1.0f;
+ }
+
+ kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure;
+
+ kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher;
+ kfilm_convert->use_approximate_shadow_catcher_background =
+ pass_access_info_.use_approximate_shadow_catcher_background;
+ kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels;
+
+ kfilm_convert->num_components = destination.num_components;
+ kfilm_convert->pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ kfilm_convert->is_denoised = (mode == PassMode::DENOISED);
+}
+
+bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source)
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ const PassInfo pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ const BufferParams &buffer_params = render_buffers->params;
+
+ float *buffer_data = render_buffers->buffer.data();
+ const int size = buffer_params.width * buffer_params.height;
+
+ const int out_stride = buffer_params.pass_stride;
+ const int in_stride = source.num_components;
+ const int num_components_to_copy = min(source.num_components, pass_info.num_components);
+
+ float *out = buffer_data + pass_access_info_.offset;
+ const float *in = source.pixels + source.offset * in_stride;
+
+ for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
+ memcpy(out, in, sizeof(float) * num_components_to_copy);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
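
The scaling set up by init_kernel_film_convert() above reduces, for a simple filtered and exposure-affected pass, to multiplying the accumulated value by scale_exposure = (1 / num_samples) * exposure. A small sketch of that arithmetic (hypothetical helper, not part of this commit):

    /* Sketch of the per-pixel scale that init_kernel_film_convert() computes for a filtered,
     * exposure-affected pass. */
    struct SimpleFilmConvert {
      float scale;          /* 1 / num_samples for filtered passes, 1 otherwise. */
      float exposure;       /* Film exposure for exposure-affected passes, 1 otherwise. */
      float scale_exposure; /* Product of the two, applied to accumulated pass values. */
    };

    static SimpleFilmConvert make_simple_film_convert(const int num_samples, const float exposure)
    {
      SimpleFilmConvert kfc;
      kfc.scale = num_samples != 0 ? 1.0f / num_samples : 0.0f;
      kfc.exposure = exposure;
      kfc.scale_exposure = kfc.scale * kfc.exposure;
      return kfc;
    }

    /* Example: 64 samples at exposure 1.5 multiply every accumulated channel by
     * (1 / 64) * 1.5 = 0.0234375 to produce the displayed value. */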
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
new file mode 100644
index 00000000000..624bf7d0b2c
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/pass.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class RenderBuffers;
+class BufferPass;
+class BufferParams;
+struct KernelFilmConvert;
+
+/* Helper class which allows access to pass data.
+ * It is designed so that it is created once when the pass data is known, and then pixels get
+ * progressively updated from various render buffers. */
+class PassAccessor {
+ public:
+ class PassAccessInfo {
+ public:
+ PassAccessInfo() = default;
+ explicit PassAccessInfo(const BufferPass &pass);
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ bool include_albedo = false;
+ int offset = -1;
+
+ /* For the shadow catcher matte pass: whether to approximate shadow catcher pass into its
+ * matte pass, so that both artificial objects and shadows can be alpha-overed onto a backdrop.
+ */
+ bool use_approximate_shadow_catcher = false;
+
+ /* When the approximate shadow catcher matte is used, alpha-over the result on top of the
+ * background. */
+ bool use_approximate_shadow_catcher_background = false;
+
+ bool show_active_pixels = false;
+ };
+
+ class Destination {
+ public:
+ Destination() = default;
+ Destination(float *pixels, int num_components);
+ Destination(const PassType pass_type, half4 *pixels);
+
+ /* Destination will be initialized with the number of components which is native for the given
+ * pass type. */
+ explicit Destination(const PassType pass_type);
+
+ /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ float *pixels = nullptr;
+ half4 *pixels_half_rgba = nullptr;
+
+ /* Device-side pointers. */
+ device_ptr d_pixels = 0;
+ device_ptr d_pixels_half_rgba = 0;
+
+ /* Number of components per pixel in the floating-point destination.
+ * Is ignored for half4 destination (where number of components is implied to be 4). */
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of the pixel storage.
+ * Allows getting pixels of the render buffer into a partial slice of the destination. */
+ int offset = 0;
+
+ /* Number of floats per pixel. When zero, it is the same as `num_components`.
+ *
+ * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component
+ * half-floats. */
+ int pixel_stride = 0;
+
+ /* Row stride in pixel elements:
+ * - For the float destination the stride is the number of floats per row.
+ * - For the half4 destination the stride is the number of half4 per row. */
+ int stride = 0;
+ };
+
+ class Source {
+ public:
+ Source() = default;
+ Source(const float *pixels, int num_components);
+
+ /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ const float *pixels = nullptr;
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of the pixel storage.
+ * Allows getting pixels of the render buffer into a partial slice of the destination. */
+ int offset = 0;
+ };
+
+ PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples);
+
+ virtual ~PassAccessor() = default;
+
+ /* Get pass data from the given render buffers, perform needed filtering, and store result into
+ * the pixels.
+ * The result is stored sequentially starting from the very beginning of the pixels memory. */
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const;
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+ /* Set pass data for the given render buffers. Used for baking to read from passes. */
+ bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source);
+
+ protected:
+ virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const = 0;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+
+ PassAccessInfo pass_access_info_;
+
+ float exposure_ = 0.0f;
+ int num_samples_ = 0;
+};
+
+CCL_NAMESPACE_END
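
For readability, the DECLARE_PASS_ACCESSOR macro above simply declares one pure virtual accessor per pass kind. For example, DECLARE_PASS_ACCESSOR(depth) expands to:

    virtual void get_pass_depth(const RenderBuffers *render_buffers,
                                const BufferParams &buffer_params,
                                const Destination &destination) const = 0;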
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
new file mode 100644
index 00000000000..3c6691f6d43
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_film.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Kernel processing.
+ */
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ if (destination.pixels) {
+ /* NOTE: No overlays are applied since they are not used for final renders.
+ * Can be supported via some sort of specialization to avoid code duplication. */
+
+ run_get_pass_kernel_processor_float(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+
+ if (destination.pixels_half_rgba) {
+ /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
+
+ if (destination.num_components == 1) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float pixel;
+ processor(kfilm_convert, buffer, &pixel);
+
+ pixel_rgba[0] = pixel;
+ pixel_rgba[1] = pixel;
+ pixel_rgba[2] = pixel;
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 3) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 4) {
+ run_get_pass_kernel_processor_half_rgba(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+ }
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ const float *buffer_data = render_buffers->buffer.data();
+ const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+ float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+ }
+ });
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ const float *buffer_data = render_buffers->buffer.data();
+
+ half4 *dst_start = destination.pixels_half_rgba + destination.offset;
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ half4 *dst_row_start = dst_start + y * destination_stride;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ half4 *pixel_half_rgba = dst_row_start + x;
+ float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+ }
+ });
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass) \
+ void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_get_pass_kernel_processor( \
+ render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth)
+DEFINE_PASS_ACCESSOR(mist)
+DEFINE_PASS_ACCESSOR(sample_count)
+DEFINE_PASS_ACCESSOR(float)
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path)
+DEFINE_PASS_ACCESSOR(shadow_catcher)
+DEFINE_PASS_ACCESSOR(float3)
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion)
+DEFINE_PASS_ACCESSOR(cryptomatte)
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+DEFINE_PASS_ACCESSOR(combined)
+DEFINE_PASS_ACCESSOR(float4)
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
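
For reference, the DEFINE_PASS_ACCESSOR macro in the file above is a thin code generator: each pass name expands into a wrapper that forwards to the templated processor together with the matching per-pixel film kernel. For example, DEFINE_PASS_ACCESSOR(depth) expands to the equivalent of:

void PassAccessorCPU::get_pass_depth(const RenderBuffers *render_buffers,
                                     const BufferParams &buffer_params,
                                     const Destination &destination) const
{
  run_get_pass_kernel_processor(
      render_buffers, buffer_params, destination, film_get_pass_pixel_depth);
}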
diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h
new file mode 100644
index 00000000000..0313dc5bb0d
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelFilmConvert;
+
+/* Pass accessor implementation for CPU side. */
+class PassAccessorCPU : public PassAccessor {
+ public:
+ using PassAccessor::PassAccessor;
+
+ protected:
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+};
+
+CCL_NAMESPACE_END
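
As a rough usage sketch (not part of the patch): the per-pass getters above are protected overrides, so a caller reads passes through the base-class entry point, as done in path_trace.cpp later in this commit. The construction of PassAccessInfo and Destination is defined in pass_accessor.h, outside this excerpt, so the placeholder variables here are assumptions.

/* Minimal sketch, assuming the inherited PassAccessor constructor shown for the
 * GPU variant in this patch; `pass_access_info`, `num_samples`, `render_buffers`
 * and `destination` are placeholders set up by the caller. */
const PassAccessorCPU pass_accessor(pass_access_info, /*exposure=*/1.0f, num_samples);
pass_accessor.get_render_tile_pixels(render_buffers, destination);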
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
new file mode 100644
index 00000000000..eb80ba99655
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_gpu.h"
+
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples)
+ : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Kernel execution.
+ */
+
+void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ if (destination.d_pixels) {
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+ if (destination.d_pixels_half_rgba) {
+ const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel_half_float, work_size, args);
+ }
+
+ queue_->synchronize();
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \
+ void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_film_convert_kernels( \
+ DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth, DEPTH);
+DEFINE_PASS_ACCESSOR(mist, MIST);
+DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT);
+DEFINE_PASS_ACCESSOR(float, FLOAT);
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH);
+DEFINE_PASS_ACCESSOR(float3, FLOAT3);
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion, MOTION);
+DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE);
+DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER);
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW);
+DEFINE_PASS_ACCESSOR(combined, COMBINED);
+DEFINE_PASS_ACCESSOR(float4, FLOAT4);
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
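
One detail worth noting in run_film_convert_kernels() above: the half-float variant of each film convert kernel is obtained as `kernel + 1`, which only works if the DeviceKernel enum declares the half-RGBA entry immediately after the float entry for every pass. A sketch of that assumed ordering (illustrative names only; the real enum lives in kernel/kernel_types.h, outside this excerpt):

/* Sketch of the enum layout that `static_cast<DeviceKernel>(kernel + 1)` relies
 * on: every FILM_CONVERT kernel is assumed to be immediately followed by its
 * HALF_RGBA counterpart. Entries shown here are illustrative. */
enum ExampleDeviceKernel {
  EXAMPLE_DEVICE_KERNEL_FILM_CONVERT_DEPTH,
  EXAMPLE_DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA,
  EXAMPLE_DEVICE_KERNEL_FILM_CONVERT_COMBINED,
  EXAMPLE_DEVICE_KERNEL_FILM_CONVERT_COMBINED_HALF_RGBA,
};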
diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h
new file mode 100644
index 00000000000..bc37e4387f3
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+/* Pass accessor implementation for GPU side. */
+class PassAccessorGPU : public PassAccessor {
+ public:
+ PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples);
+
+ protected:
+ void run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth);
+ DECLARE_PASS_ACCESSOR(mist);
+ DECLARE_PASS_ACCESSOR(sample_count);
+ DECLARE_PASS_ACCESSOR(float);
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path);
+ DECLARE_PASS_ACCESSOR(float3);
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion);
+ DECLARE_PASS_ACCESSOR(cryptomatte);
+ DECLARE_PASS_ACCESSOR(shadow_catcher);
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow);
+ DECLARE_PASS_ACCESSOR(combined);
+ DECLARE_PASS_ACCESSOR(float4);
+
+#undef DECLARE_PASS_ACCESSOR
+
+ DeviceQueue *queue_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
new file mode 100644
index 00000000000..6c02316ac2b
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -0,0 +1,1147 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace.h"
+
+#include "device/cpu/device.h"
+#include "device/device.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/render_scheduler.h"
+#include "render/gpu_display.h"
+#include "render/pass.h"
+#include "render/scene.h"
+#include "render/tile.h"
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTrace::PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager)
+ : device_(device),
+ device_scene_(device_scene),
+ render_scheduler_(render_scheduler),
+ tile_manager_(tile_manager)
+{
+ DCHECK_NE(device_, nullptr);
+
+ {
+ vector<DeviceInfo> cpu_devices;
+ device_cpu_info(cpu_devices);
+
+ cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler));
+ }
+
+ /* Create path tracing work in advance, so that it can be reused by incremental sampling as much
+ * as possible. */
+ device_->foreach_device([&](Device *path_trace_device) {
+ path_trace_works_.emplace_back(PathTraceWork::create(
+ path_trace_device, film, device_scene, &render_cancel_.is_requested));
+ });
+
+ work_balance_infos_.resize(path_trace_works_.size());
+ work_balance_do_initial(work_balance_infos_);
+
+ render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1);
+}
+
+PathTrace::~PathTrace()
+{
+ /* Destroy any GPU resources which were used for graphics interop.
+ * Access to the GPUDisplay is needed as it is the only source of the drawing context which is
+ * used for interop. */
+ if (gpu_display_) {
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->destroy_gpu_resources(gpu_display_.get());
+ }
+ }
+}
+
+void PathTrace::load_kernels()
+{
+ if (denoiser_) {
+ denoiser_->load_kernels(progress_);
+ }
+}
+
+void PathTrace::alloc_work_memory()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->alloc_work_memory();
+ }
+}
+
+bool PathTrace::ready_to_reset()
+{
+ /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
+ * display. If there is no such display, the logic here will break. */
+ DCHECK(gpu_display_);
+
+ /* The logic here tries to provide the behavior which feels the most interactive to artists.
+ * The general idea is to be able to reset as quickly as possible, while still providing
+ * interactive feedback.
+ *
+ * If the render result was ever drawn after previous reset, consider that reset is now possible.
+ * This way camera navigation gives the quickest feedback of rendered pixels, regardless of
+ * whether CPU or GPU drawing pipeline is used.
+ *
+ * Consider a reset happening after a redraw "slow" enough to not clog anything. This is a bit
+ * arbitrary, but seems to work very well with viewport navigation in Blender. */
+
+ if (did_draw_after_reset_) {
+ return true;
+ }
+
+ return false;
+}
+
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+{
+ if (big_tile_params_.modified(big_tile_params)) {
+ big_tile_params_ = big_tile_params;
+ render_state_.need_reset_params = true;
+ }
+
+ full_params_ = full_params;
+
+ /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation.
+ * It is required to be informed about the reset whenever it happens, so that the redraw state
+ * tracking is properly updated. */
+ if (gpu_display_) {
+ gpu_display_->reset(full_params);
+ }
+
+ render_state_.has_denoised_result = false;
+ render_state_.tile_written = false;
+
+ did_draw_after_reset_ = false;
+}
+
+void PathTrace::device_free()
+{
+ /* Free render buffers used by the path trace work to reduce memory peak. */
+ BufferParams empty_params;
+ empty_params.pass_stride = 0;
+ empty_params.update_offset_stride();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->get_render_buffers()->reset(empty_params);
+ }
+ render_state_.need_reset_params = true;
+}
+
+void PathTrace::set_progress(Progress *progress)
+{
+ progress_ = progress;
+}
+
+void PathTrace::render(const RenderWork &render_work)
+{
+ /* Indicate that rendering has started and that it can be requested to cancel. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+ render_cancel_.is_rendering = true;
+ }
+
+ render_pipeline(render_work);
+
+ /* Indicate that rendering has finished, making it so the thread which requested `cancel()` can
+ * carry on. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ render_cancel_.is_rendering = false;
+ render_cancel_.condition.notify_one();
+ }
+}
+
+void PathTrace::render_pipeline(RenderWork render_work)
+{
+ /* NOTE: Only check for "instant" cancel here. The user-requested cancel via progress is
+ * checked in Session, and in the event of a cancel the work is to be finished here. */
+
+ render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes !=
+ 0);
+
+ render_init_kernel_execution();
+
+ render_scheduler_.report_work_begin(render_work);
+
+ init_render_buffers(render_work);
+
+ rebalance(render_work);
+
+ path_trace(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ adaptive_sample(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ cryptomatte_postprocess(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ denoise(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ write_tile_buffer(render_work);
+ update_display(render_work);
+
+ progress_update_if_needed(render_work);
+
+ finalize_full_buffer_on_disk(render_work);
+}
+
+void PathTrace::render_init_kernel_execution()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->init_execution();
+ }
+}
+
+/* TODO(sergey): Look into `std::function` rather than using a template. Should not have a
+ * measurable performance impact at runtime, but will make compilation faster and the binary
+ * somewhat smaller. */
+template<typename Callback>
+static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works,
+ const vector<WorkBalanceInfo> &work_balance_infos,
+ const BufferParams &buffer_params,
+ const Callback &callback)
+{
+ const int num_works = path_trace_works.size();
+ const int height = buffer_params.height;
+
+ int current_y = 0;
+ for (int i = 0; i < num_works; ++i) {
+ const double weight = work_balance_infos[i].weight;
+ const int slice_height = max(lround(height * weight), 1);
+
+ /* Disallow negative values to deal with situations when there are more compute devices than
+ * scanlines. */
+ const int remaining_height = max(0, height - current_y);
+
+ BufferParams slice_params = buffer_params;
+ slice_params.full_y = buffer_params.full_y + current_y;
+ if (i < num_works - 1) {
+ slice_params.height = min(slice_height, remaining_height);
+ }
+ else {
+ slice_params.height = remaining_height;
+ }
+
+ slice_params.update_offset_stride();
+
+ callback(path_trace_works[i].get(), slice_params);
+
+ current_y += slice_params.height;
+ }
+}
+
+void PathTrace::update_allocated_work_buffer_params()
+{
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ big_tile_params_,
+ [](PathTraceWork *path_trace_work, const BufferParams &params) {
+ RenderBuffers *buffers = path_trace_work->get_render_buffers();
+ buffers->reset(params);
+ });
+}
+
+static BufferParams scale_buffer_params(const BufferParams &params, int resolution_divider)
+{
+ BufferParams scaled_params = params;
+
+ scaled_params.width = max(1, params.width / resolution_divider);
+ scaled_params.height = max(1, params.height / resolution_divider);
+ scaled_params.full_x = params.full_x / resolution_divider;
+ scaled_params.full_y = params.full_y / resolution_divider;
+ scaled_params.full_width = params.full_width / resolution_divider;
+ scaled_params.full_height = params.full_height / resolution_divider;
+
+ scaled_params.update_offset_stride();
+
+ return scaled_params;
+}
+
+void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work)
+{
+ const int resolution_divider = render_work.resolution_divider;
+
+ const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
+ const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
+ resolution_divider);
+
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ scaled_big_tile_params,
+ [&](PathTraceWork *path_trace_work, const BufferParams &params) {
+ path_trace_work->set_effective_buffer_params(
+ scaled_full_params, scaled_big_tile_params, params);
+ });
+
+ render_state_.effective_big_tile_params = scaled_big_tile_params;
+}
+
+void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work)
+{
+ if (render_state_.need_reset_params) {
+ update_allocated_work_buffer_params();
+ }
+
+ if (render_state_.need_reset_params ||
+ render_state_.resolution_divider != render_work.resolution_divider) {
+ update_effective_work_buffer_params(render_work);
+ }
+
+ render_state_.resolution_divider = render_work.resolution_divider;
+ render_state_.need_reset_params = false;
+}
+
+void PathTrace::init_render_buffers(const RenderWork &render_work)
+{
+ update_work_buffer_params_if_needed(render_work);
+
+ /* Handle initialization scheduled by the render scheduler. */
+ if (render_work.init_render_buffers) {
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->zero_render_buffers();
+ });
+
+ tile_buffer_read();
+ }
+}
+
+void PathTrace::path_trace(RenderWork &render_work)
+{
+ if (!render_work.path_trace.num_samples) {
+ return;
+ }
+
+ VLOG(3) << "Will path trace " << render_work.path_trace.num_samples
+ << " samples at the resolution divider " << render_work.resolution_divider;
+
+ const double start_time = time_dt();
+
+ const int num_works = path_trace_works_.size();
+
+ tbb::parallel_for(0, num_works, [&](int i) {
+ const double work_start_time = time_dt();
+ const int num_samples = render_work.path_trace.num_samples;
+
+ PathTraceWork *path_trace_work = path_trace_works_[i].get();
+
+ PathTraceWork::RenderStatistics statistics;
+ path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+ const double work_time = time_dt() - work_start_time;
+ work_balance_infos_[i].time_spent += work_time;
+ work_balance_infos_[i].occupancy = statistics.occupancy;
+
+ VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ << work_time / num_samples
+ << " seconds per sample), occupancy: " << statistics.occupancy;
+ });
+
+ float occupancy_accum = 0.0f;
+ for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+ occupancy_accum += balance_info.occupancy;
+ }
+ const float occupancy = occupancy_accum / num_works;
+ render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
+ render_scheduler_.report_path_trace_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+}
+
+void PathTrace::adaptive_sample(RenderWork &render_work)
+{
+ if (!render_work.adaptive_sampling.filter) {
+ return;
+ }
+
+ bool did_reschedule_on_idle = false;
+
+ while (true) {
+ VLOG(3) << "Will filter adaptive stopping buffer, threshold "
+ << render_work.adaptive_sampling.threshold;
+ if (render_work.adaptive_sampling.reset) {
+ VLOG(3) << "Will re-calculate convergency flag for currently converged pixels.";
+ }
+
+ const double start_time = time_dt();
+
+ uint num_active_pixels = 0;
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ const uint num_active_pixels_in_work =
+ path_trace_work->adaptive_sampling_converge_filter_count_active(
+ render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset);
+ if (num_active_pixels_in_work) {
+ atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work);
+ }
+ });
+
+ render_scheduler_.report_adaptive_filter_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+
+ if (num_active_pixels == 0) {
+ VLOG(3) << "All pixels converged.";
+ if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) {
+ break;
+ }
+ VLOG(3) << "Continuing with lower threshold.";
+ }
+ else if (did_reschedule_on_idle) {
+ break;
+ }
+ else if (num_active_pixels < 128 * 128) {
+ /* NOTE: The hardcoded value of 128^2 is more of an empirical value to keep GPU busy so that
+ * there is no performance loss from the progressive noise floor feature.
+ *
+ * A better heuristic is possible here: for example, use maximum of 128^2 and percentage of
+ * the final resolution. */
+ if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) {
+ VLOG(3) << "Rescheduling is not possible: final threshold is reached.";
+ break;
+ }
+ VLOG(3) << "Rescheduling lower threshold.";
+ did_reschedule_on_idle = true;
+ }
+ else {
+ break;
+ }
+ }
+}
+
+void PathTrace::set_denoiser_params(const DenoiseParams &params)
+{
+ render_scheduler_.set_denoiser_params(params);
+
+ if (!params.use) {
+ denoiser_.reset();
+ return;
+ }
+
+ if (denoiser_) {
+ const DenoiseParams old_denoiser_params = denoiser_->get_params();
+ if (old_denoiser_params.type == params.type) {
+ denoiser_->set_params(params);
+ return;
+ }
+ }
+
+ denoiser_ = Denoiser::create(device_, params);
+ denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
+}
+
+void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ render_scheduler_.set_adaptive_sampling(adaptive_sampling);
+}
+
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+ if (!render_work.cryptomatte.postprocess) {
+ return;
+ }
+ VLOG(3) << "Perform cryptomatte work.";
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->cryptomatte_postproces();
+ });
+}
+
+void PathTrace::denoise(const RenderWork &render_work)
+{
+ if (!render_work.tile.denoise) {
+ return;
+ }
+
+ if (!denoiser_) {
+ /* Denoiser was not configured, so nothing to do here. */
+ return;
+ }
+
+ VLOG(3) << "Perform denoising work.";
+
+ const double start_time = time_dt();
+
+ RenderBuffers *buffer_to_denoise = nullptr;
+
+ unique_ptr<RenderBuffers> multi_device_buffers;
+ bool allow_inplace_modification = false;
+
+ if (path_trace_works_.size() == 1) {
+ buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+ }
+ else {
+ Device *denoiser_device = denoiser_->get_denoiser_device();
+ if (!denoiser_device) {
+ return;
+ }
+
+ multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
+ multi_device_buffers->reset(render_state_.effective_big_tile_params);
+
+ buffer_to_denoise = multi_device_buffers.get();
+
+ copy_to_render_buffers(multi_device_buffers.get());
+
+ allow_inplace_modification = true;
+ }
+
+ if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
+ buffer_to_denoise,
+ get_num_samples_in_buffer(),
+ allow_inplace_modification)) {
+ render_state_.has_denoised_result = true;
+ }
+
+ if (multi_device_buffers) {
+ multi_device_buffers->copy_from_device();
+ tbb::parallel_for_each(
+ path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
+ });
+ }
+
+ render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+{
+ gpu_display_ = move(gpu_display);
+}
+
+void PathTrace::clear_gpu_display()
+{
+ if (gpu_display_) {
+ gpu_display_->clear();
+ }
+}
+
+void PathTrace::draw()
+{
+ if (!gpu_display_) {
+ return;
+ }
+
+ did_draw_after_reset_ |= gpu_display_->draw();
+}
+
+void PathTrace::update_display(const RenderWork &render_work)
+{
+ if (!render_work.display.update) {
+ return;
+ }
+
+ if (!gpu_display_ && !tile_buffer_update_cb) {
+ VLOG(3) << "Ignore display update.";
+ return;
+ }
+
+ if (full_params_.width == 0 || full_params_.height == 0) {
+ VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (tile_buffer_update_cb) {
+ VLOG(3) << "Invoke buffer update callback.";
+
+ tile_buffer_update_cb();
+ }
+
+ if (gpu_display_) {
+ VLOG(3) << "Perform copy to GPUDisplay work.";
+
+ const int resolution_divider = render_work.resolution_divider;
+ const int texture_width = max(1, full_params_.width / resolution_divider);
+ const int texture_height = max(1, full_params_.height / resolution_divider);
+ if (!gpu_display_->update_begin(texture_width, texture_height)) {
+ LOG(ERROR) << "Error beginning GPUDisplay update.";
+ return;
+ }
+
+ const PassMode pass_mode = render_work.display.use_denoised_result &&
+ render_state_.has_denoised_result ?
+ PassMode::DENOISED :
+ PassMode::NOISY;
+
+ /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
+ * all works in parallel. */
+ const int num_samples = get_num_samples_in_buffer();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+ }
+
+ gpu_display_->update_end();
+ }
+
+ render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::rebalance(const RenderWork &render_work)
+{
+ static const int kLogLevel = 3;
+
+ if (!render_work.rebalance) {
+ return;
+ }
+
+ const int num_works = path_trace_works_.size();
+
+ if (num_works == 1) {
+ VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Perform rebalance work.";
+ VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].time_spent;
+ }
+ }
+
+ const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Calculated per-device weights for works:";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].weight;
+ }
+ }
+
+ if (!did_rebalance) {
+ VLOG(kLogLevel) << "Balance in path trace works did not change.";
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false);
+ return;
+ }
+
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ render_state_.need_reset_params = true;
+ update_work_buffer_params_if_needed(render_work);
+
+ copy_from_render_buffers(&big_tile_cpu_buffers);
+
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true);
+}
+
+void PathTrace::write_tile_buffer(const RenderWork &render_work)
+{
+ if (!render_work.tile.write) {
+ return;
+ }
+
+ VLOG(3) << "Write tile result.";
+
+ render_state_.tile_written = true;
+
+ const bool has_multiple_tiles = tile_manager_.has_multiple_tiles();
+
+ /* Write the render tile result, but only if not using tiled rendering.
+ *
+ * Tiles are written to a file during rendering, and written to the software at the end
+ * of rendering (either when all tiles are finished, or when rendering was requested to be
+ * cancelled).
+ *
+ * The important thing is that a tile should be written to the software via the callback only
+ * once. */
+ if (!has_multiple_tiles) {
+ VLOG(3) << "Write tile result via buffer write callback.";
+ tile_buffer_write();
+ }
+
+ /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
+ */
+ if (has_multiple_tiles) {
+ VLOG(3) << "Write tile result into .";
+ tile_buffer_write_to_disk();
+ }
+}
+
+void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work)
+{
+ if (!render_work.full.write) {
+ return;
+ }
+
+ VLOG(3) << "Handle full-frame render buffer work.";
+
+ if (!tile_manager_.has_written_tiles()) {
+ VLOG(3) << "No tiles on disk.";
+ return;
+ }
+
+ /* Make sure writing to the file is fully finished.
+ * This will include writing all possibly missing tiles, ensuring the validity of the file. */
+ tile_manager_.finish_write_tiles();
+
+ /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after
+ * all scenes and layers are rendered by the Session (which happens after freeing Session memory,
+ * so that we never hold scene and full-frame buffer in memory at the same time). */
+}
+
+void PathTrace::cancel()
+{
+ thread_scoped_lock lock(render_cancel_.mutex);
+
+ render_cancel_.is_requested = true;
+
+ while (render_cancel_.is_rendering) {
+ render_cancel_.condition.wait(lock);
+ }
+
+ render_cancel_.is_requested = false;
+}
+
+int PathTrace::get_num_samples_in_buffer()
+{
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::is_cancel_requested()
+{
+ if (render_cancel_.is_requested) {
+ return true;
+ }
+
+ if (progress_ != nullptr) {
+ if (progress_->get_cancel()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void PathTrace::tile_buffer_write()
+{
+ if (!tile_buffer_write_cb) {
+ return;
+ }
+
+ tile_buffer_write_cb();
+}
+
+void PathTrace::tile_buffer_read()
+{
+ if (!tile_buffer_read_cb) {
+ return;
+ }
+
+ if (tile_buffer_read_cb()) {
+ tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_render_buffers_to_device();
+ });
+ }
+}
+
+void PathTrace::tile_buffer_write_to_disk()
+{
+ /* Sample count pass is required to support per-tile partial results stored in the file. */
+ DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED);
+
+ const int num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+
+ if (num_rendered_samples == 0) {
+ /* The tile has zero samples, no need to write it. */
+ return;
+ }
+
+ /* Get access to the CPU-side render buffers of the current big tile. */
+ RenderBuffers *buffers;
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+
+ if (path_trace_works_.size() == 1) {
+ path_trace_works_[0]->copy_render_buffers_from_device();
+ buffers = path_trace_works_[0]->get_render_buffers();
+ }
+ else {
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ buffers = &big_tile_cpu_buffers;
+ }
+
+ if (!tile_manager_.write_tile(*buffers)) {
+ LOG(ERROR) << "Error writing tile to file.";
+ }
+}
+
+void PathTrace::progress_update_if_needed(const RenderWork &render_work)
+{
+ if (progress_ != nullptr) {
+ const int2 tile_size = get_render_tile_size();
+ const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
+ const int current_sample = render_work.path_trace.start_sample +
+ render_work.path_trace.num_samples;
+ progress_->add_samples(num_samples_added, current_sample);
+ }
+
+ if (progress_update_cb) {
+ progress_update_cb();
+ }
+}
+
+void PathTrace::progress_set_status(const string &status, const string &substatus)
+{
+ if (progress_ != nullptr) {
+ progress_->set_status(status, substatus);
+ }
+}
+
+void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_to_render_buffers(render_buffers);
+ });
+ render_buffers->copy_to_device();
+}
+
+void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers)
+{
+ render_buffers->copy_from_device();
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_render_buffers(render_buffers);
+ });
+}
+
+bool PathTrace::copy_render_tile_from_device()
+{
+ if (full_frame_state_.render_buffers) {
+ /* Full-frame buffer is always allocated on CPU. */
+ return true;
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->copy_render_buffers_from_device()) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+static string get_layer_view_name(const RenderBuffers &buffers)
+{
+ string result;
+
+ if (buffers.params.layer.size()) {
+ result += string(buffers.params.layer);
+ }
+
+ if (buffers.params.view.size()) {
+ if (!result.empty()) {
+ result += ", ";
+ }
+ result += string(buffers.params.view);
+ }
+
+ return result;
+}
+
+void PathTrace::process_full_buffer_from_disk(string_view filename)
+{
+ VLOG(3) << "Processing full frame buffer file " << filename;
+
+ progress_set_status("Reading full buffer from disk");
+
+ RenderBuffers full_frame_buffers(cpu_device_.get());
+
+ DenoiseParams denoise_params;
+ if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
+ LOG(ERROR) << "Error reading tiles from file.";
+ return;
+ }
+
+ const string layer_view_name = get_layer_view_name(full_frame_buffers);
+
+ render_state_.has_denoised_result = false;
+
+ if (denoise_params.use) {
+ progress_set_status(layer_view_name, "Denoising");
+
+ /* Re-use the denoiser as much as possible, avoiding possible device re-initialization.
+ *
+ * It will not conflict with the regular rendering as:
+ * - Rendering is supposed to be finished here.
+ * - The next rendering will go via Session's `run_update_for_next_iteration` which will
+ * ensure proper denoiser is used. */
+ set_denoiser_params(denoise_params);
+
+ /* Number of samples doesn't matter too much, since the sample count pass will be used. */
+ denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false);
+
+ render_state_.has_denoised_result = true;
+ }
+
+ full_frame_state_.render_buffers = &full_frame_buffers;
+
+ progress_set_status(layer_view_name, "Finishing");
+
+ /* Write the full result pretending that there is a single tile.
+ * Requires some state change, but allows using the same communication API with the software. */
+ tile_buffer_write();
+
+ full_frame_state_.render_buffers = nullptr;
+}
+
+int PathTrace::get_num_render_tile_samples() const
+{
+ if (full_frame_state_.render_buffers) {
+ /* If the full-frame buffer is read from disk, the number of samples is not used as there is a
+ * sample count pass for that in the buffer. Just avoid accessing the badly defined state of the
+ * path tracer. */
+ return 0;
+ }
+
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ if (full_frame_state_.render_buffers) {
+ return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+int2 PathTrace::get_render_tile_size() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(full_frame_state_.render_buffers->params.width,
+ full_frame_state_.render_buffers->params.height);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.width, tile.height);
+}
+
+int2 PathTrace::get_render_tile_offset() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(0, 0);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.x, tile.y);
+}
+
+const BufferParams &PathTrace::get_render_tile_params() const
+{
+ if (full_frame_state_.render_buffers) {
+ return full_frame_state_.render_buffers->params;
+ }
+
+ return big_tile_params_;
+}
+
+bool PathTrace::has_denoised_result() const
+{
+ return render_state_.has_denoised_result;
+}
+
+/* --------------------------------------------------------------------
+ * Report generation.
+ */
+
+static const char *device_type_for_description(const DeviceType type)
+{
+ switch (type) {
+ case DEVICE_NONE:
+ return "None";
+
+ case DEVICE_CPU:
+ return "CPU";
+ case DEVICE_CUDA:
+ return "CUDA";
+ case DEVICE_OPTIX:
+ return "OptiX";
+ case DEVICE_DUMMY:
+ return "Dummy";
+ case DEVICE_MULTI:
+ return "Multi";
+ }
+
+ return "UNKNOWN";
+}
+
+/* Construct a description of the device which will appear in the full report. */
+/* TODO(sergey): Consider making this a more reusable utility. */
+static string full_device_info_description(const DeviceInfo &device_info)
+{
+ string full_description = device_info.description;
+
+ full_description += " (" + string(device_type_for_description(device_info.type)) + ")";
+
+ if (device_info.display_device) {
+ full_description += " (display)";
+ }
+
+ if (device_info.type == DEVICE_CPU) {
+ full_description += " (" + to_string(device_info.cpu_threads) + " threads)";
+ }
+
+ full_description += " [" + device_info.id + "]";
+
+ return full_description;
+}
+
+/* Construct a string which will contain information about the devices, possibly multiple devices.
+ *
+ * In the simple case the result looks like:
+ *
+ * Message: Full Device Description
+ *
+ * If there are multiple devices then the result looks like:
+ *
+ * Message: Full First Device Description
+ * Full Second Device Description
+ *
+ * Note that the newlines are placed in a way so that the result can be easily concatenated to the
+ * full report. */
+static string device_info_list_report(const string &message, const DeviceInfo &device_info)
+{
+ string result = "\n" + message + ": ";
+ const string pad(message.length() + 2, ' ');
+
+ if (device_info.multi_devices.empty()) {
+ result += full_device_info_description(device_info) + "\n";
+ return result;
+ }
+
+ bool is_first = true;
+ for (const DeviceInfo &sub_device_info : device_info.multi_devices) {
+ if (!is_first) {
+ result += pad;
+ }
+
+ result += full_device_info_description(sub_device_info) + "\n";
+
+ is_first = false;
+ }
+
+ return result;
+}
+
+static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works)
+{
+ DeviceInfo device_info;
+ device_info.type = DEVICE_MULTI;
+
+ for (auto &&path_trace_work : path_trace_works) {
+ device_info.multi_devices.push_back(path_trace_work->get_device()->info);
+ }
+
+ return device_info_list_report("Path tracing on", device_info);
+}
+
+static string denoiser_device_report(const Denoiser *denoiser)
+{
+ if (!denoiser) {
+ return "";
+ }
+
+ if (!denoiser->get_params().use) {
+ return "";
+ }
+
+ const Device *denoiser_device = denoiser->get_denoiser_device();
+ if (!denoiser_device) {
+ return "";
+ }
+
+ return device_info_list_report("Denoising on", denoiser_device->info);
+}
+
+string PathTrace::full_report() const
+{
+ string result = "\nFull path tracing report\n";
+
+ result += path_trace_devices_report(path_trace_works_);
+ result += denoiser_device_report(denoiser_.get());
+
+ /* Report from the render scheduler, which includes:
+ * - Render mode (interactive, offline, headless)
+ * - Adaptive sampling and denoiser parameters
+ * - Breakdown of timing. */
+ result += render_scheduler_.full_report();
+
+ return result;
+}
+
+CCL_NAMESPACE_END
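
The multi-device slicing in foreach_sliced_buffer_params() above is easiest to see with concrete numbers. Below is a small standalone sketch of the same arithmetic (the weights and the tile height are made-up example values): each work gets max(lround(height * weight), 1) scanlines, clamped to what remains, and the last work absorbs the remainder so the whole big tile is covered.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  const int height = 100;                           /* Big tile height (example). */
  const std::vector<double> weights = {0.75, 0.25}; /* Hypothetical balance weights. */

  int current_y = 0;
  for (size_t i = 0; i < weights.size(); ++i) {
    const int remaining_height = std::max(0, height - current_y);
    const int slice_height = std::max(static_cast<int>(std::lround(height * weights[i])), 1);

    /* All works but the last are clamped; the last one takes whatever is left. */
    const int work_height = (i + 1 < weights.size()) ? std::min(slice_height, remaining_height) :
                                                       remaining_height;

    std::printf("work %zu: starts at scanline %d, renders %d scanlines\n",
                i, current_y, work_height);
    current_y += work_height;
  }
  return 0;
}

With these example weights, work 0 gets scanlines 0..74 (75 rows) and work 1 gets scanlines 75..99 (25 rows).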
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
new file mode 100644
index 00000000000..78ca68c1198
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/path_trace_work.h"
+#include "integrator/work_balancer.h"
+#include "render/buffers.h"
+#include "util/util_function.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling;
+class Device;
+class DeviceScene;
+class Film;
+class RenderBuffers;
+class RenderScheduler;
+class RenderWork;
+class Progress;
+class GPUDisplay;
+class TileManager;
+
+/* The PathTrace class takes care of the kernel graph and scheduling on a (multi)device. It takes
+ * care of all the common steps of path tracing which are not device-specific. The list of tasks
+ * includes but is not limited to:
+ * - Kernel graph.
+ * - Scheduling logic.
+ * - Queue management.
+ * - Adaptive stopping. */
+class PathTrace {
+ public:
+ /* Render scheduler is used to report timing information and access things like start/finish
+ * sample. */
+ PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager);
+ ~PathTrace();
+
+ /* Create devices and load kernels which are created on-demand (for example, denoising devices).
+ * The progress is reported to the currently configured progress object (via `set_progress`). */
+ void load_kernels();
+
+ /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+ * more accurately which scene device memory may need to be allocated on the host. */
+ void alloc_work_memory();
+
+ /* Check whether now is a good time to reset rendering.
+ * Used to avoid overly frequent resets in the viewport, giving it a chance to draw an
+ * intermediate render result. */
+ bool ready_to_reset();
+
+ void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+
+ void device_free();
+
+ /* Set progress tracker.
+ * Used to communicate details about the progress to the outer world, and to check whether
+ * rendering is to be canceled.
+ *
+ * The path tracer writes to this object, and then at a convenient moment runs the
+ * progress_update_cb() callback. */
+ void set_progress(Progress *progress);
+
+ /* NOTE: This is a blocking call. Meaning, it will not return until the given number of samples
+ * is rendered (or until rendering is requested to be cancelled). */
+ void render(const RenderWork &render_work);
+
+ /* TODO(sergey): Decide whether the denoiser is really a part of the path tracer. Currently it is
+ * convenient to have it here because then it is easy to access the render buffer. But the
+ * downside is that this adds too many entities which could live separately with some clear API. */
+
+ /* Set denoiser parameters.
+ * Use this to configure the denoiser before rendering any samples. */
+ void set_denoiser_params(const DenoiseParams &params);
+
+ /* Set parameters used for adaptive sampling.
+ * Use this to configure the adaptive sampler before rendering any samples. */
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ /* Set GPU display which takes care of drawing the render result. */
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+
+ /* Clear the GPU display by filling it in with all zeroes. */
+ void clear_gpu_display();
+
+ /* Perform drawing of the current state of the GPUDisplay. */
+ void draw();
+
+ /* Cancel the rendering process as soon as possible, without waiting for the full tile to be
+ * sampled. Used in cases like a reset of the render session.
+ *
+ * This is a blocking call, which returns as soon as there is no running `render_samples()` call.
+ */
+ void cancel();
+
+ /* Copy an entire render buffer to/from the path trace. */
+
+ /* Copy happens via CPU side buffer: data will be copied from every device of the path trace, and
+ * the data will be copied to the device of the given render buffers. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy happens via CPU side buffer: data will be copied from the device of the given render
+ * buffers and will be copied to all devices of the path trace. */
+ void copy_from_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy render buffers of the big tile from the device to the host.
+ * Return true if all copies are successful. */
+ bool copy_render_tile_from_device();
+
+ /* Read given full-frame file from disk, perform needed processing and write it to the software
+ * via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
+
+ /* Get number of samples in the current big tile render buffers. */
+ int get_num_render_tile_samples() const;
+
+ /* Get pass data of the entire big tile.
+ * This call puts pass render result from all devices into the final pixels storage.
+ *
+ * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`.
+ *
+ * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Check whether denoiser was run and denoised passes are available. */
+ bool has_denoised_result() const;
+
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile.
+ * In the case of tiled rendering this will return the full frame after all tiles have been
+ * rendered.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
+
+ /* Get buffer parameters of the current tile.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ const BufferParams &get_render_tile_params() const;
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ /* Callback which communicates an updated state of the render buffer of the current big tile.
+ * Is called during path tracing to communicate the work-in-progress state of the final buffer. */
+ function<void(void)> tile_buffer_update_cb;
+
+ /* Callback which communicates the final rendered buffer. Is called after path tracing is done. */
+ function<void(void)> tile_buffer_write_cb;
+
+ /* Callback which initializes the render buffer. Is called before path tracing starts.
+ *
+ * This is used for baking. */
+ function<bool(void)> tile_buffer_read_cb;
+
+ /* Callback which is called to report current rendering progress.
+ *
+ * It is supposed to be cheaper than buffer update/write, hence can be called more often.
+ * Additionally, it might be called from the middle of a wavefront (meaning, it is not guaranteed
+ * that the buffer is "uniformly" sampled at the moment of this callback). */
+ function<void(void)> progress_update_cb;
+
+ protected:
+ /* Actual implementation of the rendering pipeline.
+ * Calls the steps in order, checking whether a cancel has been requested in between.
+ *
+ * Is separate from `render()` to simplify dealing with the early outputs and keeping
+ * `render_cancel_` in a consistent state. */
+ void render_pipeline(RenderWork render_work);
+
+ /* Initialize kernel execution on all integrator queues. */
+ void render_init_kernel_execution();
+
+ /* Make sure both allocated and effective buffer parameters of path tracer works are up to date
+ * with the current big tile parameters, performance-dependent slicing, and resolution divider.
+ */
+ void update_work_buffer_params_if_needed(const RenderWork &render_work);
+ void update_allocated_work_buffer_params();
+ void update_effective_work_buffer_params(const RenderWork &render_work);
+
+ /* Perform various steps of the render work.
+ *
+ * Note that some steps might modify the work, forcing some steps to happen within this iteration
+ * of rendering. */
+ void init_render_buffers(const RenderWork &render_work);
+ void path_trace(RenderWork &render_work);
+ void adaptive_sample(RenderWork &render_work);
+ void denoise(const RenderWork &render_work);
+ void cryptomatte_postprocess(const RenderWork &render_work);
+ void update_display(const RenderWork &render_work);
+ void rebalance(const RenderWork &render_work);
+ void write_tile_buffer(const RenderWork &render_work);
+ void finalize_full_buffer_on_disk(const RenderWork &render_work);
+
+ /* Get number of samples in the current state of the render buffers. */
+ int get_num_samples_in_buffer();
+
+ /* Check whether user requested to cancel rendering, so that path tracing is to be finished as
+ * soon as possible. */
+ bool is_cancel_requested();
+
+ /* Write the big tile render buffer via the write callback. */
+ void tile_buffer_write();
+
+ /* Read the big tile render buffer via the read callback. */
+ void tile_buffer_read();
+
+ /* Write current tile into the file on disk. */
+ void tile_buffer_write_to_disk();
+
+ /* Run the progress_update_cb callback if it is needed. */
+ void progress_update_if_needed(const RenderWork &render_work);
+
+ void progress_set_status(const string &status, const string &substatus = "");
+
+ /* Pointer to a device which is configured to be used for path tracing. If multiple devices
+ * are configured this is a `MultiDevice`. */
+ Device *device_ = nullptr;
+
+ /* CPU device for creating temporary render buffers on the CPU side. */
+ unique_ptr<Device> cpu_device_;
+
+ DeviceScene *device_scene_;
+
+ RenderScheduler &render_scheduler_;
+ TileManager &tile_manager_;
+
+ unique_ptr<GPUDisplay> gpu_display_;
+
+ /* Per-compute device descriptors of work which is responsible for path tracing on its configured
+ * device. */
+ vector<unique_ptr<PathTraceWork>> path_trace_works_;
+
+ /* Per-path trace work information needed for multi-device balancing. */
+ vector<WorkBalanceInfo> work_balance_infos_;
+
+ /* Render buffer parameters of the full frame and current big tile. */
+ BufferParams full_params_;
+ BufferParams big_tile_params_;
+
+ /* Denoiser which takes care of denoising the big tile. */
+ unique_ptr<Denoiser> denoiser_;
+
+ /* State which is common for all the steps of the render work.
+ * Is brought up to date in the `render()` call and is accessed from all the steps involved in
+ * rendering the work. */
+ struct {
+ /* Denotes whether render buffers parameters of path trace works are to be reset for the new
+ * value of the big tile parameters. */
+ bool need_reset_params = false;
+
+ /* Divider of the resolution for faster previews.
+ *
+ * Allows re-using the same render buffer, but with fewer pixels rendered into it. The way to
+ * think of the render buffer in this case is as an over-allocated array: the resolution divider
+ * affects both resolution and stride as seen by the integrator kernels. */
+ int resolution_divider = 0;
+
+ /* Parameters of the big tile with the current resolution divider applied. */
+ BufferParams effective_big_tile_params;
+
+ /* The denoiser was run and there are denoised versions of the passes in the render buffers. */
+ bool has_denoised_result = false;
+
+ /* The current tile has been written (to either disk or via the callback).
+ * Indicates that no more work will be done on this tile. */
+ bool tile_written = false;
+ } render_state_;
+
+ /* Progress object which is used to communicate sample progress. */
+ Progress *progress_;
+
+ /* Fields required for canceling render on demand, as quickly as possible. */
+ struct {
+ /* Indicates whether there is an on-going `render_samples()` call. */
+ bool is_rendering = false;
+
+ /* Indicates whether rendering is requested to be canceled by `cancel()`. */
+ bool is_requested = false;
+
+ /* Synchronization between thread which does `render_samples()` and thread which does
+ * `cancel()`. */
+ thread_mutex mutex;
+ thread_condition_variable condition;
+ } render_cancel_;
+
+ /* Indicates whether a render result was drawn after the latest session reset.
+ * Used by `ready_to_reset()` to implement logic which feels the most interactive. */
+ bool did_draw_after_reset_ = true;
+
+ /* State of the full frame processing and writing to the software. */
+ struct {
+ RenderBuffers *render_buffers = nullptr;
+ } full_frame_state_;
+};
+
+CCL_NAMESPACE_END
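
To make the callback-driven flow of the header above concrete, here is a minimal sketch of how a host session might wire a PathTrace instance. Only the PathTrace calls come from the header; the host_* helpers and the surrounding variables (device, film, render_work, and so on) are hypothetical placeholders provided by the embedding application.

/* Sketch only: host_* helpers and the construction of device, film, etc. are
 * placeholders; they are not part of this patch. */
PathTrace path_trace(device, film, device_scene, render_scheduler, tile_manager);

path_trace.set_progress(&progress);

/* Work-in-progress result of the current big tile. */
path_trace.tile_buffer_update_cb = [&]() { host_update_render_result(); };

/* Final result, called once per tile after path tracing is done. */
path_trace.tile_buffer_write_cb = [&]() { host_write_render_result(); };

/* Cheap progress notification, may be called mid-wavefront. */
path_trace.progress_update_cb = [&]() { host_refresh_progress(); };

path_trace.reset(full_buffer_params, big_tile_buffer_params);
path_trace.render(render_work); /* Blocking: returns once the requested samples are rendered. */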
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
new file mode 100644
index 00000000000..d9634acac10
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/path_trace_work_cpu.h"
+#include "integrator/path_trace_work_gpu.h"
+#include "render/buffers.h"
+#include "render/film.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<PathTraceWork> PathTraceWork::create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+{
+ if (device->info.type == DEVICE_CPU) {
+ return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag);
+ }
+
+ return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag);
+}
+
+PathTraceWork::PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : device_(device),
+ film_(film),
+ device_scene_(device_scene),
+ buffers_(make_unique<RenderBuffers>(device)),
+ effective_buffer_params_(buffers_->params),
+ cancel_requested_flag_(cancel_requested_flag)
+{
+}
+
+PathTraceWork::~PathTraceWork()
+{
+}
+
+RenderBuffers *PathTraceWork::get_render_buffers()
+{
+ return buffers_.get();
+}
+
+void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params)
+{
+ effective_full_params_ = effective_full_params;
+ effective_big_tile_params_ = effective_big_tile_params;
+ effective_buffer_params_ = effective_buffer_params;
+}
+
+bool PathTraceWork::has_multiple_works() const
+{
+ /* Assume that if there are multiple works working on the same big tile, none of them gets
+ * the entire big tile to work on. */
+ return !(effective_big_tile_params_.width == effective_buffer_params_.width &&
+ effective_big_tile_params_.height == effective_buffer_params_.height &&
+ effective_big_tile_params_.full_x == effective_buffer_params_.full_x &&
+ effective_big_tile_params_.full_y == effective_buffer_params_.full_y);
+}
+
+void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ copy_render_buffers_from_device();
+
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
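+
+ /* Illustrative example (hypothetical numbers): if the big tile is 1920 pixels wide and this
+ * work covers its lower half (full_y = 540 relative to the big tile's full_y = 0), then
+ * offset_y = 540 and offset_in_floats points at row 540 of the big tile in the destination. */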
+
+ const float *src = buffers_->buffer.data();
+ float *dst = render_buffers->buffer.data() + offset_in_floats;
+
+ memcpy(dst, src, data_size);
+}
+
+void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = render_buffers->buffer.data() + offset_in_floats;
+ float *dst = buffers_->buffer.data();
+
+ memcpy(dst, src, data_size);
+
+ copy_render_buffers_to_device();
+}
+
+void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset = offset_y * width;
+
+ render_buffers_host_copy_denoised(
+ buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
+
+ copy_render_buffers_to_device();
+}
+
+bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Destination slice_destination = destination;
+ slice_destination.offset += offset_y * width;
+
+ return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
+}
+
+bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Source slice_source = source;
+ slice_source.offset += offset_y * width;
+
+ return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source);
+}
+
+PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+ const KernelBackground &kbackground = device_scene_->data.background;
+
+ const BufferParams &params = buffers_->params;
+
+ const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass());
+
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = display_pass->type;
+ pass_access_info.offset = PASS_UNUSED;
+
+ if (pass_mode == PassMode::DENOISED) {
+ pass_access_info.mode = PassMode::DENOISED;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED);
+ }
+
+ if (pass_access_info.offset == PASS_UNUSED) {
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type);
+ }
+
+ pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ kfilm.use_approximate_shadow_catcher && !kbackground.transparent;
+
+ return pass_access_info;
+}
+
+PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const
+{
+ PassAccessor::Destination destination(film_->get_display_pass());
+
+ const int2 display_texture_size = gpu_display->get_texture_size();
+ const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
+ const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
+
+ destination.offset = texture_y * display_texture_size.x + texture_x;
+ destination.stride = display_texture_size.x;
+
+ return destination;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
new file mode 100644
index 00000000000..97b97f3d888
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "render/buffers.h"
+#include "render/pass.h"
+#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class DeviceScene;
+class Film;
+class GPUDisplay;
+class RenderBuffers;
+
+class PathTraceWork {
+ public:
+ struct RenderStatistics {
+ float occupancy = 1.0f;
+ };
+
+ /* Create the path trace work which best fits the device.
+ *
+ * The cancel request flag is used for a cheap check of whether cancellation is to be performed
+ * as soon as possible. This could be, for example, a request to cancel rendering on camera
+ * navigation in the viewport. */
+ static unique_ptr<PathTraceWork> create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual ~PathTraceWork();
+
+ /* Access the render buffers.
+ *
+ * Only supposed to be used by the PathTrace to update buffer allocation and slicing so that
+ * they correspond to the big tile size and relative device performance. */
+ RenderBuffers *get_render_buffers();
+
+ /* Set effective parameters of the big tile and the work itself. */
+ void set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params);
+
+ /* Check whether the big tile is being worked on by multiple path trace works. */
+ bool has_multiple_works() const;
+
+ /* Allocate working memory for execution. Must be called before init_execution(). */
+ virtual void alloc_work_memory(){};
+
+ /* Initialize execution of kernels.
+ * Will ensure that all device queues are initialized for execution.
+ *
+ * This method is to be called after any change in the scene. It is not needed prior to every
+ * call of `render_samples()`. */
+ virtual void init_execution() = 0;
+
+ /* Render given number of samples as a synchronous blocking call.
+ * The samples are added to the render buffer associated with this work. */
+ virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+
+ /* Copy render result from this work to the corresponding place of the GPU display.
+ *
+ * The `pass_mode` indicates whether to access the denoised or the noisy version of the display
+ * pass. The noisy pass mode is passed here when it is known that the buffer does not have
+ * denoised passes yet (because the denoiser did not run). If the denoised pass is requested and
+ * the denoiser is not used, this function falls back to the noisy pass instead. */
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) = 0;
+
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+
+ /* Copy data from/to the given render buffers.
+ * Copies pixels from the corresponding place (from a multi-device point of view) of the render
+ * buffers, and copies the work's render buffers to the corresponding place of the destination. */
+
+ /* Notes:
+ * - Copies the work's render buffer from its device.
+ * - Copies into the CPU-side buffer of the given render buffers.
+ * - Does not copy the given buffer to its device. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Notes:
+ * - Does not copy the given render buffers from the device.
+ * - Copies the work's render buffer to its device. */
+ void copy_from_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Special version of `copy_from_render_buffers()` which only copies the denoised passes from
+ * the given render buffers, leaving the rest of the passes untouched.
+ *
+ * The same notes about device copying apply to this call as well. */
+ void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Copy render buffers to/from device using an appropriate device queue when needed so that
+ * things are executed in order with the `render_samples()`. */
+ virtual bool copy_render_buffers_from_device() = 0;
+ virtual bool copy_render_buffers_to_device() = 0;
+
+ /* Zero render buffers to/from device using an appropriate device queue when needed so that
+ * things are executed in order with the `render_samples()`. */
+ virtual bool zero_render_buffers() = 0;
+
+ /* Access pixels rendered by this work and copy them to the corresponding location in the
+ * destination.
+ *
+ * NOTE: Does not perform copy of buffers from the device. Use `copy_render_tile_from_device()`
+ * to update host-side data. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Perform convergence test on the render buffer, and filter the convergence mask.
+ * Returns number of active pixels (the ones which did not converge yet). */
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
+
+ /* Run cryptomatte pass post-processing kernels. */
+ virtual void cryptomatte_postproces() = 0;
+
+ /* Cheap-ish check of whether cancellation is requested and rendering is to be stopped as soon
+ * as possible, without waiting for any samples to be finished. */
+ inline bool is_cancel_requested() const
+ {
+ /* NOTE: Relies on the fact that on an x86 CPU a scalar can be read without atomics even in a
+ * threaded environment. */
+ return *cancel_requested_flag_;
+ }
+
+ /* Access the device which is used to path trace this work. */
+ Device *get_device() const
+ {
+ return device_;
+ }
+
+ protected:
+ PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const;
+
+ /* Get a destination whose offset and stride are configured so that writing to it writes to the
+ * proper location of the GPU display texture, taking the current tile and device slice into
+ * account. */
+ PassAccessor::Destination get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const;
+
+ /* Device which will be used for path tracing.
+ * Note that it is an actual render device (never a multi-device). */
+ Device *device_;
+
+ /* Film is used to access display pass configuration for GPU display update.
+ * Note that only fields which are not a part of kernel data can be accessed via the Film. */
+ Film *film_;
+
+ /* Device-side scene storage that may be used by the integrator logic. */
+ DeviceScene *device_scene_;
+
+ /* Render buffers into which samples are accumulated, allocated for the fraction of the big
+ * tile which is rendered by this work.
+ * This also defines the subset of the big tile used in the case of multi-device rendering. */
+ unique_ptr<RenderBuffers> buffers_;
+
+ /* Effective parameters of the full frame, the big tile, and the current work's render buffer.
+ * The latter might be different from `buffers_->params` when there is a resolution divider
+ * involved. */
+ BufferParams effective_full_params_;
+ BufferParams effective_big_tile_params_;
+ BufferParams effective_buffer_params_;
+
+ bool *cancel_requested_flag_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+ /* TODO: limit this to number of threads of CPU device, it may be smaller than
+ * the system number of threads when we reduce the number of CPU threads in
+ * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
+ return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ DCHECK_GE(thread_index, 0);
+ DCHECK_LE(thread_index, kernel_thread_globals.size());
+
+ return &kernel_thread_globals[thread_index];
+}
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ kernels_(*(device->get_cpu_kernels()))
+{
+ DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+ /* Cache per-thread kernel globals. */
+ device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
+}
+
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ const int64_t image_width = effective_buffer_params_.width;
+ const int64_t image_height = effective_buffer_params_.height;
+ const int64_t total_pixels_num = image_width * image_height;
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.start_profiling();
+ }
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
+ if (is_cancel_requested()) {
+ return;
+ }
+
+ const int y = work_index / image_width;
+ const int x = work_index - y * image_width;
+
+ KernelWorkTile work_tile;
+ work_tile.x = effective_buffer_params_.full_x + x;
+ work_tile.y = effective_buffer_params_.full_y + y;
+ work_tile.w = 1;
+ work_tile.h = 1;
+ work_tile.start_sample = start_sample;
+ work_tile.num_samples = 1;
+ work_tile.offset = effective_buffer_params_.offset;
+ work_tile.stride = effective_buffer_params_.stride;
+
+ CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+
+ render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
+ });
+ });
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.stop_profiling();
+ }
+
+ statistics.occupancy = 1.0f;
+}
+
+void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num)
+{
+ const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
+ const bool has_bake = device_scene_->data.bake.use;
+
+ IntegratorStateCPU integrator_states[2] = {};
+
+ IntegratorStateCPU *state = &integrator_states[0];
+ IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+
+ KernelWorkTile sample_work_tile = work_tile;
+ float *render_buffer = buffers_->buffer.data();
+
+ for (int sample = 0; sample < samples_num; ++sample) {
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ if (has_bake) {
+ if (!kernels_.integrator_init_from_bake(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+ else {
+ if (!kernels_.integrator_init_from_camera(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+
+ kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
+
+ if (has_shadow_catcher) {
+ kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
+ }
+
+ ++sample_work_tile.start_sample;
+ }
+}
+
+void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ half4 *rgba_half = gpu_display->map_texture_buffer();
+ if (!rgba_half) {
+ /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for
+ * some implementations of GPUDisplay which can not map memory? */
+ return;
+ }
+
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+ const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.pixels_half_rgba = rgba_half;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+ });
+
+ gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+ return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+ buffers_->buffer.copy_to_device();
+ return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+ buffers_->zero();
+ return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int offset = effective_buffer_params_.offset;
+ const int stride = effective_buffer_params_.stride;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ uint num_active_pixels = 0;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_y, full_y + height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+ bool row_converged = true;
+ uint num_row_pixels_active = 0;
+ for (int x = 0; x < width; ++x) {
+ if (!kernels_.adaptive_sampling_convergence_check(
+ kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+ ++num_row_pixels_active;
+ row_converged = false;
+ }
+ }
+
+ atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+ if (!row_converged) {
+ kernels_.adaptive_sampling_filter_x(
+ kernel_globals, render_buffer, y, full_x, width, offset, stride);
+ }
+ });
+ });
+
+ if (num_active_pixels) {
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_x, full_x + width, [&](int x) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ kernels_.adaptive_sampling_filter_y(
+ kernel_globals, render_buffer, x, full_y, height, offset, stride);
+ });
+ });
+ }
+
+ return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Run the post-process kernel for each pixel, one row per task, to reduce threading overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(0, height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ int pixel_index = y * width;
+
+ for (int x = 0; x < width; ++x, ++pixel_index) {
+ kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+ }
+ });
+ });
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
new file mode 100644
index 00000000000..ab729bbf879
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/cpu/kernel_thread_globals.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+struct KernelGlobals;
+
+class CPUKernels;
+
+/* Implementation of PathTraceWork which schedules work onto queues pixel-by-pixel,
+ * for CPU devices.
+ *
+ * NOTE: For CPU rendering there are assumptions about the TBB arena size and the number of
+ * concurrent queues on the render device which make this work only usable on the CPU. */
+class PathTraceWorkCPU : public PathTraceWork {
+ public:
+ PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ /* Core path tracing routine. Renders the given work tile using the given kernel globals. */
+ void render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num);
+
+ /* CPU kernels. */
+ const CPUKernels &kernels_;
+
+ /* Copy of kernel globals which is suitable for concurrent access from multiple threads.
+ *
+ * More specifically, each entry is local to one thread and no other thread accesses it, but
+ * some "localization" is required to decouple from the kernel globals stored on the device
+ * level. */
+ vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..10baf869aa6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ queue_(device->gpu_queue_create()),
+ integrator_state_soa_kernel_features_(0),
+ integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+ integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+ integrator_shader_raytrace_sort_counter_(
+ device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_next_shadow_catcher_path_index_(
+ device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+ queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+ num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+ work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+ gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+ max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+ min_num_active_paths_(queue_->num_concurrent_busy_states()),
+ max_active_path_index_(0)
+{
+ memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+ /* Limit the number of active paths to half of the overall number of states. This is due to the
+ * logic in the path compaction which relies on the fact that regeneration does not happen
+ * sooner than half of the states are available again. */
+ min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
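+ /* Illustrative example (hypothetical numbers): if the queue reports 600K concurrent busy
+ * states but only 1M states fit in memory, min_num_active_paths_ is clamped to 500K, so new
+ * work is only scheduled once at least half of the states are free again. */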
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+ /* IntegrateState allocated as structure of arrays. */
+
+ /* Check if we already allocated memory for the required features. */
+ const uint kernel_features = device_scene_->data.kernel_features;
+ if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+ return;
+ }
+ integrator_state_soa_kernel_features_ = kernel_features;
+
+ /* Allocate a device-only memory buffer for each struct member, and then write the pointers
+ * into a struct that resides in constant memory.
+ *
+ * TODO: store float3 in separate XYZ arrays. */
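+ /*
+ * For reference, a hypothetical member declared as
+ * KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+ * would expand to roughly the following (names purely illustrative):
+ *
+ * if ((kernel_features & KERNEL_FEATURE_PATH_TRACING) &&
+ * (integrator_state_gpu_.path.flag == nullptr)) {
+ * device_only_memory<uint32_t> *array = new device_only_memory<uint32_t>(
+ * device_, "integrator_state_flag");
+ * array->alloc_to_device(max_num_paths_);
+ * integrator_state_soa_.emplace_back(array);
+ * integrator_state_gpu_.path.flag = (uint32_t *)array->device_pointer;
+ * }
+ */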
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && \
+ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_END(name) \
+ break; \
+ }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ if (array_index == array_size - 1) { \
+ break; \
+ } \
+ }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+ if (integrator_queue_counter_.size() == 0) {
+ integrator_queue_counter_.alloc(1);
+ integrator_queue_counter_.zero_to_device();
+ integrator_queue_counter_.copy_from_device();
+ integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+ integrator_queue_counter_.device_pointer;
+ }
+
+ /* Allocate data for active path index arrays. */
+ if (num_queued_paths_.size() == 0) {
+ num_queued_paths_.alloc(1);
+ num_queued_paths_.zero_to_device();
+ }
+
+ if (queued_paths_.size() == 0) {
+ queued_paths_.alloc(max_num_paths_);
+ /* TODO: this could be skipped if we had a function to just allocate on the device. */
+ queued_paths_.zero_to_device();
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+ /* Allocate arrays for shader sorting. */
+ const int max_shaders = device_scene_->data.max_shaders;
+ if (integrator_shader_sort_counter_.size() < max_shaders) {
+ integrator_shader_sort_counter_.alloc(max_shaders);
+ integrator_shader_sort_counter_.zero_to_device();
+
+ integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+ (int *)integrator_shader_sort_counter_.device_pointer;
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+ (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+ if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+ return;
+ }
+
+ integrator_next_shadow_catcher_path_index_.alloc(1);
+ /* TODO(sergey): Use queue? */
+ integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+ integrator_state_gpu_.next_shadow_catcher_path_index =
+ (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+ alloc_integrator_soa();
+ alloc_integrator_queue();
+ alloc_integrator_sorting();
+ alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+ queue_->init_execution();
+
+ /* Copy to device side struct in constant memory. */
+ device_->const_copy_to(
+ "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ /* Limit the number of states per tile and rely on a greedy scheduling of tiles. This allows
+ * adding more work (because tiles are smaller, so there is a higher chance that more paths will
+ * become busy after adding new tiles). This is especially important for the shadow catcher,
+ * which schedules work in halves of the available number of paths. */
+ work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
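+ /* Illustrative example (hypothetical numbers): with max_num_paths_ = 1M, each scheduled tile
+ * covers at most 128K pixel-samples, so at least eight tiles are needed to fill the state pool
+ * and new tiles can be injected as states free up. */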
+
+ work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+ enqueue_reset();
+
+ int num_iterations = 0;
+ uint64_t num_busy_accum = 0;
+
+ /* TODO: set a hard limit in case of undetected kernel failures? */
+ while (true) {
+ /* Enqueue work from the scheduler, on start or when there are not enough
+ * paths to keep the device occupied. */
+ bool finished;
+ if (enqueue_work_tiles(finished)) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ /* Stop if no more work remaining. */
+ if (finished) {
+ break;
+ }
+
+ /* Enqueue one of the path iteration kernels. */
+ if (enqueue_path_iteration()) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ num_busy_accum += get_num_active_paths();
+ ++num_iterations;
+ }
+
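+ /* Occupancy is the average fraction of path states that were busy per iteration of the loop
+ * above. */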
+ statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int max_num_queued = 0;
+ DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ if (queue_counter->num_queued[i] > max_num_queued) {
+ kernel = (DeviceKernel)i;
+ max_num_queued = queue_counter->num_queued[i];
+ }
+ }
+
+ return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+ void *args[] = {&max_num_paths_};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+ queue_->zero_to_device(integrator_queue_counter_);
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+ /* Tile enqueueing needs to know the number of active paths, which is based on this counter.
+ * Zero the counter on the host side because `zero_to_device()` does not do it. */
+ if (integrator_queue_counter_.host_pointer) {
+ memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+ }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+ /* Count the number of queued paths across all kernels. */
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_active_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ num_active_paths += queue_counter->num_queued[i];
+ }
+
+ if (num_active_paths == 0) {
+ return false;
+ }
+
+ /* Find kernel to execute, with max number of queued paths. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel == DEVICE_KERNEL_NUM) {
+ return false;
+ }
+
+ /* Finish shadows before potentially adding more shadow rays. We can only
+ * store one shadow ray in the integrator state. */
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) {
+ if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return true;
+ }
+ else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return true;
+ }
+ }
+
+ /* Schedule kernel with maximum number of queued items. */
+ enqueue_path_iteration(kernel);
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+{
+ void *d_path_index = (void *)NULL;
+
+ /* Create array of path indices for which this kernel is queued to be executed. */
+ int work_size = max_active_path_index_;
+
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+ int num_queued = queue_counter->num_queued[kernel];
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ /* Compute array of active paths, sorted by shader. */
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ }
+ else if (num_queued < work_size) {
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Compute array of active shadow paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
+ }
+ else {
+ /* Compute array of active paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
+ }
+ }
+
+ DCHECK_LE(work_size, max_num_paths_);
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+ /* Ray intersection kernels with integrator state. */
+ void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+ /* Shading kernels with integrator state and render buffer. */
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+ void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+
+ default:
+ LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
+ << " used for path iteration, should never happen.";
+ break;
+ }
+}
+
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+ void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+ assert(d_counter != nullptr);
+
+ /* Compute prefix sum of number of active paths with each shader. */
+ {
+ const int work_size = 1;
+ int max_shaders = device_scene_->data.max_shaders;
+ void *args[] = {&d_counter, &max_shaders};
+ queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
+ }
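+ /* Illustrative example (hypothetical counts): roughly speaking, an exclusive prefix sum turns
+ * per-shader counts such as [3, 1, 2] into start offsets [0, 3, 4]; the kernel below then
+ * writes path indices so that paths hitting the same shader end up adjacent in `queued_paths_`. */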
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ /* Launch kernel to fill the active paths arrays. */
+ {
+ /* TODO: this could be smaller for terminated paths based on amount of work we want
+ * to schedule. */
+ const int work_size = max_active_path_index_;
+
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size),
+ &d_queued_paths,
+ &d_num_queued_paths,
+ &d_counter,
+ &d_queued_kernel};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+
+ if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ }
+ else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+ }
+ else {
+ assert(0);
+ }
+}
+
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+
+ /* Launch kernel to fill the active paths arrays. */
+ const int work_size = max_active_path_index_;
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {
+ const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
+
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(kernel, work_size, args);
+}
+
+void PathTraceWorkGPU::compact_states(const int num_active_paths)
+{
+ if (num_active_paths == 0) {
+ max_active_path_index_ = 0;
+ }
+
+ /* Compact fragmented path states into the start of the array, moving any paths
+ * with index higher than the number of active paths into the gaps. */
+ if (max_active_path_index_ == num_active_paths) {
+ return;
+ }
+
+ void *d_compact_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+
+ /* Create array with terminated paths that we can write to. */
+ {
+ /* TODO: can the work size be reduced here? */
+ int offset = num_active_paths;
+ int work_size = num_active_paths;
+ void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+ }
+
+ /* Create array of paths that we need to compact, where the path index is bigger
+ * than the number of active paths. */
+ {
+ int work_size = max_active_path_index_;
+ void *args[] = {
+ &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+ }
+
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ int num_compact_paths = num_queued_paths_.data()[0];
+
+ /* Move paths into gaps. */
+ if (num_compact_paths > 0) {
+ int work_size = num_compact_paths;
+ int active_states_offset = 0;
+ int terminated_states_offset = num_active_paths;
+ void *args[] = {
+ &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+ }
+
+ queue_->synchronize();
+
+ /* Adjust the max active path index now that we know which part of the array is actually used. */
+ max_active_path_index_ = num_active_paths;
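+
+ /* Illustrative example (hypothetical numbers): with num_active_paths = 100 and
+ * max_active_path_index_ = 150, any active states with index >= 100 are moved into terminated
+ * slots below 100, after which all active states fit in [0, 100) and the maximum index drops
+ * to 100. */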
+}
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+ /* If there are existing paths, wait for them to reach the intersect-closest kernel, which will
+ * align the wavefront of the existing and newly added paths. */
+ /* TODO: Check whether counting new intersection kernels here would have a positive effect on
+ * performance. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+ return false;
+ }
+
+ int num_active_paths = get_num_active_paths();
+
+ /* Don't schedule more work if cancelling. */
+ if (is_cancel_requested()) {
+ if (num_active_paths == 0) {
+ finished = true;
+ }
+ return false;
+ }
+
+ finished = false;
+
+ vector<KernelWorkTile> work_tiles;
+
+ int max_num_camera_paths = max_num_paths_;
+ int num_predicted_splits = 0;
+
+ if (has_shadow_catcher()) {
+ /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
+ * make sure there is enough space in the path states array to fit the split states.
+ *
+ * Basically, when adding N new paths we ensure that there are 2*N available path states, so
+ * that all the new paths can be split.
+ *
+ * Note that it is possible that some of the current states can still split, so we need to make
+ * sure there is enough space for them as well. */
+
+ /* Number of currently in-flight states which can still split. */
+ const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+ const int num_available_paths = max_num_paths_ - num_active_paths;
+ const int num_new_paths = num_available_paths / 2;
+ max_num_camera_paths = max(num_active_paths,
+ num_active_paths + num_new_paths - num_scheduled_possible_split);
+ num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+ }
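+
+ /* Illustrative example (hypothetical numbers): with max_num_paths_ = 1000, num_active_paths =
+ * 200 of which 50 can still split, num_available_paths = 800 and num_new_paths = 400, so
+ * max_num_camera_paths = max(200, 200 + 400 - 50) = 550 and num_predicted_splits = 450; even
+ * if every new path and every pending split happens, the total stays within max_num_paths_. */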
+
+ /* Schedule when we're out of paths or there are too few paths to keep the
+ * device occupied. */
+ int num_paths = num_active_paths;
+ if (num_paths == 0 || num_paths < min_num_active_paths_) {
+ /* Get work tiles until the maximum number of paths is reached. */
+ while (num_paths < max_num_camera_paths) {
+ KernelWorkTile work_tile;
+ if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+ work_tiles.push_back(work_tile);
+ num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we couldn't get any more tiles, we're done. */
+ if (work_tiles.size() == 0 && num_paths == 0) {
+ finished = true;
+ return false;
+ }
+ }
+
+ /* Initialize paths from work tiles. */
+ if (work_tiles.size() == 0) {
+ return false;
+ }
+
+ /* Compact the state array when the number of paths becomes small relative to the known
+ * maximum path index, since a sparse array makes computing the active index arrays slow. */
+ compact_states(num_active_paths);
+
+ if (has_shadow_catcher()) {
+ integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+ queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+ }
+
+ enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+ work_tiles.data(),
+ work_tiles.size(),
+ num_active_paths,
+ num_predicted_splits);
+
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits)
+{
+ /* Copy work tiles to device. */
+ if (work_tiles_.size() < num_work_tiles) {
+ work_tiles_.alloc(num_work_tiles);
+ }
+
+ int path_index_offset = num_active_paths;
+ int max_tile_work_size = 0;
+ for (int i = 0; i < num_work_tiles; i++) {
+ KernelWorkTile &work_tile = work_tiles_.data()[i];
+ work_tile = work_tiles[i];
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ work_tile.path_index_offset = path_index_offset;
+ work_tile.work_size = tile_work_size;
+
+ path_index_offset += tile_work_size;
+
+ max_tile_work_size = max(max_tile_work_size, tile_work_size);
+ }
+
+ queue_->copy_to_device(work_tiles_);
+
+ void *d_work_tiles = (void *)work_tiles_.device_pointer;
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ const_cast<int *>(&num_work_tiles),
+ &d_render_buffer,
+ const_cast<int *>(&max_tile_work_size)};
+
+ queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
+ max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+ /* TODO: this is wrong, does not account for duplicates with shadow! */
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ DCHECK_GE(queue_counter->num_queued[i], 0)
+ << "Invalid number of queued states for kernel "
+ << device_kernel_as_string(static_cast<DeviceKernel>(i));
+ num_paths += queue_counter->num_queued[i];
+ }
+
+ return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+ /* There are a few complications with graphics interop when using multiple devices, caused by
+ * the fact that the GPUDisplay has a single texture:
+ *
+ * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+ * attempting to register an OpenGL PBO which has been mapped. This makes sense, because
+ * otherwise one would run into a conflict over where the source of truth is. */
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+ /* Don't attempt to update the GPU display if the device has errors: the error state would
+ * lead to wrong decisions about interop and cause further chained errors. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+ /* If an error happens while trying to use graphics interop, fall back to the native
+ * implementation and do not attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+ * NOTE: Allocation happens at the final resolution, so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, the allocated
+ * memory is shrunk as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+ * transferring zeroes to the device. */
+ queue_->zero_to_device(gpu_display_rgba_half_);
+ }
+
+ PassAccessor::Destination destination(film_->get_display_pass());
+ destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ gpu_display_rgba_half_.copy_from_device();
+
+ gpu_display->copy_pixels_to_texture(
+ gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
+}
+
+bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (!device_graphics_interop_) {
+ device_graphics_interop_ = queue_->graphics_interop_create();
+ }
+
+ const DeviceGraphicsInteropDestination graphics_interop_dst =
+ gpu_display->graphics_interop_get();
+ device_graphics_interop_->set_destination(graphics_interop_dst);
+
+ const device_ptr d_rgba_half = device_graphics_interop_->map();
+ if (!d_rgba_half) {
+ return false;
+ }
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.d_pixels_half_rgba = d_rgba_half;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ device_graphics_interop_->unmap();
+
+ return true;
+}
+
+void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display)
+{
+ if (!device_graphics_interop_) {
+ return;
+ }
+ gpu_display->graphics_interop_activate();
+ device_graphics_interop_ = nullptr;
+ gpu_display->graphics_interop_deactivate();
+}
+
+void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+ const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
+
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+}
+
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
+
+ if (num_active_pixels) {
+ enqueue_adaptive_sampling_filter_x();
+ enqueue_adaptive_sampling_filter_y();
+ queue_->synchronize();
+ }
+
+ return num_active_pixels;
+}
+
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
+{
+ device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+ num_active_pixels.alloc(1);
+
+ queue_->zero_to_device(num_active_pixels);
+
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&effective_buffer_params_.full_x),
+ const_cast<int *>(&effective_buffer_params_.full_y),
+ const_cast<int *>(&effective_buffer_params_.width),
+ const_cast<int *>(&effective_buffer_params_.height),
+ &threshold,
+ &reset,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride,
+ &num_active_pixels.device_pointer};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
+
+ queue_->copy_from_device(num_active_pixels);
+ queue_->synchronize();
+
+ return num_active_pixels.data()[0];
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
+{
+ const int work_size = effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+ /* NOTE: The direct device access to the buffers only happens within this path trace work. The
+ * rest of the communication happens via API calls which involve `copy_render_buffers_from_device()`,
+ * which performs synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
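+/* Counts, on the device, how many of the currently scheduled path states may still split into a
+ * shadow-catcher variant. The count is accumulated into `num_queued_paths_` and read back
+ * synchronously on the host. */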
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
new file mode 100644
index 00000000000..38788122b0d
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/device_graphics_interop.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/work_tile_scheduler.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+
+/* Implementation of PathTraceWork which schedules work to the device in tiles sized to match the
+ * device queue's number of path states.
+ * This implementation is best suited for devices which have a lot of integrator states, such as
+ * GPUs. */
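+/* Roughly: render_samples() asks the WorkTileScheduler for tiles, initializes new path states via
+ * enqueue_work_tiles(), and then repeatedly executes the kernel with the most queued paths (see
+ * get_most_queued_kernel() and enqueue_path_iteration()) until no active paths remain, compacting
+ * states along the way. See the implementation for the exact flow. */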
+class PathTraceWorkGPU : public PathTraceWork {
+ public:
+ PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void alloc_work_memory() override;
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ void alloc_integrator_soa();
+ void alloc_integrator_queue();
+ void alloc_integrator_sorting();
+ void alloc_integrator_path_split();
+
+ /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. */
+ DeviceKernel get_most_queued_kernel() const;
+
+ void enqueue_reset();
+
+ bool enqueue_work_tiles(bool &finished);
+ void enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits);
+
+ bool enqueue_path_iteration();
+ void enqueue_path_iteration(DeviceKernel kernel);
+
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+
+ void compact_states(const int num_active_paths);
+
+ int get_num_active_paths();
+
+ /* Check whether graphics interop can be used for the GPUDisplay update. */
+ bool should_use_graphics_interop();
+
+ /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the
+ * device, then copies pixels to the host and pushes them to the `gpu_display`. */
+ void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+  /* Implementation of `copy_to_gpu_display()` which uses the driver's OpenGL/GPU interoperability
+   * functionality, avoiding a copy of pixels to the host. */
+ bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Synchronously run film conversion kernel and store display result in the given destination. */
+ void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples);
+
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
+ void enqueue_adaptive_sampling_filter_x();
+ void enqueue_adaptive_sampling_filter_y();
+
+ bool has_shadow_catcher() const;
+
+ /* Count how many currently scheduled paths can still split. */
+ int shadow_catcher_count_possible_splits();
+
+ /* Integrator queue. */
+ unique_ptr<DeviceQueue> queue_;
+
+ /* Scheduler which gives work to path tracing threads. */
+ WorkTileScheduler work_tile_scheduler_;
+
+  /* Integrator state for paths. */
+ IntegratorStateGPU integrator_state_gpu_;
+ /* SoA arrays for integrator state. */
+ vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
+ /* Keep track of number of queued kernels. */
+ device_vector<IntegratorQueueCounter> integrator_queue_counter_;
+ /* Shader sorting. */
+ device_vector<int> integrator_shader_sort_counter_;
+ device_vector<int> integrator_shader_raytrace_sort_counter_;
+ /* Path split. */
+ device_vector<int> integrator_next_shadow_catcher_path_index_;
+
+  /* Temporary buffer to get an array of queued paths for a particular kernel. */
+ device_vector<int> queued_paths_;
+ device_vector<int> num_queued_paths_;
+
+ /* Temporary buffer for passing work tiles to kernel. */
+ device_vector<KernelWorkTile> work_tiles_;
+
+ /* Temporary buffer used by the copy_to_gpu_display() whenever graphics interoperability is not
+ * available. Is allocated on-demand. */
+ device_vector<half4> gpu_display_rgba_half_;
+
+ unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
+
+ /* Cached result of device->should_use_graphics_interop(). */
+ bool interop_use_checked_ = false;
+ bool interop_use_ = false;
+
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
+  /* Minimum number of paths which keeps the device busy. If the actual number of paths falls
+   * below this value, more work will be scheduled. */
+ int min_num_active_paths_;
+
+  /* Maximum path index. The effective number of paths used may be smaller than the size of the
+   * integrator_state_ buffer, so this allows to avoid iterating over the full buffer. */
+ int max_active_path_index_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
new file mode 100644
index 00000000000..4eb1dd941f9
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -0,0 +1,1187 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/render_scheduler.h"
+
+#include "render/session.h"
+#include "render/tile.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Render scheduler.
+ */
+
+RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams &params)
+ : headless_(params.headless),
+ background_(params.background),
+ pixel_size_(params.pixel_size),
+ tile_manager_(tile_manager),
+ default_start_resolution_divider_(pixel_size_ * 8)
+{
+ use_progressive_noise_floor_ = !background_;
+}
+
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+ need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
+void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
+{
+ need_schedule_rebalance_works_ = need_schedule_rebalance;
+}
+
+bool RenderScheduler::is_background() const
+{
+ return background_;
+}
+
+void RenderScheduler::set_denoiser_params(const DenoiseParams &params)
+{
+ denoiser_params_ = params;
+}
+
+void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ adaptive_sampling_ = adaptive_sampling;
+}
+
+bool RenderScheduler::is_adaptive_sampling_used() const
+{
+ return adaptive_sampling_.use;
+}
+
+void RenderScheduler::set_start_sample(int start_sample)
+{
+ start_sample_ = start_sample;
+}
+
+int RenderScheduler::get_start_sample() const
+{
+ return start_sample_;
+}
+
+void RenderScheduler::set_num_samples(int num_samples)
+{
+ num_samples_ = num_samples;
+}
+
+int RenderScheduler::get_num_samples() const
+{
+ return num_samples_;
+}
+
+void RenderScheduler::set_time_limit(double time_limit)
+{
+ time_limit_ = time_limit;
+}
+
+double RenderScheduler::get_time_limit() const
+{
+ return time_limit_;
+}
+
+int RenderScheduler::get_rendered_sample() const
+{
+ DCHECK_GT(get_num_rendered_samples(), 0);
+
+ return start_sample_ + get_num_rendered_samples() - 1;
+}
+
+int RenderScheduler::get_num_rendered_samples() const
+{
+ return state_.num_rendered_samples;
+}
+
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+{
+ buffer_params_ = buffer_params;
+
+ update_start_resolution_divider();
+
+ set_num_samples(num_samples);
+
+ /* In background mode never do lower resolution render preview, as it is not really supported
+ * by the software. */
+ if (background_) {
+ state_.resolution_divider = 1;
+ }
+ else {
+    /* NOTE: Start from twice the divider because of the way scheduling works: it halves the
+     * resolution divider first and only then initializes the render work. */
+ state_.resolution_divider = start_resolution_divider_ * 2;
+ }
+
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_time = 0.0;
+ state_.last_display_update_sample = -1;
+
+ state_.last_rebalance_time = 0.0;
+ state_.num_rebalance_requested = 0;
+ state_.num_rebalance_changes = 0;
+ state_.last_rebalance_changed = false;
+ state_.need_rebalance_at_next_work = false;
+
+ /* TODO(sergey): Choose better initial value. */
+ /* NOTE: The adaptive sampling settings might not be available here yet. */
+ state_.adaptive_sampling_threshold = 0.4f;
+
+ state_.last_work_tile_was_denoised = false;
+ state_.tile_result_was_written = false;
+ state_.postprocess_work_scheduled = false;
+ state_.full_frame_work_scheduled = false;
+ state_.full_frame_was_written = false;
+
+ state_.path_trace_finished = false;
+
+ state_.start_render_time = 0.0;
+ state_.end_render_time = 0.0;
+ state_.time_limit_reached = false;
+
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
+ first_render_time_.path_trace_per_sample = 0.0;
+ first_render_time_.denoise_time = 0.0;
+ first_render_time_.display_update_time = 0.0;
+
+ path_trace_time_.reset();
+ denoise_time_.reset();
+ adaptive_filter_time_.reset();
+ display_update_time_.reset();
+ rebalance_time_.reset();
+}
+
+void RenderScheduler::reset_for_next_tile()
+{
+ reset(buffer_params_, num_samples_);
+}
+
+bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
+{
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (render_work_reschedule_on_idle(render_work)) {
+ return true;
+ }
+
+ state_.path_trace_finished = true;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ return false;
+}
+
+bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work)
+{
+ if (!use_progressive_noise_floor_) {
+ return false;
+ }
+
+ /* Move to the next resolution divider. Assume adaptive filtering is not needed during
+ * navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
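+  /* Progressive noise floor: the threshold starts at the initial value set in reset() and is
+   * halved each time this reschedule happens, until it reaches the user-configured threshold.
+   * For example, with a configured threshold of 0.01 and an initial value of 0.4 the scheduled
+   * thresholds are 0.4, 0.2, 0.1, 0.05, 0.025, 0.0125 and finally 0.01. */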
+ if (adaptive_sampling_.use) {
+ if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) {
+ state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2,
+ adaptive_sampling_.threshold);
+
+ render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold;
+ render_work.adaptive_sampling.reset = true;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work)
+{
+ VLOG(3) << "Schedule work for cancel.";
+
+ /* Un-schedule samples: they will not be rendered and should not be counted. */
+ state_.num_rendered_samples -= render_work.path_trace.num_samples;
+
+ const bool has_rendered_samples = get_num_rendered_samples() != 0;
+
+  /* Reset all fields of the previous work, cancelling things like adaptive sampling filtering and
+   * denoising.
+   * However, write requests need to be preserved, since those will not be possible to recover and
+   * writes are only to happen once. */
+ const bool tile_write = render_work.tile.write;
+ const bool full_write = render_work.full.write;
+
+ render_work = RenderWork();
+
+ render_work.tile.write = tile_write;
+ render_work.full.write = full_write;
+
+  /* Do not write the tile if it has zero samples in it, treat it similarly to all other tiles
+   * which got cancelled. */
+ if (!state_.tile_result_was_written && has_rendered_samples) {
+ render_work.tile.write = true;
+ }
+
+ if (!state_.full_frame_was_written) {
+ render_work.full.write = true;
+ }
+
+  /* Update the current tile, but only if any sample was rendered.
+   * This allows the latest state of the tile to be visible while the full buffer is being
+   * processed.
+   *
+   * Note that if there are no samples in the current tile its render buffer might contain pixels
+   * remaining from a previous state.
+   *
+   * If the full result was written, then there is no way any updates were made to the render
+   * buffers. And the buffers might have been freed from the device, so display update is not
+   * possible. */
+ if (has_rendered_samples && !state_.full_frame_was_written) {
+ render_work.display.update = true;
+ }
+}
+
+bool RenderScheduler::done() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (state_.path_trace_finished || state_.time_limit_reached) {
+ return true;
+ }
+
+ return get_num_rendered_samples() >= num_samples_;
+}
+
+RenderWork RenderScheduler::get_render_work()
+{
+ check_time_limit_reached();
+
+ const double time_now = time_dt();
+
+ if (done()) {
+ RenderWork render_work;
+ render_work.resolution_divider = state_.resolution_divider;
+
+ if (!set_postprocess_render_work(&render_work)) {
+ set_full_frame_render_work(&render_work);
+ }
+
+ if (!render_work) {
+ state_.end_render_time = time_now;
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+ }
+
+ RenderWork render_work;
+
+ if (state_.resolution_divider != pixel_size_) {
+ state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_);
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_sample = -1;
+ }
+
+ render_work.resolution_divider = state_.resolution_divider;
+
+ render_work.path_trace.start_sample = get_start_sample_to_path_trace();
+ render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+
+ render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
+
+  /* NOTE: The rebalance scheduling requires the current number of rendered samples to not be
+   * advanced yet. */
+ render_work.rebalance = work_need_rebalance();
+
+  /* NOTE: Advance the number of samples now, so that the filter and denoising checks can see that
+   * all the samples have been rendered. */
+ state_.num_rendered_samples += render_work.path_trace.num_samples;
+
+ render_work.adaptive_sampling.filter = work_need_adaptive_filter();
+ render_work.adaptive_sampling.threshold = work_adaptive_threshold();
+ render_work.adaptive_sampling.reset = false;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.tile.write = done();
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ if (done()) {
+ set_postprocess_render_work(&render_work);
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+}
+
+void RenderScheduler::update_state_for_render_work(const RenderWork &render_work)
+{
+ const double time_now = time_dt();
+
+ if (render_work.rebalance) {
+ state_.last_rebalance_time = time_now;
+ ++state_.num_rebalance_requested;
+ }
+
+  /* A fallback display update time, for the case when the display update fails or when there is
+   * no display at all. */
+ if (render_work.display.update) {
+ state_.last_display_update_time = time_now;
+ state_.last_display_update_sample = state_.num_rendered_samples;
+ }
+
+ state_.last_work_tile_was_denoised = render_work.tile.denoise;
+ state_.tile_result_was_written |= render_work.tile.write;
+ state_.full_frame_was_written |= render_work.full.write;
+}
+
+bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
+{
+ if (state_.postprocess_work_scheduled) {
+ return false;
+ }
+ state_.postprocess_work_scheduled = true;
+
+ bool any_scheduled = false;
+
+ if (need_schedule_cryptomatte_) {
+ render_work->cryptomatte.postprocess = true;
+ any_scheduled = true;
+ }
+
+ if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
+ render_work->tile.denoise = true;
+ any_scheduled = true;
+ }
+
+ if (!state_.tile_result_was_written) {
+ render_work->tile.write = true;
+ any_scheduled = true;
+ }
+
+ if (any_scheduled) {
+ render_work->display.update = true;
+ }
+
+ return any_scheduled;
+}
+
+void RenderScheduler::set_full_frame_render_work(RenderWork *render_work)
+{
+ if (state_.full_frame_work_scheduled) {
+ return;
+ }
+
+ if (!tile_manager_.has_multiple_tiles()) {
+ /* There is only single tile, so all work has been performed already. */
+ return;
+ }
+
+ if (!tile_manager_.done()) {
+ /* There are still tiles to be rendered. */
+ return;
+ }
+
+ if (state_.full_frame_was_written) {
+ return;
+ }
+
+ state_.full_frame_work_scheduled = true;
+
+ render_work->full.write = true;
+}
+
+/* Knowing the time it took to complete a task at the current resolution divider, approximate how
+ * long it would have taken to complete it at the final resolution. */
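+/* For example, with a resolution divider of 4 the render covers 16x fewer pixels, so the measured
+ * time is multiplied by 16 to approximate the full-resolution cost. */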
+static double approximate_final_time(const RenderWork &render_work, double time)
+{
+ if (render_work.resolution_divider == 1) {
+ return time;
+ }
+
+ const double resolution_divider_sq = render_work.resolution_divider *
+ render_work.resolution_divider;
+ return time * resolution_divider_sq;
+}
+
+void RenderScheduler::report_work_begin(const RenderWork &render_work)
+{
+ /* Start counting render time when rendering samples at their final resolution.
+ *
+  /* NOTE: The work might have its path trace part all zero: this happens when a post-processing
+   * work is scheduled after the path tracing. Checking just for the start sample doesn't work here
+   * because it might wrongly be 0. Check whether path tracing is actually happening, as it is
+   * expected to happen in the first work. */
+ if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 &&
+ render_work.path_trace.start_sample == get_start_sample()) {
+ state_.start_render_time = time_dt();
+ }
+}
+
+void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ path_trace_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.path_trace_per_sample = final_time_approx /
+ render_work.path_trace.num_samples;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ path_trace_time_.reset_average();
+ }
+
+ path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
+void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ adaptive_filter_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_report_reset_average(render_work)) {
+ adaptive_filter_time_.reset_average();
+ }
+
+ adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average()
+ << " seconds.";
+}
+
+void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time)
+{
+ denoise_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.denoise_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ denoise_time_.reset_average();
+ }
+
+ denoise_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time)
+{
+ display_update_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.display_update_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ display_update_time_.reset_average();
+ }
+
+ display_update_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds.";
+
+  /* Move the display update moment further in time, so that the logic which checks when the last
+   * update happened has a more reliable point in time (without the path tracing and denoising
+   * parts of the render work). */
+ state_.last_display_update_time = time_dt();
+}
+
+void RenderScheduler::report_rebalance_time(const RenderWork &render_work,
+ double time,
+ bool balance_changed)
+{
+ rebalance_time_.add_wall(time);
+
+ if (work_report_reset_average(render_work)) {
+ rebalance_time_.reset_average();
+ }
+
+ rebalance_time_.add_average(time);
+
+ if (balance_changed) {
+ ++state_.num_rebalance_changes;
+ }
+
+ state_.last_rebalance_changed = balance_changed;
+
+ VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds.";
+}
+
+string RenderScheduler::full_report() const
+{
+ const double render_wall_time = state_.end_render_time - state_.start_render_time;
+ const int num_rendered_samples = get_num_rendered_samples();
+
+ string result = "\nRender Scheduler Summary\n\n";
+
+ {
+ string mode;
+ if (headless_) {
+ mode = "Headless";
+ }
+ else if (background_) {
+ mode = "Background";
+ }
+ else {
+ mode = "Interactive";
+ }
+ result += "Mode: " + mode + "\n";
+ }
+
+ result += "Resolution: " + to_string(buffer_params_.width) + "x" +
+ to_string(buffer_params_.height) + "\n";
+
+ result += "\nAdaptive sampling:\n";
+ result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n";
+ if (adaptive_sampling_.use) {
+ result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n";
+ result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n";
+ result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n";
+ }
+
+ result += "\nDenoiser:\n";
+ result += " Use: " + string_from_bool(denoiser_params_.use) + "\n";
+ if (denoiser_params_.use) {
+ result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n";
+ result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n";
+
+ string passes = "Color";
+ if (denoiser_params_.use_pass_albedo) {
+ passes += ", Albedo";
+ }
+ if (denoiser_params_.use_pass_normal) {
+ passes += ", Normal";
+ }
+
+ result += " Passes: " + passes + "\n";
+ }
+
+ if (state_.num_rebalance_requested) {
+ result += "\nRebalancer:\n";
+ result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) +
+ "\n";
+ result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) +
+ "\n";
+ }
+
+ result += "\nTime (in seconds):\n";
+ result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average");
+ result += string_printf(" %20s %20f %20f\n",
+ "Path Tracing",
+ path_trace_time_.get_wall(),
+ path_trace_time_.get_average());
+
+ if (adaptive_sampling_.use) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Adaptive Filter",
+ adaptive_filter_time_.get_wall(),
+ adaptive_filter_time_.get_average());
+ }
+
+ if (denoiser_params_.use) {
+ result += string_printf(
+ " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average());
+ }
+
+ result += string_printf(" %20s %20f %20f\n",
+ "Display Update",
+ display_update_time_.get_wall(),
+ display_update_time_.get_average());
+
+ if (state_.num_rebalance_requested) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Rebalance",
+ rebalance_time_.get_wall(),
+ rebalance_time_.get_average());
+ }
+
+ const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() +
+ denoise_time_.get_wall() + display_update_time_.get_wall();
+ result += "\n Total: " + to_string(total_time) + "\n";
+
+ result += string_printf(
+ "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time);
+
+  /* When adaptive sampling is used the average time becomes meaningless, because different samples
+   * will likely render a different number of pixels. */
+ if (!adaptive_sampling_.use) {
+ result += string_printf("Average time per sample: %f seconds\n",
+ render_wall_time / num_rendered_samples);
+ }
+
+ return result;
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds() const
+{
+ return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples);
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples(
+ int num_rendered_samples) const
+{
+ double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ num_rendered_samples);
+
+ if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+ const double remaining_render_time = max(0.0,
+ time_limit_ - (time_dt() - state_.start_render_time));
+
+ update_interval = min(update_interval, remaining_render_time);
+ }
+
+ return update_interval;
+}
+
+/* TODO(sergey): This is just a quick implementation, exact values might need to be tweaked based
+ * on more careful experiments with viewport rendering. */
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const
+{
+ /* TODO(sergey): Need a decision on whether this should be using number of samples rendered
+ * within the current render session, or use absolute number of samples with the start sample
+ * taken into account. It will depend on whether the start sample offset clears the render
+ * buffer. */
+
+ if (state_.need_rebalance_at_next_work) {
+ return 0.1;
+ }
+ if (state_.last_rebalance_changed) {
+ return 0.2;
+ }
+
+ if (headless_) {
+ /* In headless mode do rare updates, so that the device occupancy is high, but there are still
+ * progress messages printed to the logs. */
+ return 30.0;
+ }
+
+ if (background_) {
+ if (num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+ }
+
+ /* Render time and number of samples rendered are used to figure out the display update interval.
+ * Render time is used to allow for fast display updates in the first few seconds of rendering
+ * on fast devices. Number of samples rendered is used to allow for potentially quicker display
+ * updates on slow devices during the first few samples. */
+ const double render_time = path_trace_time_.get_wall();
+ if (render_time < 1) {
+ return 0.1;
+ }
+ if (render_time < 2) {
+ return 0.25;
+ }
+ if (render_time < 4) {
+ return 0.5;
+ }
+ if (render_time < 8 || num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+}
+
+int RenderScheduler::calculate_num_samples_per_update() const
+{
+ const double time_per_sample_average = path_trace_time_.get_average();
+ const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average;
+
+ const double update_interval_in_seconds = guess_display_update_interval_in_seconds();
+
+ return max(int(num_samples_in_second * update_interval_in_seconds), 1);
+}
+
+int RenderScheduler::get_start_sample_to_path_trace() const
+{
+ return start_sample_ + state_.num_rendered_samples;
+}
+
+/* Round the number of samples to the closest power of two.
+ * Rounding might happen to a higher or lower value depending on which one is closer. Such behavior
+ * allows the number of samples to be a power of two without diverging from the planned number of
+ * samples too much. */
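+/* For example, 23 rounds down to 16, while 24 and 25 round up to 32 (ties are rounded up). */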
+static inline uint round_num_samples_to_power_of_2(const uint num_samples)
+{
+ if (num_samples == 1) {
+ return 1;
+ }
+
+ if (is_power_of_two(num_samples)) {
+ return num_samples;
+ }
+
+ const uint num_samples_up = next_power_of_two(num_samples);
+ const uint num_samples_down = num_samples_up - (num_samples_up >> 1);
+
+ const uint delta_up = num_samples_up - num_samples;
+ const uint delta_down = num_samples - num_samples_down;
+
+ if (delta_up <= delta_down) {
+ return num_samples_up;
+ }
+
+ return num_samples_down;
+}
+
+int RenderScheduler::get_num_samples_to_path_trace() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return get_num_samples_during_navigation(state_.resolution_divider);
+ }
+
+  /* Always start the full resolution render with a single sample. This gives more instant feedback
+   * to artists, and allows gathering information for the subsequent path tracing works. Do it in
+   * headless mode as well, to give some estimate of how long samples are taking. */
+ if (state_.num_rendered_samples == 0) {
+ return 1;
+ }
+
+ const int num_samples_per_update = calculate_num_samples_per_update();
+ const int path_trace_start_sample = get_start_sample_to_path_trace();
+
+  /* Round the number of samples to a power of two, so that division of path states into tiles goes
+   * in a more integer manner.
+   * This might make updates happen more rarely due to rounding up. In the test scenes this is not
+   * a huge deal because no more than 8 samples were seen to be rendered between updates. If that
+   * becomes a problem we can add some extra rules, like never allowing to round up by more than N
+   * samples. */
+ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
+
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+  /* When enough statistics are available and doing an offline rendering, prefer to keep the device
+   * occupied. */
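+  /* For example, if the previous work rendered 4 samples at an occupancy of 0.35, then about
+   * lround(4 * 0.7 / 0.35) = 8 samples are scheduled next, clamped to the remaining sample
+   * budget. */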
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+ * with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
+
+  /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
+   * device fully occupied, without much overhead of display updates. */
+ if (!adaptive_sampling_.use) {
+ return num_samples_to_render;
+ }
+
+ /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missing. This
+ * is to ensure that the final render is pixel-matched regardless of how many samples per second
+ * compute device can do. */
+
+ return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+}
+
+int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
+{
+ /* Special trick for fast navigation: schedule multiple samples during fast navigation
+ * (which will prefer to use lower resolution to keep up with refresh rate). This gives more
+ * usable visual feedback for artists. There are a couple of tricks though. */
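+  /* In effect: 1 sample while denoising during navigation, otherwise 1 sample at or below the
+   * pixel size, 2 samples at twice the pixel size, and 4 samples at any coarser divider. */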
+
+ if (is_denoise_active_during_update()) {
+ /* When denoising is used during navigation prefer using a higher resolution with less samples
+ * (scheduling less samples here will make it so the resolution_divider calculation will use a
+ * lower value for the divider). This is because both OpenImageDenoiser and OptiX denoiser
+ * give visually better results on a higher resolution image with less samples. */
+ return 1;
+ }
+
+ if (resolution_divider <= pixel_size_) {
+    /* When resolution divider is at or below pixel size, schedule one sample. This doesn't affect
+ * the sample count at this resolution division, but instead assists in the calculation of
+ * the resolution divider. */
+ return 1;
+ }
+
+ if (resolution_divider == pixel_size_ * 2) {
+ /* When resolution divider is the previous step to the final resolution, schedule two samples.
+     * This is so that rendering at the lower resolution does not exceed the time it takes to
+     * render the first sample at the full resolution. */
+ return 2;
+ }
+
+  /* Always render 4 samples, even if the scene is configured for less.
+   * The idea here is to have enough information on the screen. A resolution divider of 2 allows us
+   * to have 4 times extra samples, so the overall worst case timing is the same as the final
+   * resolution at one sample. */
+ return 4;
+}
+
+bool RenderScheduler::work_need_adaptive_filter() const
+{
+ return adaptive_sampling_.need_filter(get_rendered_sample());
+}
+
+float RenderScheduler::work_adaptive_threshold() const
+{
+ if (!use_progressive_noise_floor_) {
+ return adaptive_sampling_.threshold;
+ }
+
+ return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
+}
+
+bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
+{
+ delayed = false;
+ ready_to_display = true;
+
+ if (!denoiser_params_.use) {
+    /* Denoising is disabled, no need to schedule work for it. */
+ return false;
+ }
+
+ if (done()) {
+ /* Always denoise at the last sample. */
+ return true;
+ }
+
+ if (background_) {
+ /* Background render, only denoise when rendering the last sample. */
+ /* TODO(sergey): Follow similar logic to viewport, giving an overview of how final denoised
+ * image looks like even for the background rendering. */
+ return false;
+ }
+
+ /* Viewport render. */
+
+ /* Navigation might render multiple samples at a lower resolution. Those are not to be counted as
+ * final samples. */
+ const int num_samples_finished = state_.resolution_divider == pixel_size_ ?
+ state_.num_rendered_samples :
+ 1;
+
+ /* Immediately denoise when we reach the start sample or last sample. */
+ if (num_samples_finished == denoiser_params_.start_sample ||
+ num_samples_finished == num_samples_) {
+ return true;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (num_samples_finished < denoiser_params_.start_sample) {
+ ready_to_display = false;
+ return false;
+ }
+
+ /* Avoid excessive denoising in viewport after reaching a certain sample count and render time.
+ */
+ /* TODO(sergey): Consider making time interval and sample configurable. */
+ delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 &&
+ (time_dt() - state_.last_display_update_time) < 1.0);
+
+ return !delayed;
+}
+
+bool RenderScheduler::work_need_update_display(const bool denoiser_delayed)
+{
+ if (headless_) {
+    /* Force disable display update in headless mode. There is nothing to display the in-progress
+     * result on. */
+ return false;
+ }
+
+ if (denoiser_delayed) {
+ /* If denoiser has been delayed the display can not be updated as it will not contain
+ * up-to-date state of the render result. */
+ return false;
+ }
+
+ if (!adaptive_sampling_.use) {
+    /* When adaptive sampling is not used the work is scheduled in a way that it keeps the render
+     * device busy for long enough, so that the display update can happen right after the
+ * rendering. */
+ return true;
+ }
+
+ if (done() || state_.last_display_update_sample == -1) {
+    /* Make sure the initial and final results of adaptive sampling are communicated to the
+     * display. */
+ return true;
+ }
+
+  /* For development purposes of adaptive sampling it might be very useful to see all updates of
+   * active pixels after the convergence check. However, it would cause a slowdown for regular
+   * users. Possibly, make it a debug panel option to allow rapid updates, to ease development
+   * without the need to re-compile. */
+ // if (work_need_adaptive_filter()) {
+ // return true;
+ // }
+
+  /* When adaptive sampling is used, it is possible that only a handful of samples of a very simple
+   * scene will be scheduled to a powerful device (in order to not "miss" any of the filtering
+   * points). We take care of skipping updates here based on when the previous display update
+   * happened. */
+ const double update_interval = guess_display_update_interval_in_seconds_for_num_samples(
+ state_.last_display_update_sample);
+ return (time_dt() - state_.last_display_update_time) > update_interval;
+}
+
+bool RenderScheduler::work_need_rebalance()
+{
+ /* This is the minimum time, as the rebalancing can not happen more often than the path trace
+ * work. */
+ static const double kRebalanceIntervalInSeconds = 1;
+
+ if (!need_schedule_rebalance_works_) {
+ return false;
+ }
+
+ if (state_.resolution_divider != pixel_size_) {
+ /* Don't rebalance at a non-final resolution divider. Some reasons for this:
+     * - It will introduce unnecessary overhead during navigation.
+ * - Per-render device timing information is not very reliable yet. */
+ return false;
+ }
+
+ if (state_.num_rendered_samples == 0) {
+ state_.need_rebalance_at_next_work = true;
+ return false;
+ }
+
+ if (state_.need_rebalance_at_next_work) {
+ state_.need_rebalance_at_next_work = false;
+ return true;
+ }
+
+ if (state_.last_rebalance_changed) {
+ return true;
+ }
+
+ return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
+}
+
+void RenderScheduler::update_start_resolution_divider()
+{
+ if (start_resolution_divider_ == 0) {
+    /* The resolution divider has never been calculated before: use the default divider, so that
+     * we have somewhat good initial behavior, giving a chance to collect real numbers. */
+ start_resolution_divider_ = default_start_resolution_divider_;
+ VLOG(3) << "Initial resolution divider is " << start_resolution_divider_;
+ return;
+ }
+
+ if (first_render_time_.path_trace_per_sample == 0.0) {
+ /* Not enough information to calculate better resolution, keep the existing one. */
+ return;
+ }
+
+ const double desired_update_interval_in_seconds =
+ guess_viewport_navigation_update_interval_in_seconds();
+
+ const double actual_time_per_update = first_render_time_.path_trace_per_sample +
+ first_render_time_.denoise_time +
+ first_render_time_.display_update_time;
+
+  /* Allow some percent of tolerance, so that if the render time is close enough to the higher
+   * resolution we prefer to use it instead of going to a way lower resolution with a time way
+   * below the desired one. */
+ const int resolution_divider_for_update = calculate_resolution_divider_for_time(
+ desired_update_interval_in_seconds * 1.4, actual_time_per_update);
+
+ /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
+ * render time is somewhere on a boundary between two resolutions. */
+
+ /* Never increase resolution to higher than the pixel size (which is possible if the scene is
+ * simple and compute device is fast). */
+ start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+
+ VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_;
+}
+
+double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const
+{
+ if (is_denoise_active_during_update()) {
+    /* Use a lower update frequency than in the non-denoised case, to allow having more pixels to
+     * reconstruct the image from. With faster updates and the extra compute required, the
+     * resolution becomes too low to give usable feedback. */
+ /* NOTE: Based on performance of OpenImageDenoiser on CPU. For OptiX denoiser or other denoiser
+ * on GPU the value might need to become lower for faster navigation. */
+ return 1.0 / 12.0;
+ }
+
+  /* For the best match with Blender's viewport the refresh rate should be 60fps. This will
+   * avoid "jelly" effects. However, on non-trivial scenes this can only be achieved with high
+   * values of the resolution divider, which does not give very pleasant updates during navigation.
+   * Choose less frequent updates to allow more noise-free and higher resolution updates. */
+
+  /* TODO(sergey): Can look into a heuristic which will allow to have 60fps if the resolution
+   * divider is not too high. Alternatively, synchronize Blender's overlay updates to Cycles
+   * updates. */
+
+ return 1.0 / 30.0;
+}
+
+bool RenderScheduler::is_denoise_active_during_update() const
+{
+ if (!denoiser_params_.use) {
+ return false;
+ }
+
+ if (denoiser_params_.start_sample > 1) {
+ return false;
+ }
+
+ return true;
+}
+
+bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work)
+{
+ return render_work.resolution_divider == pixel_size_ &&
+ render_work.path_trace.start_sample == start_sample_;
+}
+
+bool RenderScheduler::work_report_reset_average(const RenderWork &render_work)
+{
+  /* When rendering at a non-final resolution divider the time average is not very useful because
+   * it will either bias the average down (due to lower render times on the smaller images) or will
+   * give an incorrect result when trying to estimate the time which would have been spent on the
+   * final resolution.
+   *
+   * So we only accumulate the average for the latest resolution divider which was rendered. */
+ return render_work.resolution_divider != pixel_size_;
+}
+
+void RenderScheduler::check_time_limit_reached()
+{
+ if (time_limit_ == 0.0) {
+ /* No limit is enforced. */
+ return;
+ }
+
+ if (state_.start_render_time == 0.0) {
+ /* Rendering did not start yet. */
+ return;
+ }
+
+ const double current_time = time_dt();
+
+ if (current_time - state_.start_render_time < time_limit_) {
+ /* Time limit is not reached yet. */
+ return;
+ }
+
+ state_.time_limit_reached = true;
+ state_.end_render_time = current_time;
+}
+
+/* --------------------------------------------------------------------
+ * Utility functions.
+ */
+
+int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
+{
+  /* TODO(sergey): There should be a non-iterative analytical formula here. */
+
+ int resolution_divider = 1;
+
+ /* This algorithm iterates through resolution dividers until a divider is found that achieves
+ * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
+ * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
+ * pre_resolution_division_samples and post_resolution_division_samples are used in this
+ * calculation to better predict the performance impact of changing resolution divisions as
+ * the sample count can also change between resolution divisions. */
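+  /* Each doubling of the divider divides the predicted time by 4 * (samples at the previous
+   * divider / samples at the new divider): a coarser resolution shrinks the pixel count by 4x,
+   * but may also schedule more samples during navigation, which eats into that gain. */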
+ while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
+ int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ resolution_divider = resolution_divider * 2;
+ int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
+ }
+
+ return resolution_divider;
+}
+
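+/* Find the smallest power-of-two divider which brings the pixel count at or below resolution^2.
+ * For example, a 1920x1080 viewport with a resolution value of 960 gives a divider of 2
+ * (960x540 pixels). */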
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
+{
+ if (resolution == INT_MAX) {
+ return 1;
+ }
+
+ int resolution_divider = 1;
+ while (width * height > resolution * resolution) {
+ width = max(1, width / 2);
+ height = max(1, height / 2);
+
+ resolution_divider <<= 1;
+ }
+
+ return resolution_divider;
+}
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider)
+{
+ const int pixel_area = width * height;
+ const int resolution = lround(sqrt(pixel_area));
+
+ return resolution / resolution_divider;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
new file mode 100644
index 00000000000..9c2d107e46d
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/adaptive_sampling.h"
+#include "integrator/denoiser.h" /* For DenoiseParams. */
+#include "render/buffers.h"
+#include "util/util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SessionParams;
+class TileManager;
+
+class RenderWork {
+ public:
+ int resolution_divider = 1;
+
+ /* Initialize render buffers.
+ * Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the
+ * baking target. */
+ bool init_render_buffers = false;
+
+ /* Path tracing samples information. */
+ struct {
+ int start_sample = 0;
+ int num_samples = 0;
+ } path_trace;
+
+ struct {
+    /* Check for convergence and filter the mask. */
+ bool filter = false;
+
+ float threshold = 0.0f;
+
+    /* Reset the convergence flag when filtering, forcing a re-check of whether the pixel has
+     * converged. */
+ bool reset = false;
+ } adaptive_sampling;
+
+ struct {
+ bool postprocess = false;
+ } cryptomatte;
+
+  /* Work related to the current tile. */
+ struct {
+ /* Write render buffers of the current tile.
+ *
+ * It is up to the path trace to decide whether writing should happen via user-provided
+ * callback into the rendering software, or via tile manager into a partial file. */
+ bool write = false;
+
+ bool denoise = false;
+ } tile;
+
+  /* Work related to the full-frame render buffer. */
+ struct {
+ /* Write full render result.
+ * Implies reading the partial file from disk. */
+ bool write = false;
+ } full;
+
+ /* Display which is used to visualize render result. */
+ struct {
+ /* Display needs to be updated for the new render. */
+ bool update = false;
+
+ /* Display can use denoised result if available. */
+ bool use_denoised_result = true;
+ } display;
+
+ /* Re-balance multi-device scheduling after rendering this work.
+   * Note that the scheduler does not know anything about devices, so if there is only a single
+   * device used, then it is up to the PathTracer to ignore the balancing. */
+ bool rebalance = false;
+
+ /* Conversion to bool, to simplify checks about whether there is anything to be done for this
+ * work. */
+ inline operator bool() const
+ {
+ return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise ||
+ tile.write || full.write;
+ }
+};
+
+class RenderScheduler {
+ public:
+ RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+ /* Specify whether cryptomatte-related works are to be scheduled. */
+ void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+  /* Allows disabling of the re-balancing works, so that as much as possible is scheduled to a
+   * single device. */
+ void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+ bool is_background() const;
+
+ void set_denoiser_params(const DenoiseParams &params);
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ bool is_adaptive_sampling_used() const;
+
+ /* Start sample for path tracing.
+ * The scheduler will schedule work using this sample as the first one. */
+ void set_start_sample(int start_sample);
+ int get_start_sample() const;
+
+ /* Number of samples to render, starting from start sample.
+ * The scheduler will schedule work in the range of
+ * [start_sample, start_sample + num_samples - 1], inclusively. */
+ void set_num_samples(int num_samples);
+ int get_num_samples() const;
+
+  /* Time limit for the path tracing tasks, in seconds.
+ * Zero disables the limit. */
+ void set_time_limit(double time_limit);
+ double get_time_limit() const;
+
+ /* Get sample up to which rendering has been done.
+ * This is an absolute 0-based value.
+ *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 14.
+ *
+ * If there were no samples rendered, then the behavior is undefined. */
+ int get_rendered_sample() const;
+
+ /* Get number of samples rendered within the current scheduling session.
+ *
+   * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 5.
+ *
+   * Note that this is based on the scheduling information. In practice this means that once work
+   * has been requested to render, the scheduler considers it done. */
+ int get_num_rendered_samples() const;
+
+ /* Reset scheduler, indicating that rendering will happen from scratch.
+ * Resets current rendered state, as well as scheduling information. */
+ void reset(const BufferParams &buffer_params, int num_samples);
+
+  /* Reset scheduler upon switching to the next tile.
+   * Will keep the same number of samples and full-frame render parameters, but will reset progress
+   * and allow scheduling render works from the beginning of the new tile. */
+ void reset_for_next_tile();
+
+  /* Reschedule adaptive sampling work when all pixels have converged.
+   * If there is nothing else to be done for the adaptive sampling (pixels have converged to the
+   * final threshold) then false is returned and the render scheduler will stop scheduling path
+   * tracing works. Otherwise the work's adaptive sampling settings are modified to continue with
+   * a lower threshold. */
+ bool render_work_reschedule_on_converge(RenderWork &render_work);
+
+  /* Reschedule adaptive sampling work when the device is mostly idle, but not all pixels have
+   * converged yet.
+ * If re-scheduling is not possible (adaptive sampling is happening with the final threshold, and
+ * the path tracer is to finish the current pixels) then false is returned. */
+ bool render_work_reschedule_on_idle(RenderWork &render_work);
+
+ /* Reschedule work when rendering has been requested to cancel.
+ *
+ * Will skip all work which is not needed anymore because no more samples will be added (for
+ * example, adaptive sampling filtering and convergence check will be skipped).
+ * Will enable all work needed to make sure all passes are communicated to the software.
+ *
+ * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */
+ void render_work_reschedule_on_cancel(RenderWork &render_work);
+
+ RenderWork get_render_work();
+
+ /* Report that the path tracer started to work, after scene update and loading kernels. */
+ void report_work_begin(const RenderWork &render_work);
+
+ /* Report time (in seconds) which corresponding part of work took. */
+ void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
+ void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_denoise_time(const RenderWork &render_work, double time);
+ void report_display_update_time(const RenderWork &render_work, double time);
+ void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed);
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ protected:
+  /* Check whether all work has been scheduled and the time limit was not exceeded.
+   *
+   * NOTE: Tricky bit: if the time limit was reached, done() is considered to be true, but some
+   * extra work needs to be scheduled to denoise and write the final result. */
+ bool done() const;
+
+  /* Update scheduling state for newly scheduled work.
+   * Takes care of things like checking whether the work was ever denoised, the tile was written,
+   * and states like that. */
+ void update_state_for_render_work(const RenderWork &render_work);
+
+ /* Returns true if any work was scheduled. */
+ bool set_postprocess_render_work(RenderWork *render_work);
+
+  /* Set work which is to be performed after all tiles have been rendered. */
+ void set_full_frame_render_work(RenderWork *render_work);
+
+  /* Update start resolution divider based on the accumulated timing information, preserving a
+   * nice navigation feel. */
+ void update_start_resolution_divider();
+
+ /* Calculate desired update interval in seconds based on the current timings and settings.
+ * Will give an interval which provides good feeling updates during viewport navigation. */
+ double guess_viewport_navigation_update_interval_in_seconds() const;
+
+ /* Check whether denoising is active during interactive update while resolution divider is not
+ * unit. */
+ bool is_denoise_active_during_update() const;
+
+  /* Heuristic which aims to give perceptually pleasant display update intervals: at lower sample
+   * counts and near the beginning of rendering updates happen more often, while with a higher
+   * number of samples and later in the render updates happen less often but device occupancy
+   * goes higher. */
+ double guess_display_update_interval_in_seconds() const;
+ double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const;
+ double guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const;
+
+  /* Calculate the number of samples which can be rendered within the current desired update
+   * interval which is calculated by `guess_display_update_interval_in_seconds()`. */
+ int calculate_num_samples_per_update() const;
+
+  /* Get the start sample and the number of samples which are to be path traced in the current
+   * work. */
+ int get_start_sample_to_path_trace() const;
+ int get_num_samples_to_path_trace() const;
+
+ /* Calculate how many samples there are to be rendered for the very first path trace after reset.
+ */
+  int get_num_samples_during_navigation(int resolution_divider) const;
+
+ /* Whether adaptive sampling convergence check and filter is to happen. */
+ bool work_need_adaptive_filter() const;
+
+  /* Calculate threshold for adaptive sampling. */
+ float work_adaptive_threshold() const;
+
+  /* Check whether the current work needs denoising.
+   * Denoising is not needed if the denoiser is not configured, or when denoising is happening too
+   * often.
+   *
+   * `delayed` will be true when the denoiser is configured for use, but it was delayed for a
+   * later sample, to reduce overhead.
+   *
+   * `ready_to_display` will be false if we may have a denoised result that is outdated due to
+   * increased samples. */
+ bool work_need_denoise(bool &delayed, bool &ready_to_display);
+
+  /* Check whether the current work needs to update the display.
+   *
+   * The `denoiser_delayed` argument is the delayed flag returned by `work_need_denoise()`. */
+ bool work_need_update_display(const bool denoiser_delayed);
+
+ /* Check whether it is time to perform rebalancing for the render work. */
+ bool work_need_rebalance();
+
+ /* Check whether timings of the given work are usable for storing in the `first_render_time_`
+ * used for the resolution divider calculation. */
+ bool work_is_usable_for_first_render_estimation(const RenderWork &render_work);
+
+ /* Check whether the timing report for the given work needs to reset the accumulated average time. */
+ bool work_report_reset_average(const RenderWork &render_work);
+
+ /* Check whether the render time limit has been reached (or exceeded), and if so store related
+ * information in the state so that rendering is considered finished and it is possible to report
+ * average render time information. */
+ void check_time_limit_reached();
+
+ /* Helper class to keep track of task timing.
+ *
+ * Contains two parts: wall time and average. The wall time is the actual wall-clock time of how
+ * long it took to complete all tasks of a type. It is always advanced when the PathTracer reports
+ * a time update.
+ *
+ * The average time is used for scheduling purposes. It is an estimate of how long it takes to
+ * perform the task at the final resolution. */
+ class TimeWithAverage {
+ public:
+ inline void reset()
+ {
+ total_wall_time_ = 0.0;
+
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ inline void add_wall(double time)
+ {
+ total_wall_time_ += time;
+ }
+
+ inline void add_average(double time, int num_measurements = 1)
+ {
+ average_time_accumulator_ += time;
+ num_average_times_ += num_measurements;
+ }
+
+ inline double get_wall() const
+ {
+ return total_wall_time_;
+ }
+
+ inline double get_average() const
+ {
+ if (num_average_times_ == 0) {
+ return 0;
+ }
+ return average_time_accumulator_ / num_average_times_;
+ }
+
+ inline void reset_average()
+ {
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ protected:
+ double total_wall_time_ = 0.0;
+
+ double average_time_accumulator_ = 0.0;
+ int num_average_times_ = 0;
+ };
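+
+ /* Illustrative usage of TimeWithAverage (hypothetical numbers, not taken from this patch): after
+ * two path trace works taking 0.2s and 0.3s, add_wall(0.2) and add_wall(0.3) make get_wall()
+ * return 0.5, while add_average(0.2) and add_average(0.3) make get_average() return 0.25, which
+ * is the value used for scheduling decisions. */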
+
+ struct {
+ int resolution_divider = 1;
+
+ /* Number of rendered samples on top of the start sample. */
+ int num_rendered_samples = 0;
+
+ /* Point in time at which the latest GPUDisplay work has been scheduled. */
+ double last_display_update_time = 0.0;
+ /* Value of -1 means display was never updated. */
+ int last_display_update_sample = -1;
+
+ /* Point in time at which last rebalance has been performed. */
+ double last_rebalance_time = 0.0;
+
+ /* Number of rebalance works which have been requested.
+ * The path tracer might ignore the work if only a single device is rendering. */
+ int num_rebalance_requested = 0;
+
+ /* Number of handled rebalance works which did change the balance across devices. */
+ int num_rebalance_changes = 0;
+
+ bool need_rebalance_at_next_work = false;
+
+ /* Denotes whether the latest performed rebalance work caused an actual rebalance of work across
+ * devices. */
+ bool last_rebalance_changed = false;
+
+ /* Threshold for adaptive sampling which will be used for scheduled work when not using the
+ * progressive noise floor. */
+ float adaptive_sampling_threshold = 0.0f;
+
+ bool last_work_tile_was_denoised = false;
+ bool tile_result_was_written = false;
+ bool postprocess_work_scheduled = false;
+ bool full_frame_work_scheduled = false;
+ bool full_frame_was_written = false;
+
+ bool path_trace_finished = false;
+ bool time_limit_reached = false;
+
+ /* Time at which rendering started and finished. */
+ double start_render_time = 0.0;
+ double end_render_time = 0.0;
+
+ /* Occupancy of the render devices, normalized to the number of samples.
+ *
+ * In a way it is "trailing": when scheduling new work, this occupancy is what was measured while
+ * the previous work was rendered. */
+ int occupancy_num_samples = 0;
+ float occupancy = 1.0f;
+ } state_;
+
+ /* Timing of tasks which were performed for the very first render work at 100% of the
+ * resolution. This timing information is used to estimate the resolution divider for fast
+ * navigation. */
+ struct {
+ double path_trace_per_sample;
+ double denoise_time;
+ double display_update_time;
+ } first_render_time_;
+
+ TimeWithAverage path_trace_time_;
+ TimeWithAverage adaptive_filter_time_;
+ TimeWithAverage denoise_time_;
+ TimeWithAverage display_update_time_;
+ TimeWithAverage rebalance_time_;
+
+ /* Whether cryptomatte-related work will be scheduled. */
+ bool need_schedule_cryptomatte_ = false;
+
+ /* Whether to schedule device load rebalance works.
+ * Rebalancing requires some special treatment for update intervals and such, so if it is known
+ * that the rebalance will be ignored (e.g. due to single-device rendering) it is better to fully
+ * ignore the rebalancing logic. */
+ bool need_schedule_rebalance_works_ = false;
+
+ /* Path tracing work will be scheduled for samples from within
+ * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusively. */
+ int start_sample_ = 0;
+ int num_samples_ = 0;
+
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit_ = 0.0;
+
+ /* Headless rendering without interface. */
+ bool headless_;
+
+ /* Background (offline) rendering. */
+ bool background_;
+
+ /* Pixel size is used to force a lower-resolution render for the final pass. Useful for retina or
+ * other types of hi-dpi displays. */
+ int pixel_size_ = 1;
+
+ TileManager &tile_manager_;
+
+ BufferParams buffer_params_;
+ DenoiseParams denoiser_params_;
+
+ AdaptiveSampling adaptive_sampling_;
+
+ /* Progressively lower adaptive sampling threshold level, keeping the image at a uniform noise
+ * level. */
+ bool use_progressive_noise_floor_ = false;
+
+ /* Default value for the resolution divider which will be used when there is no render time
+ * information available yet.
+ * It is also what defines the upper limit of the automatically calculated resolution divider. */
+ int default_start_resolution_divider_ = 1;
+
+ /* Initial resolution divider which will be used on render scheduler reset. */
+ int start_resolution_divider_ = 0;
+
+ /* Calculate the smallest resolution divider which will bring the actual rendering time below the
+ * desired one. This call assumes a linear dependency of render time on the number of pixels
+ * (quadratic dependency on the resolution divider): a resolution divider of 2 brings render time
+ * down by a factor of 4. */
+ int calculate_resolution_divider_for_time(double desired_time, double actual_time);
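+
+ /* A minimal sketch of the assumed relation (illustrative numbers only): with an actual time of
+ * 1.6s and a desired time of 0.1s, the quadratic assumption time(divider) ~= actual / divider^2
+ * suggests a divider of about sqrt(1.6 / 0.1) = 4, since 1.6 / (4 * 4) = 0.1. */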
+};
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution);
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
new file mode 100644
index 00000000000..465b4a8d4da
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/shader_eval.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress)
+{
+ DCHECK_NE(device_, nullptr);
+}
+
+bool ShaderEval::eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output)
+{
+ bool first_device = true;
+ bool success = true;
+
+ device_->foreach_device([&](Device *device) {
+ if (!first_device) {
+ LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a "
+ "single device.";
+ return;
+ }
+ first_device = false;
+
+ device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
+ device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+
+ /* Allocate and copy device buffers. */
+ DCHECK_EQ(input.device, device);
+ DCHECK_EQ(output.device, device);
+ DCHECK_LE(output.size(), input.size());
+
+ input.alloc(max_num_points);
+ int num_points = fill_input(input);
+ if (num_points == 0) {
+ return;
+ }
+
+ input.copy_to_device();
+ output.alloc(num_points);
+ output.zero_to_device();
+
+ /* Evaluate on CPU or GPU. */
+ success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
+ eval_gpu(device, type, input, output);
+
+ /* Copy data back from device if not cancelled. */
+ if (success) {
+ output.copy_from_device(0, 1, output.size());
+ read_output(output);
+ }
+
+ input.free();
+ output.free();
+ });
+
+ return success;
+}
+
+bool ShaderEval::eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ vector<CPUKernelThreadGlobals> kernel_thread_globals;
+ device->get_cpu_kernel_thread_globals(kernel_thread_globals);
+
+ /* Find required kernel function. */
+ const CPUKernels &kernels = *(device->get_cpu_kernels());
+
+ /* Simple parallel_for over all work items. */
+ const int64_t work_size = output.size();
+ KernelShaderEvalInput *input_data = input.data();
+ float4 *output_data = output.data();
+ bool success = true;
+
+ tbb::task_arena local_arena(device->info.cpu_threads);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) {
+ /* TODO: is this fast enough? */
+ if (progress_.get_cancel()) {
+ success = false;
+ return;
+ }
+
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ KernelGlobals *kg = &kernel_thread_globals[thread_index];
+
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernels.shader_eval_displace(kg, input_data, output_data, work_index);
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernels.shader_eval_background(kg, input_data, output_data, work_index);
+ break;
+ }
+ });
+ });
+
+ return success;
+}
+
+bool ShaderEval::eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ /* Find required kernel function. */
+ DeviceKernel kernel;
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+ break;
+ };
+
+ /* Create device queue. */
+ unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+ queue->init_execution();
+
+ /* Execute work on GPU in chunks, so we can cancel.
+ * TODO: query appropriate chunk size from device. */
+ const int chunk_size = 65536;
+
+ const int work_size = output.size();
+ void *d_input = (void *)input.device_pointer;
+ void *d_output = (void *)output.device_pointer;
+
+ for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+ int d_work_size = min(chunk_size, work_size - d_offset);
+ void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+ queue->enqueue(kernel, d_work_size, args);
+ queue->synchronize();
+
+ if (progress_.get_cancel()) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
new file mode 100644
index 00000000000..7dbf334b8d7
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_function.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class Progress;
+
+enum ShaderEvalType {
+ SHADER_EVAL_DISPLACE,
+ SHADER_EVAL_BACKGROUND,
+};
+
+/* ShaderEval class performs shader evaluation for background light and displacement. */
+class ShaderEval {
+ public:
+ ShaderEval(Device *device, Progress &progress);
+
+ /* Evaluate shader at points specified by KernelShaderEvalInput and write out
+ * RGBA colors to output. */
+ bool eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output);
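+
+ /* Illustrative usage sketch (fill_displacement_input() and write_displacement() are hypothetical
+ * helpers, not part of this API):
+ *
+ * ShaderEval shader_eval(device, progress);
+ * shader_eval.eval(SHADER_EVAL_DISPLACE,
+ * num_points,
+ * [&](device_vector<KernelShaderEvalInput> &input) { return fill_displacement_input(input); },
+ * [&](device_vector<float4> &output) { write_displacement(output); });
+ */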
+
+ protected:
+ bool eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+ bool eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+
+ Device *device_;
+ Progress &progress_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
new file mode 100644
index 00000000000..3387b7bedf1
--- /dev/null
+++ b/intern/cycles/integrator/tile.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/tile.h"
+
+#include "util/util_logging.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size)
+{
+ os << "size: (" << tile_size.width << ", " << tile_size.height << ")";
+ os << ", num_samples: " << tile_size.num_samples;
+ return os;
+}
+
+ccl_device_inline uint round_down_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return prev_power_of_two(x);
+}
+
+ccl_device_inline uint round_up_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return next_power_of_two(x);
+}
+
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states)
+{
+ if (max_num_path_states == 1) {
+ /* Simple case: avoid any calculation, which could cause rounding issues. */
+ return TileSize(1, 1, 1);
+ }
+
+ const int64_t num_pixels = image_size.x * image_size.y;
+ const int64_t num_pixel_samples = num_pixels * num_samples;
+
+ if (max_num_path_states >= num_pixel_samples) {
+ /* Image fully fits into the state (could be border render, for example). */
+ return TileSize(image_size.x, image_size.y, num_samples);
+ }
+
+ /* The idea here is to keep the number of samples per tile as high as possible to improve
+ * coherency across threads.
+ *
+ * Some general ideas:
+ * - Prefer smaller tiles with more samples, which improves spatial coherency of paths.
+ * - Keep values a power of two, so that an integer number of tiles fits into the maximum
+ * number of paths. */
+
+ TileSize tile_size;
+
+ /* Calculate the tile size as the biggest possible one which still fits an entire range of
+ * samples. The idea here is to keep tiles as small as possible, and to keep the device occupied
+ * by scheduling multiple tiles with the same coordinates rendering different samples. */
+ const int num_path_states_per_sample = max_num_path_states / num_samples;
+ if (num_path_states_per_sample != 0) {
+ tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample)));
+ tile_size.height = tile_size.width;
+ }
+ else {
+ tile_size.width = tile_size.height = 1;
+ }
+
+ if (num_samples == 1) {
+ tile_size.num_samples = 1;
+ }
+ else {
+ /* The heuristic here is to have a more uniform division of the sample range: for example, prefer
+ * [32 <38 times>, 8] over [1024, 200]. This allows greedily adding more tiles early on. */
+ tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))),
+ static_cast<uint>(num_samples));
+
+ const int tile_area = tile_size.width * tile_size.height;
+ tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area);
+ }
+
+ DCHECK_GE(tile_size.width, 1);
+ DCHECK_GE(tile_size.height, 1);
+ DCHECK_GE(tile_size.num_samples, 1);
+ DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states);
+
+ return tile_size;
+}
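+
+ /* Worked example under assumed numbers (not derived from any particular device): for a 1920x1080
+ * image, 128 samples and max_num_path_states = 1048576, there are 8192 path states per sample,
+ * giving a 64x64 tile (round_down_to_power_of_two(lround(sqrt(8192)))) with
+ * round_up_to_power_of_two(lround(sqrt(128 / 2))) = 8 samples. Each work tile then uses
+ * 64 * 64 * 8 = 32768 path states, so 32 such tiles fit into the path state budget. */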
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
new file mode 100644
index 00000000000..d0824843ddb
--- /dev/null
+++ b/intern/cycles/integrator/tile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct TileSize {
+ TileSize() = default;
+
+ inline TileSize(int width, int height, int num_samples)
+ : width(width), height(height), num_samples(num_samples)
+ {
+ }
+
+ inline bool operator==(const TileSize &other) const
+ {
+ return width == other.width && height == other.height && num_samples == other.num_samples;
+ }
+ inline bool operator!=(const TileSize &other) const
+ {
+ return !(*this == other);
+ }
+
+ int width = 0, height = 0;
+ int num_samples = 0;
+};
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
+
+ /* Calculate the tile size which is best suited for rendering an image of the given size with the
+ * given number of active path states.
+ * Will attempt to provide a best guess which keeps the path tracing threads of a device as
+ * localized as possible, and has as many threads active for every tile as possible. */
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
new file mode 100644
index 00000000000..9f96fe3632b
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_balancer.h"
+
+#include "util/util_math.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ if (num_infos == 1) {
+ work_balance_infos[0].weight = 1.0;
+ return;
+ }
+
+ /* There are no statistics available, so start with an equal distribution. */
+ const double weight = 1.0 / num_infos;
+ for (WorkBalanceInfo &balance_info : work_balance_infos) {
+ balance_info.weight = weight;
+ }
+}
+
+static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos)
+{
+ double total_time = 0;
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ total_time += info.time_spent;
+ }
+ return total_time;
+}
+
+ /* The balance is based on equalizing the time which devices spend performing a task. Assume that
+ * the average of the observed times is usable for estimating whether more or less work is to be
+ * scheduled, and how big a difference in the work scheduling is needed. */
+
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ const double total_time = calculate_total_time(work_balance_infos);
+ const double time_average = total_time / num_infos;
+
+ double total_weight = 0;
+ vector<double> new_weights;
+ new_weights.reserve(num_infos);
+
+ /* Equalize the overall average time. This means that we don't make every work perform an amount
+ * of work based on the current average, but that after the weight changes the times will
+ * equalize.
+ * Think of it this way: if one of the devices is 10% faster than another, then one device needs
+ * to do 5% less of the current work, and the other needs to do 5% more. */
+ const double lerp_weight = 1.0 / num_infos;
+
+ bool has_big_difference = false;
+
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ const double time_target = lerp(info.time_spent, time_average, lerp_weight);
+ const double new_weight = info.weight * time_target / info.time_spent;
+ new_weights.push_back(new_weight);
+ total_weight += new_weight;
+
+ if (std::fabs(1.0 - time_target / time_average) > 0.02) {
+ has_big_difference = true;
+ }
+ }
+
+ if (!has_big_difference) {
+ return false;
+ }
+
+ const double total_weight_inv = 1.0 / total_weight;
+ for (int i = 0; i < num_infos; ++i) {
+ WorkBalanceInfo &info = work_balance_infos[i];
+ info.weight = new_weights[i] * total_weight_inv;
+ info.time_spent = 0;
+ }
+
+ return true;
+}
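+
+ /* Illustrative example (hypothetical numbers): two devices with equal weights of 0.5 spend 1.2s
+ * and 0.8s on the same work. The average is 1.0s, so with lerp_weight = 0.5 the time targets are
+ * 1.1s and 0.9s, giving raw weights 0.5 * 1.1 / 1.2 ~= 0.458 and 0.5 * 0.9 / 0.8 ~= 0.563, which
+ * normalize to roughly 0.45 and 0.55. The relative difference exceeds 2%, so the function
+ * returns true. */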
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
new file mode 100644
index 00000000000..94e20ecf054
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct WorkBalanceInfo {
+ /* Time spent performing corresponding work. */
+ double time_spent = 0;
+
+ /* Average occupancy of the device while performing the work. */
+ float occupancy = 1.0f;
+
+ /* Normalized weight, ready to be used for work balancing (such as calculating the fraction of
+ * the big tile which is to be rendered on the device). */
+ double weight = 1.0;
+};
+
+ /* Balance work for an initial render iteration, before any statistics are known. */
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos);
+
+ /* Rebalance work after statistics have been accumulated.
+ * Returns true if the balancing did change. */
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
new file mode 100644
index 00000000000..3fc99d5b74d
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_tile_scheduler.h"
+
+#include "device/device_queue.h"
+#include "integrator/tile.h"
+#include "render/buffers.h"
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+WorkTileScheduler::WorkTileScheduler()
+{
+}
+
+void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
+{
+ max_num_path_states_ = max_num_path_states;
+}
+
+void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num)
+{
+ /* Image buffer parameters. */
+ image_full_offset_px_.x = buffer_params.full_x;
+ image_full_offset_px_.y = buffer_params.full_y;
+
+ image_size_px_ = make_int2(buffer_params.width, buffer_params.height);
+
+ offset_ = buffer_params.offset;
+ stride_ = buffer_params.stride;
+
+ /* Samples parameters. */
+ sample_start_ = sample_start;
+ samples_num_ = samples_num;
+
+ /* Initialize new scheduling. */
+ reset_scheduler_state();
+}
+
+void WorkTileScheduler::reset_scheduler_state()
+{
+ tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_);
+
+ VLOG(3) << "Will schedule tiles of size " << tile_size_;
+
+ if (VLOG_IS_ON(3)) {
+ /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling
+ * and purely focusing on the number of used path states. */
+ const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+ tile_size_.num_samples;
+ const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+ VLOG(3) << "Number of unused path states: "
+ << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+ }
+
+ num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+ num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+
+ total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
+ num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
+
+ next_work_index_ = 0;
+ total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
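+
+ /* For example (hypothetical numbers): a 1920x1080 image with 64x64 tiles gives 30x17 = 510
+ * tiles; with 128 samples and 8 samples per tile there are 16 sample ranges, for a total work
+ * size of 510 * 16 = 8160. */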
+}
+
+bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size)
+{
+ /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because
+ * the path trace work can decide to use smaller tile sizes and greedily schedule multiple tiles,
+ * improving overall device occupancy.
+ * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a "scheduling
+ * limit". */
+
+ DCHECK_NE(max_num_path_states_, 0);
+
+ const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1);
+ if (work_index >= total_work_size_) {
+ return false;
+ }
+
+ const int sample_range_index = work_index % num_tiles_per_sample_range_;
+ const int start_sample = sample_range_index * tile_size_.num_samples;
+ const int tile_index = work_index / num_tiles_per_sample_range_;
+ const int tile_y = tile_index / num_tiles_x_;
+ const int tile_x = tile_index - tile_y * num_tiles_x_;
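+
+ /* For example, with num_tiles_x_ = 4 and num_tiles_per_sample_range_ = 3 (illustrative numbers),
+ * work_index = 10 decomposes into sample_range_index = 1, tile_index = 3, tile_y = 0 and
+ * tile_x = 3. */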
+
+ KernelWorkTile work_tile;
+ work_tile.x = tile_x * tile_size_.width;
+ work_tile.y = tile_y * tile_size_.height;
+ work_tile.w = tile_size_.width;
+ work_tile.h = tile_size_.height;
+ work_tile.start_sample = sample_start_ + start_sample;
+ work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+ work_tile.offset = offset_;
+ work_tile.stride = stride_;
+
+ work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
+ work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);
+
+ work_tile.x += image_full_offset_px_.x;
+ work_tile.y += image_full_offset_px_.y;
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ DCHECK_GT(tile_work_size, 0);
+
+ if (max_work_size && tile_work_size > max_work_size) {
+ /* The work did not fit into the requested limit of the work size. Unschedule the tile,
+ * allowing others (or ourselves later on) to pick it up.
+ *
+ * TODO: Such a temporary decrement is not ideal, since it might lead to a situation where another
+ * device sees there is nothing to be done, finishes its work, and leaves all remaining work to
+ * be done by us. */
+ atomic_fetch_and_add_int32(&next_work_index_, -1);
+ return false;
+ }
+
+ *work_tile_ = work_tile;
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
new file mode 100644
index 00000000000..e4c8f701259
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/tile.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+struct KernelWorkTile;
+
+ /* Scheduler of device work tiles.
+ * Takes care of feeding work which needs to be done to multiple devices running in parallel. */
+class WorkTileScheduler {
+ public:
+ WorkTileScheduler();
+
+ /* Maximum number of path states which are allowed to be used by a single scheduled work tile.
+ *
+ * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
+ * this number of states. */
+ void set_max_num_path_states(int max_num_path_states);
+
+ /* Scheduling will happen for pixels within a big tile denoted by the given parameters. */
+ void reset(const BufferParams &buffer_params, int sample_start, int samples_num);
+
+ /* Get work for a device.
+ * Returns true if there is still work to be done and initializes the work tile with all
+ * parameters of this work. If there is nothing remaining to be done, returns false and the
+ * work tile is kept unchanged.
+ *
+ * Optionally pass max_work_size to do nothing if there is no tile small enough. */
+ bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0);
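+
+ /* Typical call sequence (sketch only, not taken from the callers in this patch;
+ * render_samples() is a hypothetical rendering step):
+ *
+ * scheduler.set_max_num_path_states(max_num_paths);
+ * scheduler.reset(buffer_params, 0, num_samples);
+ * KernelWorkTile work_tile;
+ * while (scheduler.get_work(&work_tile)) {
+ * render_samples(work_tile);
+ * }
+ */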
+
+ protected:
+ void reset_scheduler_state();
+
+ /* Maximum allowed path states to be used.
+ *
+ * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
+ * number of path states is kind of a detail. Is there a more generic term from the scheduler
+ * point of view? */
+ int max_num_path_states_ = 0;
+
+ /* Offset in pixels within a global buffer. */
+ int2 image_full_offset_px_ = make_int2(0, 0);
+
+ /* Dimensions of the currently rendering image in pixels. */
+ int2 image_size_px_ = make_int2(0, 0);
+
+ /* Offset and stride of the buffer within which scheduling is happening.
+ * Will be passed over to the KernelWorkTile. */
+ int offset_, stride_;
+
+ /* Start sample index and the number of samples which are to be rendered.
+ * The scheduler will cover the sample range [start, start + num - 1] over the entire image
+ * (splitting it into smaller work tiles). */
+ int sample_start_ = 0;
+ int samples_num_ = 0;
+
+ /* Tile size which will be scheduled for rendering. */
+ TileSize tile_size_;
+
+ /* Number of tiles in X and Y axis of the image. */
+ int num_tiles_x_, num_tiles_y_;
+
+ /* Total number of tiles on the image.
+ * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in the `get_work()`.
+ *
+ * TODO(sergey): Is this an over-optimization? Maybe the cost of calculating the value in
+ * `get_work()` is unmeasurable? */
+ int total_tiles_num_ = 0;
+
+ /* In the case when the number of samples in `tile_size_` is lower than `samples_num_`, denotes
+ * how many tiles are to be "stacked" to cover the entire requested range of samples. */
+ int num_tiles_per_sample_range_ = 0;
+
+ int next_work_index_ = 0;
+ int total_work_size_ = 0;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 0ce33c51778..4196539a9b1 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -22,68 +22,22 @@ set(INC_SYS
)
-set(SRC_CPU_KERNELS
- kernels/cpu/kernel.cpp
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- kernels/cpu/kernel_split_avx.cpp
- kernels/cpu/kernel_split_avx2.cpp
- kernels/cpu/filter.cpp
- kernels/cpu/filter_sse2.cpp
- kernels/cpu/filter_sse3.cpp
- kernels/cpu/filter_sse41.cpp
- kernels/cpu/filter_avx.cpp
- kernels/cpu/filter_avx2.cpp
+set(SRC_DEVICE_CPU
+ device/cpu/kernel.cpp
+ device/cpu/kernel_sse2.cpp
+ device/cpu/kernel_sse3.cpp
+ device/cpu/kernel_sse41.cpp
+ device/cpu/kernel_avx.cpp
+ device/cpu/kernel_avx2.cpp
)
-set(SRC_CUDA_KERNELS
- kernels/cuda/kernel.cu
- kernels/cuda/kernel_split.cu
- kernels/cuda/filter.cu
+set(SRC_DEVICE_CUDA
+ device/cuda/kernel.cu
)
-set(SRC_OPENCL_KERNELS
- kernels/opencl/kernel_adaptive_stopping.cl
- kernels/opencl/kernel_adaptive_filter_x.cl
- kernels/opencl/kernel_adaptive_filter_y.cl
- kernels/opencl/kernel_adaptive_adjust_samples.cl
- kernels/opencl/kernel_bake.cl
- kernels/opencl/kernel_base.cl
- kernels/opencl/kernel_displace.cl
- kernels/opencl/kernel_background.cl
- kernels/opencl/kernel_state_buffer_size.cl
- kernels/opencl/kernel_split_bundle.cl
- kernels/opencl/kernel_data_init.cl
- kernels/opencl/kernel_path_init.cl
- kernels/opencl/kernel_queue_enqueue.cl
- kernels/opencl/kernel_scene_intersect.cl
- kernels/opencl/kernel_lamp_emission.cl
- kernels/opencl/kernel_do_volume.cl
- kernels/opencl/kernel_indirect_background.cl
- kernels/opencl/kernel_shader_setup.cl
- kernels/opencl/kernel_shader_sort.cl
- kernels/opencl/kernel_shader_eval.cl
- kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
- kernels/opencl/kernel_subsurface_scatter.cl
- kernels/opencl/kernel_direct_lighting.cl
- kernels/opencl/kernel_shadow_blocked_ao.cl
- kernels/opencl/kernel_shadow_blocked_dl.cl
- kernels/opencl/kernel_enqueue_inactive.cl
- kernels/opencl/kernel_next_iteration_setup.cl
- kernels/opencl/kernel_indirect_subsurface.cl
- kernels/opencl/kernel_buffer_update.cl
- kernels/opencl/filter.cl
-)
-
-set(SRC_OPTIX_KERNELS
- kernels/optix/kernel_optix.cu
+set(SRC_DEVICE_OPTIX
+ device/optix/kernel.cu
+ device/optix/kernel_shader_raytrace.cu
)
set(SRC_BVH_HEADERS
@@ -105,63 +59,56 @@ set(SRC_HEADERS
kernel_bake.h
kernel_camera.h
kernel_color.h
- kernel_compat_cpu.h
- kernel_compat_cuda.h
- kernel_compat_optix.h
- kernel_compat_opencl.h
kernel_differential.h
kernel_emission.h
kernel_film.h
- kernel_globals.h
kernel_id_passes.h
kernel_jitter.h
kernel_light.h
kernel_light_background.h
kernel_light_common.h
+ kernel_lookup_table.h
kernel_math.h
kernel_montecarlo.h
kernel_passes.h
- kernel_path.h
- kernel_path_branched.h
- kernel_path_common.h
kernel_path_state.h
- kernel_path_surface.h
- kernel_path_subsurface.h
- kernel_path_volume.h
kernel_profiling.h
kernel_projection.h
- kernel_queues.h
kernel_random.h
kernel_shader.h
- kernel_shadow.h
- kernel_subsurface.h
+ kernel_shadow_catcher.h
kernel_textures.h
kernel_types.h
- kernel_volume.h
kernel_work_stealing.h
kernel_write_passes.h
)
-set(SRC_KERNELS_CPU_HEADERS
- kernel.h
- kernels/cpu/kernel_cpu.h
- kernels/cpu/kernel_cpu_impl.h
- kernels/cpu/kernel_cpu_image.h
- kernels/cpu/filter_cpu.h
- kernels/cpu/filter_cpu_impl.h
+set(SRC_DEVICE_CPU_HEADERS
+ device/cpu/compat.h
+ device/cpu/image.h
+ device/cpu/globals.h
+ device/cpu/kernel.h
+ device/cpu/kernel_arch.h
+ device/cpu/kernel_arch_impl.h
)
-
-set(SRC_KERNELS_CUDA_HEADERS
- kernels/cuda/kernel_config.h
- kernels/cuda/kernel_cuda_image.h
+set(SRC_DEVICE_GPU_HEADERS
+ device/gpu/image.h
+ device/gpu/kernel.h
+ device/gpu/parallel_active_index.h
+ device/gpu/parallel_prefix_sum.h
+ device/gpu/parallel_reduce.h
+ device/gpu/parallel_sorted_index.h
)
-set(SRC_KERNELS_OPTIX_HEADERS
+set(SRC_DEVICE_CUDA_HEADERS
+ device/cuda/compat.h
+ device/cuda/config.h
+ device/cuda/globals.h
)
-set(SRC_KERNELS_OPENCL_HEADERS
- kernels/opencl/kernel_split_function.h
- kernels/opencl/kernel_opencl_image.h
+set(SRC_DEVICE_OPTIX_HEADERS
+ device/optix/compat.h
+ device/optix/globals.h
)
set(SRC_CLOSURE_HEADERS
@@ -259,25 +206,32 @@ set(SRC_GEOM_HEADERS
geom/geom_object.h
geom/geom_patch.h
geom/geom_primitive.h
+ geom/geom_shader_data.h
geom/geom_subd_triangle.h
geom/geom_triangle.h
geom/geom_triangle_intersect.h
geom/geom_volume.h
)
-set(SRC_FILTER_HEADERS
- filter/filter.h
- filter/filter_defines.h
- filter/filter_features.h
- filter/filter_features_sse.h
- filter/filter_kernel.h
- filter/filter_nlm_cpu.h
- filter/filter_nlm_gpu.h
- filter/filter_prefilter.h
- filter/filter_reconstruction.h
- filter/filter_transform.h
- filter/filter_transform_gpu.h
- filter/filter_transform_sse.h
+set(SRC_INTEGRATOR_HEADERS
+ integrator/integrator_init_from_bake.h
+ integrator/integrator_init_from_camera.h
+ integrator/integrator_intersect_closest.h
+ integrator/integrator_intersect_shadow.h
+ integrator/integrator_intersect_subsurface.h
+ integrator/integrator_intersect_volume_stack.h
+ integrator/integrator_megakernel.h
+ integrator/integrator_shade_background.h
+ integrator/integrator_shade_light.h
+ integrator/integrator_shade_shadow.h
+ integrator/integrator_shade_surface.h
+ integrator/integrator_shade_volume.h
+ integrator/integrator_state.h
+ integrator/integrator_state_flow.h
+ integrator/integrator_state_template.h
+ integrator/integrator_state_util.h
+ integrator/integrator_subsurface.h
+ integrator/integrator_volume_stack.h
)
set(SRC_UTIL_HEADERS
@@ -333,36 +287,6 @@ set(SRC_UTIL_HEADERS
../util/util_types_vector3_impl.h
)
-set(SRC_SPLIT_HEADERS
- split/kernel_adaptive_adjust_samples.h
- split/kernel_adaptive_filter_x.h
- split/kernel_adaptive_filter_y.h
- split/kernel_adaptive_stopping.h
- split/kernel_branched.h
- split/kernel_buffer_update.h
- split/kernel_data_init.h
- split/kernel_direct_lighting.h
- split/kernel_do_volume.h
- split/kernel_enqueue_inactive.h
- split/kernel_holdout_emission_blurring_pathtermination_ao.h
- split/kernel_indirect_background.h
- split/kernel_indirect_subsurface.h
- split/kernel_lamp_emission.h
- split/kernel_next_iteration_setup.h
- split/kernel_path_init.h
- split/kernel_queue_enqueue.h
- split/kernel_scene_intersect.h
- split/kernel_shader_setup.h
- split/kernel_shader_sort.h
- split/kernel_shader_eval.h
- split/kernel_shadow_blocked_ao.h
- split/kernel_shadow_blocked_dl.h
- split/kernel_split_common.h
- split/kernel_split_data.h
- split/kernel_split_data_types.h
- split/kernel_subsurface_scatter.h
-)
-
set(LIB
)
@@ -393,21 +317,17 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
# build for each arch
- set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
+ set(cuda_sources device/cuda/kernel.cu
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
- set(cuda_filter_sources kernels/cuda/filter.cu
- ${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_FILTER_HEADERS}
- ${SRC_UTIL_HEADERS}
- )
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental)
@@ -427,7 +347,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
endif()
- set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+ set(cuda_kernel_src "/device/cuda/${name}.cu")
set(cuda_flags ${flags}
-D CCL_NAMESPACE_BEGIN=
@@ -435,7 +355,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
-D NVCC
-m ${CUDA_BITS}
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
- -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
+ -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
--use_fast_math
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
@@ -523,14 +443,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
if(DEFINED cuda_nvcc_executable AND DEFINED cuda_toolkit_root_dir)
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} filter "" "${cuda_filter_sources}" FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${cuda_sources}" FALSE)
- if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
- # Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel_split "-D __SPLIT__" "${cuda_sources}" FALSE)
- endif()
-
if(WITH_CYCLES_CUDA_BUILD_SERIAL)
set(prev_arch ${arch})
endif()
@@ -547,15 +461,15 @@ endif()
# OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
- macro(CYCLES_OPTIX_KERNEL_ADD name flags)
- set(input "kernels/optix/kernel_optix.cu")
+ macro(CYCLES_OPTIX_KERNEL_ADD name input flags)
set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
set(cuda_flags ${flags}
-I "${OPTIX_INCLUDE_DIR}"
-I "${CMAKE_CURRENT_SOURCE_DIR}/.."
- -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda"
--use_fast_math
+ -Wno-deprecated-gpu-targets
-o ${output})
if(WITH_NANOVDB)
@@ -580,11 +494,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND ${CUBIN_CC_ENV}
@@ -603,11 +519,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND
@@ -624,8 +542,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
endmacro()
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__")
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix
+ "device/optix/kernel.cu"
+ "")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix_shader_raytrace
+ "device/optix/kernel_shader_raytrace.cu"
+ "--keep-device-functions")
add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
cycles_set_solution_folder(cycles_kernel_optix)
@@ -659,62 +583,47 @@ if(WITH_COMPILER_ASAN)
endif()
endif()
-set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
cycles_add_library(cycles_kernel "${LIB}"
- ${SRC_CPU_KERNELS}
- ${SRC_CUDA_KERNELS}
- ${SRC_OPTIX_KERNELS}
- ${SRC_OPENCL_KERNELS}
+ ${SRC_DEVICE_CPU}
+ ${SRC_DEVICE_CUDA}
+ ${SRC_DEVICE_OPTIX}
${SRC_HEADERS}
- ${SRC_KERNELS_CPU_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
- ${SRC_KERNELS_OPENCL_HEADERS}
+ ${SRC_DEVICE_CPU_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
- ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
- ${SRC_SPLIT_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
)
source_group("bvh" FILES ${SRC_BVH_HEADERS})
source_group("closure" FILES ${SRC_CLOSURE_HEADERS})
-source_group("filter" FILES ${SRC_FILTER_HEADERS})
source_group("geom" FILES ${SRC_GEOM_HEADERS})
+source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_HEADERS})
-source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS})
-source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS})
-source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS})
-source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS})
-source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS})
+source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
+source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
+source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
source_group("svm" FILES ${SRC_SVM_HEADERS})
if(WITH_CYCLES_CUDA)
@@ -724,31 +633,20 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
add_dependencies(cycles_kernel cycles_kernel_optix)
endif()
-# OpenCL kernel
-
-# set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
-# add_custom_command(
-# OUTPUT ${KERNEL_PREPROCESSED}
-# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
-# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
-# add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
-# delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
+# Install kernel source for runtime compilation
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
-
if(WITH_NANOVDB)
set(SRC_NANOVDB_HEADERS
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index acf29cf1baf..539e9fd05fb 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -25,6 +25,8 @@
* the code has been extended and modified to support more primitives and work
* with CPU/CUDA/OpenCL. */
+#pragma once
+
#ifdef __EMBREE__
# include "kernel/bvh/bvh_embree.h"
#endif
@@ -152,13 +154,11 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray)
return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
}
-ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect(const KernelGlobals *kg,
const Ray *ray,
const uint visibility,
Intersection *isect)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT);
-
#ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -238,15 +238,13 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
}
#ifdef __BVH_LOCAL__
-ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
uint *lcg_state,
int max_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF;
uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF;
@@ -313,8 +311,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
float3 dir = ray->D;
float3 idir = ray->D;
Transform ob_itfm;
- rtc_ray.tfar = bvh_instance_motion_push(
- kg, local_object, ray, &P, &dir, &idir, ray->t, &ob_itfm);
+ rtc_ray.tfar = ray->t *
+ bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
/* bvh_instance_motion_push() returns the inverse transform but
* it's not needed here. */
(void)ob_itfm;
@@ -353,15 +351,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
#endif
#ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
uint visibility,
uint max_hits,
uint *num_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)isect) & 0xFFFFFFFF;
uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF;
@@ -401,17 +397,13 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
ctx.isect_s = isect;
ctx.max_hits = max_hits;
- ctx.num_hits = 0;
IntersectContext rtc_ctx(&ctx);
RTCRay rtc_ray;
kernel_embree_setup_ray(*ray, rtc_ray, visibility);
rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
- if (ctx.num_hits > max_hits) {
- return true;
- }
*num_hits = ctx.num_hits;
- return rtc_ray.tfar == -INFINITY;
+ return ctx.opaque_hit;
}
# endif /* __EMBREE__ */
@@ -439,13 +431,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
#endif /* __SHADOW_RECORD_ALL__ */
#ifdef __VOLUME__
-ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
-
# ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -498,14 +488,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
#endif /* __VOLUME__ */
#ifdef __VOLUME_RECORD_ALL__
-ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+ccl_device_intersect uint scene_intersect_volume_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint max_hits,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL);
-
if (!scene_intersect_valid(ray)) {
return false;
}
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
index 4605c3ea51d..092d770dcac 100644
--- a/intern/cycles/kernel/bvh/bvh_embree.h
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -14,14 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
#include <embree3/rtcore_ray.h>
#include <embree3/rtcore_scene.h>
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-// clang-format on
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
#include "util/util_vector.h"
@@ -36,25 +35,29 @@ struct CCLIntersectContext {
RAY_VOLUME_ALL = 4,
} RayType;
- KernelGlobals *kg;
+ const KernelGlobals *kg;
RayType type;
/* for shadow rays */
Intersection *isect_s;
int max_hits;
int num_hits;
+ float max_t;
+ bool opaque_hit;
/* for SSS Rays: */
LocalIntersection *local_isect;
int local_object_id;
uint *lcg_state;
- CCLIntersectContext(KernelGlobals *kg_, RayType type_)
+ CCLIntersectContext(const KernelGlobals *kg_, RayType type_)
{
kg = kg_;
type = type_;
max_hits = 1;
num_hits = 0;
+ max_t = FLT_MAX;
+ opaque_hit = false;
isect_s = NULL;
local_isect = NULL;
local_object_id = -1;
@@ -98,7 +101,7 @@ ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID;
}
-ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect)
@@ -123,7 +126,7 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
isect->type = kernel_tex_fetch(__prim_type, isect->prim);
}
-ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_sss_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect,
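
The new max_t and opaque_hit fields back the rewritten Embree shadow query above: scene_intersect_shadow_all() now returns ctx.opaque_hit, and max_t appears intended to let the per-hit handling clip the ray once enough transparent hits were recorded (its actual use is not shown in this excerpt). A standalone sketch of those field semantics; report_hit() is a hypothetical stand-in for the real per-hit hook, and only the field meanings follow the struct and the traversal logic in this patch:

#include <cfloat>
#include <cstdio>

struct ShadowContext {
  int max_hits = 8;
  int num_hits = 0;
  float max_t = FLT_MAX;   /* hits farther than this are ignored */
  bool opaque_hit = false; /* set as soon as an opaque surface blocks the ray */
};

/* Hypothetical per-hit callback; returns true when the query can stop early. */
static bool report_hit(ShadowContext &ctx, float t, bool transparent_shadow)
{
  if (t > ctx.max_t) {
    return false;
  }
  if (!transparent_shadow || ctx.max_hits == 0) {
    ctx.opaque_hit = true; /* all light is blocked, terminate the query */
    return true;
  }
  ctx.num_hits++; /* counted even if there is no room left to record the hit */
  return false;
}

int main()
{
  ShadowContext ctx;
  bool blocked = report_hit(ctx, 1.0f, true); /* transparent hit, keep going */
  blocked = report_hit(ctx, 2.0f, false);     /* opaque hit, stop */
  printf("blocked=%d opaque_hit=%d num_hits=%d\n", (int)blocked, (int)ctx.opaque_hit, ctx.num_hits);
  return 0;
}
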
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 4006c9c1632..90b9f410b29 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
@@ -74,9 +74,9 @@ ccl_device_inline
if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
- isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
#endif
object = local_object;
}
@@ -196,7 +196,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 5367bdb633c..15cd0f22213 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -16,7 +16,7 @@
// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
// 3-vector which might be faster.
-ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(const KernelGlobals *kg,
int node_addr,
int child)
{
@@ -28,7 +28,7 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
return space;
}
-ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -76,7 +76,7 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
#endif
}
-ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float t,
@@ -102,7 +102,7 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg
return tnear <= tfar;
}
-ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -134,7 +134,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
return mask;
}
-ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 2e94b1d7c37..0ae36fccf9b 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
@@ -68,10 +68,10 @@ ccl_device_inline
Transform ob_itfm;
#endif
- int num_hits_in_instance = 0;
+ float t_world_to_instance = 1.0f;
*num_hits = 0;
- isect_array->t = tmax;
+ Intersection *isect = isect_array;
/* traversal loop */
do {
@@ -147,13 +147,14 @@ ccl_device_inline
switch (p_type) {
case PRIMITIVE_TRIANGLE: {
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect, P, dir, isect_t, visibility, object, prim_addr);
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect_t, ray->time, visibility, object, prim_addr);
break;
}
#endif
@@ -163,8 +164,16 @@ ccl_device_inline
case PRIMITIVE_CURVE_RIBBON:
case PRIMITIVE_MOTION_CURVE_RIBBON: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- hit = curve_intersect(
- kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect_t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
break;
}
#endif
@@ -176,27 +185,49 @@ ccl_device_inline
/* shadow ray early termination */
if (hit) {
+ /* Convert intersection distance to world space. */
+ isect->t /= t_world_to_instance;
+
/* detect if this surface has a shader with transparent shadows */
/* todo: optimize so primitive visibility flag indicates if
* the primitive has a transparent shadow shader? */
- const int flags = intersection_get_shader_flags(kg, isect_array);
+ const int flags = intersection_get_shader_flags(kg, isect);
- /* if no transparent shadows, all light is blocked */
- if (!(flags & SD_HAS_TRANSPARENT_SHADOW)) {
- return true;
- }
- /* if maximum number of hits reached, block all light */
- else if (*num_hits == max_hits) {
+ if (!(flags & SD_HAS_TRANSPARENT_SHADOW) || max_hits == 0) {
+ /* If no transparent shadows, all light is blocked and we can
+ * stop immediately. */
return true;
}
- /* move on to next entry in intersections array */
- isect_array++;
+      /* Increase the number of hits, possibly beyond max_hits; the extra hits
+       * are simply not recorded and only the max_hits closest are kept. */
(*num_hits)++;
- num_hits_in_instance++;
- isect_array->t = isect_t;
+ if (*num_hits >= max_hits) {
+ /* If maximum number of hits reached, find the intersection with
+ * the largest distance to potentially replace when another hit
+ * is found. */
+ const int num_recorded_hits = min(max_hits, *num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ isect = isect_array + max_recorded_hit;
+
+ /* Limit the ray distance and stop counting hits beyond this. */
+ isect_t = max_recorded_t * t_world_to_instance;
+ }
+ else {
+        /* Still have space for intersections, so use the next slot. */
+ isect = isect + 1;
+ }
}
prim_addr++;
@@ -207,13 +238,14 @@ ccl_device_inline
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ t_world_to_instance = bvh_instance_motion_push(
+ kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
+      /* Convert ray length to object space. */
+ isect_t *= t_world_to_instance;
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -228,32 +260,19 @@ ccl_device_inline
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-
#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
#else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
#endif
- /* scale isect->t to adjust for instancing */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
+      /* Restore world space ray length. If the maximum number of hits was
+       * exceeded, this distance is reduced so that only the closest hits are
+       * recorded. Otherwise use the original ray length. */
+ isect_t = (max_hits && *num_hits > max_hits) ? isect->t : tmax;
object = OBJECT_NONE;
+ t_world_to_instance = 1.0f;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
@@ -262,7 +281,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
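
The rewritten shadow traversal above keeps only the max_hits closest transparent hits: once the record array is full, the farthest recorded hit becomes the slot to overwrite and the ray is clipped to that distance, while num_hits keeps counting hits that are no longer recorded. A simplified standalone sketch of that bookkeeping, using a plain vector of distances instead of the kernel's Intersection array (the input distances are arbitrary):

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  const size_t max_hits = 3;
  std::vector<float> recorded; /* distances of the kept hits */
  float t_clip = 10.0f;        /* current ray length; hits beyond it are ignored */
  int num_hits = 0;            /* counted even when a hit is not recorded */

  const float incoming[] = {7.0f, 2.0f, 9.0f, 4.0f, 1.0f};
  for (float t : incoming) {
    if (t > t_clip) {
      continue;
    }
    num_hits++;
    if (recorded.size() < max_hits) {
      recorded.push_back(t);
    }
    else {
      /* Replace the farthest recorded hit with the new, closer one. */
      *std::max_element(recorded.begin(), recorded.end()) = t;
    }
    if (recorded.size() == max_hits) {
      /* Clip the ray so hits beyond the farthest kept one stop counting. */
      t_clip = *std::max_element(recorded.begin(), recorded.end());
    }
  }

  /* Prints: num_hits=5, kept 3 closest hits (distances 1, 2 and 4). */
  printf("num_hits=%d, kept %d closest hits\n", num_hits, (int)recorded.size());
  return 0;
}
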
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 89250a8d60a..a26d8c514f3 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -31,7 +31,7 @@
* BVH_MOTION: motion blur rendering
*/
-ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -136,7 +136,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
case PRIMITIVE_TRIANGLE: {
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
+ if (triangle_intersect(
+ kg, isect, P, dir, isect->t, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -149,7 +150,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if (motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -166,8 +167,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
- const bool hit = curve_intersect(
- kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ const bool hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect->t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
if (hit) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -184,10 +193,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -218,7 +226,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
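
triangle_intersect(), motion_triangle_intersect() and curve_intersect() now receive the current maximum distance (isect->t, or the clipped isect_t in the shadow path) as an explicit tmax argument rather than having the caller store it into the intersection record beforehand, as the removed isect_array->t = isect_t assignments above suggest. A standalone sketch of the role that argument plays, with a trivial plane test standing in for the real primitive intersectors:

#include <cstdio>

/* Accepts a hit and writes t_out only for t strictly inside (0, tmax). */
static bool intersect_plane_z0(float origin_z, float dir_z, float tmax, float *t_out)
{
  if (dir_z == 0.0f) {
    return false;
  }
  const float t = -origin_z / dir_z;
  if (t <= 0.0f || t >= tmax) {
    return false;
  }
  *t_out = t;
  return true;
}

int main()
{
  float closest_t = 5.0f; /* plays the role of isect->t */
  float t;
  /* The plane hit at t = 10 is rejected: it is farther than the current closest hit. */
  if (intersect_plane_z0(10.0f, -1.0f, closest_t, &t)) {
    closest_t = t;
  }
  printf("closest_t = %f\n", closest_t);
  return 0;
}
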
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index 98e6ec25d15..6039e707fc3 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BVH_TYPES__
-#define __BVH_TYPES__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -43,5 +42,3 @@ CCL_NAMESPACE_BEGIN
#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
CCL_NAMESPACE_END
-
-#endif /* __BVH_TYPES__ */
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
index b1faebce957..21384457b16 100644
--- a/intern/cycles/kernel/bvh/bvh_util.h
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -71,86 +71,6 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
#endif
}
-/* This function should be used to compute a modified ray start position for
- * rays leaving from a surface. The algorithm slightly distorts flat surface
- * of a triangle. Surface is lifted by amount h along normal n in the incident
- * point. */
-
-ccl_device_inline float3 smooth_surface_offset(KernelGlobals *kg, ShaderData *sd, float3 Ng)
-{
- float3 V[3], N[3];
- triangle_vertices_and_normals(kg, sd->prim, V, N);
-
- const float u = sd->u, v = sd->v;
- const float w = 1 - u - v;
- float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
- float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
-
- object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
-
- /* Parabolic approximation */
- float a = dot(N[2] - N[0], V[0] - V[2]);
- float b = dot(N[2] - N[1], V[1] - V[2]);
- float c = dot(N[1] - N[0], V[1] - V[0]);
- float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
-
- /* Check flipped normals */
- if (dot(n, Ng) > 0) {
- /* Local linear envelope */
- float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
- float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
- float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
- h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
- h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
- h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
- h = max(min(min(h0, h1), h2), h * 0.5f);
- }
- else {
- float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
- float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
- float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
- h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
- h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
- h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
- h = min(-min(min(h0, h1), h2), h * 0.5f);
- }
-
- return n * h;
-}
-
-/* Ray offset to avoid shadow terminator artifact. */
-
-ccl_device_inline float3 ray_offset_shadow(KernelGlobals *kg, ShaderData *sd, float3 L)
-{
- float NL = dot(sd->N, L);
- bool transmit = (NL < 0.0f);
- float3 Ng = (transmit ? -sd->Ng : sd->Ng);
- float3 P = ray_offset(sd->P, Ng);
-
- if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
- const float offset_cutoff =
- kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
- /* Do ray offset (heavy stuff) only for close to be terminated triangles:
- * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also
- * make a smooth transition near the threshold. */
- if (offset_cutoff > 0.0f) {
- float NgL = dot(Ng, L);
- float offset_amount = 0.0f;
- if (NL < offset_cutoff) {
- offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
- }
- else {
- offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
- }
- if (offset_amount > 0.0f) {
- P += smooth_surface_offset(kg, sd, Ng) * offset_amount;
- }
- }
- }
-
- return P;
-}
-
#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
/* ToDo: Move to another file? */
ccl_device int intersections_compare(const void *a, const void *b)
@@ -193,10 +113,10 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
}
#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
-/* Utility to quickly get a shader flags from an intersection. */
+/* Utility to quickly get flags from an intersection. */
-ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
{
const int prim = kernel_tex_fetch(__prim_index, isect->prim);
int shader = 0;
@@ -217,14 +137,14 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_rest
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
}
-ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_from_isect_prim(
+ const KernelGlobals *ccl_restrict kg, const int isect_prim)
{
- const int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ const int prim = kernel_tex_fetch(__prim_index, isect_prim);
int shader = 0;
#ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE)
+ if (kernel_tex_fetch(__prim_type, isect_prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
shader = kernel_tex_fetch(__tri_shader, prim);
@@ -239,7 +159,13 @@ ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict k
return shader & SHADER_MASK;
}
-ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict kg,
+ccl_device_forceinline int intersection_get_shader(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ return intersection_get_shader_from_isect_prim(kg, isect->prim);
+}
+
+ccl_device_forceinline int intersection_get_object(const KernelGlobals *ccl_restrict kg,
const Intersection *ccl_restrict isect)
{
if (isect->object != OBJECT_NONE) {
@@ -249,4 +175,12 @@ ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict k
return kernel_tex_fetch(__prim_object, isect->prim);
}
+ccl_device_forceinline int intersection_get_object_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ const int object = intersection_get_object(kg, isect);
+
+ return kernel_tex_fetch(__object_flag, object);
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 1f2ea47269b..0411d9c522d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -147,7 +147,7 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
+ triangle_intersect(kg, isect, P, dir, isect->t, visibility, object, prim_addr);
}
break;
}
@@ -165,7 +165,7 @@ ccl_device_inline
continue;
}
motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr);
}
break;
}
@@ -181,10 +181,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -222,7 +221,7 @@ ccl_device_inline
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index a8664cc4331..4874270f15d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ uint BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
@@ -150,7 +150,8 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect_array, P, dir, isect_t, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -190,7 +191,7 @@ ccl_device_inline
continue;
}
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect_array, P, dir, isect_t, ray->time, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -228,10 +229,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
num_hits_in_instance = 0;
@@ -289,7 +289,7 @@ ccl_device_inline
return num_hits;
}
-ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline uint BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index 99a5a675976..72a8c2ba090 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 6f2f2ebb202..4eb8bcae997 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/closure/bsdf_ashikhmin_velvet.h"
#include "kernel/closure/bsdf_diffuse.h"
@@ -109,7 +111,7 @@ ccl_device_inline float shift_cos_in(float cos_in, const float frequency_multipl
return val;
}
-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_inline int bsdf_sample(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -429,21 +431,6 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
-#endif
default:
label = LABEL_NONE;
break;
@@ -482,15 +469,16 @@ ccl_device
ccl_device_inline
#endif
float3
- bsdf_eval(KernelGlobals *kg,
+ bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
+ const bool is_transmission,
float *pdf)
{
- float3 eval;
+ float3 eval = zero_float3();
- if (dot(sd->N, omega_in) >= 0.0f) {
+ if (!is_transmission) {
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
@@ -570,13 +558,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -663,13 +645,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -682,7 +658,7 @@ ccl_device_inline
return eval;
}
-ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
+ccl_device void bsdf_blur(const KernelGlobals *kg, ShaderClosure *sc, float roughness)
{
/* ToDo: do we want to blur volume closures? */
#ifdef __SVM__
@@ -715,55 +691,4 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
#endif
}
-ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
-{
-#ifdef __SVM__
- switch (a->type) {
- case CLOSURE_BSDF_TRANSPARENT_ID:
- return true;
- case CLOSURE_BSDF_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_ID:
- case CLOSURE_BSDF_TRANSLUCENT_ID:
- return bsdf_diffuse_merge(a, b);
- case CLOSURE_BSDF_OREN_NAYAR_ID:
- return bsdf_oren_nayar_merge(a, b);
- case CLOSURE_BSDF_REFLECTION_ID:
- case CLOSURE_BSDF_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- return bsdf_microfacet_merge(a, b);
- case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- return bsdf_ashikhmin_velvet_merge(a, b);
- case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- case CLOSURE_BSDF_GLOSSY_TOON_ID:
- return bsdf_toon_merge(a, b);
- case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- return bsdf_hair_merge(a, b);
-# ifdef __PRINCIPLED__
- case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
- return bsdf_principled_diffuse_merge(a, b);
-# endif
-# ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- return volume_henyey_greenstein_merge(a, b);
-# endif
- default:
- return false;
- }
-#else
- return false;
-#endif
-}
-
CCL_NAMESPACE_END
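
bsdf_eval() above now takes an explicit is_transmission flag instead of testing dot(sd->N, omega_in) inside the function. A minimal sketch of how a caller could derive that flag; the dot-product criterion below is an assumption based on the check the parameter replaces, not a definitive restatement of the call sites:

#include <cstdio>

struct float3 {
  float x, y, z;
};

static float dot(const float3 &a, const float3 &b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z;
}

int main()
{
  const float3 N = {0.0f, 0.0f, 1.0f};          /* shading normal */
  const float3 omega_in = {0.3f, 0.0f, -0.95f}; /* direction below the surface */

  const bool is_transmission = dot(N, omega_in) < 0.0f;
  printf("is_transmission = %s\n", is_transmission ? "true" : "false");
  return 0;
}
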
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 9814a7cf5c9..be6383e521a 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -14,20 +14,19 @@
* limitations under the License.
*/
-#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
-#define __BSDF_ASHIKHMIN_SHIRLEY_H__
-
/*
-ASHIKHMIN SHIRLEY BSDF
-
-Implementation of
-Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
-
-The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
-the case with all other microfacet-based BSDF implementations in Cycles.
+ * ASHIKHMIN SHIRLEY BSDF
+ *
+ * Implementation of
+ * Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
+ *
+ * The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
+ * the case with all other microfacet-based BSDF implementations in Cycles.
+ *
+ * Other than that, the implementation directly follows the paper.
+ */
-Other than that, the implementation directly follows the paper.
-*/
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -240,5 +239,3 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 3d3f20edab3..f51027f5701 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -30,8 +30,9 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_ASHIKHMIN_VELVET_H__
-#define __BSDF_ASHIKHMIN_VELVET_H__
+#pragma once
+
+#include "kernel/kernel_montecarlo.h"
CCL_NAMESPACE_BEGIN
@@ -54,14 +55,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_ashikhmin_velvet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const VelvetBsdf *bsdf_a = (const VelvetBsdf *)a;
- const VelvetBsdf *bsdf_b = (const VelvetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->sigma == bsdf_b->sigma);
-}
-
ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -175,5 +168,3 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index ea604ed0311..1555aa30304 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_H__
-#define __BSDF_DIFFUSE_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -49,14 +48,6 @@ ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const DiffuseBsdf *bsdf_a = (const DiffuseBsdf *)a;
- const DiffuseBsdf *bsdf_b = (const DiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N));
-}
-
ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -174,5 +165,3 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index aa62c1c7ceb..b06dd196b9e 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_RAMP_H__
-#define __BSDF_DIFFUSE_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -125,5 +124,3 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 7ca9424b815..f56f78aa1f0 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_HAIR_H__
-#define __BSDF_HAIR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,15 +61,6 @@ ccl_device int bsdf_hair_transmission_setup(HairBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_hair_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HairBsdf *bsdf_a = (const HairBsdf *)a;
- const HairBsdf *bsdf_b = (const HairBsdf *)b;
-
- return (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->roughness1 == bsdf_b->roughness1) &&
- (bsdf_a->roughness2 == bsdf_b->roughness2) && (bsdf_a->offset == bsdf_b->offset);
-}
-
ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -309,5 +299,3 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index f12661b3095..bfe56e5ab0e 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -14,15 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
#ifdef __KERNEL_CPU__
# include <fenv.h>
#endif
#include "kernel/kernel_color.h"
-#ifndef __BSDF_HAIR_PRINCIPLED_H__
-# define __BSDF_HAIR_PRINCIPLED_H__
-
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledHairExtra {
@@ -181,12 +180,12 @@ ccl_device_inline float longitudinal_scattering(
}
/* Combine the three values using their luminances. */
-ccl_device_inline float4 combine_with_energy(KernelGlobals *kg, float3 c)
+ccl_device_inline float4 combine_with_energy(const KernelGlobals *kg, float3 c)
{
return make_float4(c.x, c.y, c.z, linear_rgb_to_gray(kg, c));
}
-# ifdef __HAIR__
+#ifdef __HAIR__
/* Set up the hair closure. */
ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bsdf)
{
@@ -226,10 +225,10 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}
-# endif /* __HAIR__ */
+#endif /* __HAIR__ */
/* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */
-ccl_device_inline void hair_attenuation(KernelGlobals *kg, float f, float3 T, float4 *Ap)
+ccl_device_inline void hair_attenuation(const KernelGlobals *kg, float f, float3 T, float4 *Ap)
{
/* Primary specular (R). */
Ap[0] = make_float4(f, f, f, f);
@@ -278,7 +277,7 @@ ccl_device_inline void hair_alpha_angles(float sin_theta_i,
}
/* Evaluation function for our shader. */
-ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
+ccl_device float3 bsdf_principled_hair_eval(const KernelGlobals *kg,
const ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
@@ -356,7 +355,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
}
/* Sampling function for the hair shader. */
-ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
+ccl_device int bsdf_principled_hair_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
ShaderData *sd,
float randu,
@@ -473,11 +472,11 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
*omega_in = X * sin_theta_i + Y * cos_theta_i * cosf(phi_i) + Z * cos_theta_i * sinf(phi_i);
-# ifdef __RAY_DIFFERENTIALS__
+#ifdef __RAY_DIFFERENTIALS__
float3 N = safe_normalize(sd->I + *omega_in);
*domega_in_dx = (2 * dot(N, sd->dI.dx)) * N - sd->dI.dx;
*domega_in_dy = (2 * dot(N, sd->dI.dy)) * N - sd->dI.dy;
-# endif
+#endif
return LABEL_GLOSSY | ((p == 0) ? LABEL_REFLECT : LABEL_TRANSMIT);
}
@@ -501,7 +500,7 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
}
-ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc)
+ccl_device float3 bsdf_principled_hair_albedo(const ShaderClosure *sc)
{
PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc;
return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
@@ -523,5 +522,3 @@ ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const flo
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_PRINCIPLED_H__ */
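
The Horner expression in bsdf_principled_hair_albedo_roughness_scale() above expands to f(v) = 0.245 v^5 + 5.574 v^4 - 10.73 v^3 + 2.532 v^2 - 0.215 v + 5.969, and bsdf_principled_hair_albedo() returns exp(-sqrt(sigma) * f(v)) per channel. A standalone check of that expansion (the sigma and v values are arbitrary):

#include <cassert>
#include <cmath>
#include <cstdio>

static float roughness_scale_horner(float x)
{
  return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
}

static float roughness_scale_expanded(float v)
{
  return 0.245f * powf(v, 5.0f) + 5.574f * powf(v, 4.0f) - 10.73f * powf(v, 3.0f) +
         2.532f * v * v - 0.215f * v + 5.969f;
}

int main()
{
  const float v = 0.3f;     /* hair roughness */
  const float sigma = 1.2f; /* absorption coefficient, one channel */

  assert(fabsf(roughness_scale_horner(v) - roughness_scale_expanded(v)) < 1e-4f);

  const float albedo = expf(-sqrtf(sigma) * roughness_scale_horner(v));
  printf("albedo = %f\n", albedo);
  return 0;
}
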
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index af03bab39f7..227cb448b47 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -30,8 +30,10 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_MICROFACET_H__
-#define __BSDF_MICROFACET_H__
+#pragma once
+
+#include "kernel/kernel_lookup_table.h"
+#include "kernel/kernel_random.h"
CCL_NAMESPACE_BEGIN
@@ -53,7 +55,7 @@ static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf i
/* Beckmann and GGX microfacet importance sampling. */
-ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals *kg,
+ccl_device_inline void microfacet_beckmann_sample_slopes(const KernelGlobals *kg,
const float cos_theta_i,
const float sin_theta_i,
float randu,
@@ -193,7 +195,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i,
*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x));
}
-ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals *kg,
+ccl_device_forceinline float3 microfacet_sample_stretched(const KernelGlobals *kg,
const float3 omega_i,
const float alpha_x,
const float alpha_y,
@@ -352,21 +354,6 @@ ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const S
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf *)a;
- const MicrofacetBsdf *bsdf_b = (const MicrofacetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->alpha_x == bsdf_b->alpha_x) &&
- (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
- (bsdf_a->ior == bsdf_b->ior) &&
- ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
- ((bsdf_a->extra && bsdf_b->extra) &&
- (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) &&
- (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) &&
- (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat)));
-}
-
ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
{
bsdf->extra = NULL;
@@ -558,7 +545,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc,
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -986,7 +973,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_beckmann_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -1175,5 +1162,3 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_MICROFACET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 9795c8da065..68d5071dbce 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Most of the code is based on the supplemental implementations from
@@ -466,7 +468,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -628,7 +630,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_glass_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 41e5736bf49..be12d47f0ea 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BSDF_OREN_NAYAR_H__
-#define __BSDF_OREN_NAYAR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -61,14 +60,6 @@ ccl_device int bsdf_oren_nayar_setup(OrenNayarBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_oren_nayar_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const OrenNayarBsdf *bsdf_a = (const OrenNayarBsdf *)a;
- const OrenNayarBsdf *bsdf_b = (const OrenNayarBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -127,5 +118,3 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_OREN_NAYAR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index cf5484383f2..43f8cf71c59 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_PHONG_RAMP_H__
-#define __BSDF_PHONG_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -153,5 +152,3 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PHONG_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index d5d012068ff..a72af519482 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
-#define __BSDF_PRINCIPLED_DIFFUSE_H__
+#pragma once
/* DISNEY PRINCIPLED DIFFUSE BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledDiffuseBsdf {
@@ -61,14 +62,6 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf *)a;
- const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -136,5 +129,3 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 3707de29d73..60ce7e4eb75 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_SHEEN_H__
-#define __BSDF_PRINCIPLED_SHEEN_H__
+#pragma once
/* DISNEY PRINCIPLED SHEEN BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
@@ -137,5 +138,3 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index c24ba170915..31283971d5a 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFLECTION_H__
-#define __BSDF_REFLECTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -93,5 +92,3 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFLECTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index d4fbe86dac0..cfedb5dfe2c 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFRACTION_H__
-#define __BSDF_REFRACTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -111,5 +110,3 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFRACTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index cc5de21ed0e..acdafe0f735 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TOON_H__
-#define __BSDF_TOON_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -55,15 +54,6 @@ ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_toon_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const ToonBsdf *bsdf_a = (const ToonBsdf *)a;
- const ToonBsdf *bsdf_b = (const ToonBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->size == bsdf_b->size) &&
- (bsdf_a->smooth == bsdf_b->smooth);
-}
-
ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
{
float is;
@@ -248,5 +238,3 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TOON_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 4e5513499e8..f1dc7efb345 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TRANSPARENT_H__
-#define __BSDF_TRANSPARENT_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -123,5 +122,3 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TRANSPARENT_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index a73dee1b045..beec5f768a1 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_UTIL_H__
-#define __BSDF_UTIL_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -150,5 +149,3 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 562daf1286d..0f9278bba89 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_BSSRDF_H__
-#define __KERNEL_BSSRDF_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,310 +23,71 @@ typedef ccl_addr_space struct Bssrdf {
float3 radius;
float3 albedo;
- float sharpness;
- float texture_blur;
float roughness;
- float channels;
+ float anisotropy;
} Bssrdf;
static_assert(sizeof(ShaderClosure) >= sizeof(Bssrdf), "Bssrdf is too large!");
-/* Planar Truncated Gaussian
- *
- * Note how this is different from the typical gaussian, this one integrates
- * to 1 over the plane (where you get an extra 2*pi*x factor). We are lucky
- * that integrating x*exp(-x) gives a nice closed form solution. */
-
-/* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */
-#define GAUSS_TRUNCATE 12.46f
-
-ccl_device float bssrdf_gaussian_eval(const float radius, float r)
-{
- /* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm
- * = 1 - exp(-Rm*Rm/(2*v)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- if (r >= Rm)
- return 0.0f;
-
- return expf(-r * r / (2.0f * v)) / (2.0f * M_PI_F * v);
-}
-
-ccl_device float bssrdf_gaussian_pdf(const float radius, float r)
+ccl_device float bssrdf_dipole_compute_Rd(float alpha_prime, float fourthirdA)
{
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- return bssrdf_gaussian_eval(radius, r) * (1.0f / (area_truncated));
+ float s = sqrtf(3.0f * (1.0f - alpha_prime));
+ return 0.5f * alpha_prime * (1.0f + expf(-fourthirdA * s)) * expf(-s);
}
-ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h)
+ccl_device float bssrdf_dipole_compute_alpha_prime(float rd, float fourthirdA)
{
- /* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v))
- * r = sqrt(-2*v*logf(xi)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- /* r(xi) */
- const float r_squared = -2.0f * v * logf(1.0f - xi * area_truncated);
- *r = sqrtf(r_squared);
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_squared);
-}
-
-/* Planar Cubic BSSRDF falloff
- *
- * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
- * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
- * far as I can tell has no closed form solution. So we get an iterative solution
- * instead with newton-raphson. */
-
-ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r)
-{
- if (sharpness == 0.0f) {
- const float Rm = radius;
-
- if (r >= Rm)
- return 0.0f;
-
- /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */
- const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
- const float f = Rm - r;
- const float num = f * f * f;
-
- return (10.0f * num) / (Rm5 * M_PI_F);
+  /* Little bisection solver. */
+ if (rd < 1e-4f) {
+ return 0.0f;
+ }
+ if (rd >= 0.995f) {
+ return 0.999999f;
}
- else {
- float Rm = radius * (1.0f + sharpness);
-
- if (r >= Rm)
- return 0.0f;
- /* custom variation with extra sharpness, to match the previous code */
- const float y = 1.0f / (1.0f + sharpness);
- float Rmy, ry, ryinv;
+ float x0 = 0.0f;
+ float x1 = 1.0f;
+ float xmid, fmid;
- if (sharpness == 1.0f) {
- Rmy = sqrtf(Rm);
- ry = sqrtf(r);
- ryinv = (ry > 0.0f) ? 1.0f / ry : 0.0f;
+ constexpr const int max_num_iterations = 12;
+ for (int i = 0; i < max_num_iterations; ++i) {
+ xmid = 0.5f * (x0 + x1);
+ fmid = bssrdf_dipole_compute_Rd(xmid, fourthirdA);
+ if (fmid < rd) {
+ x0 = xmid;
}
else {
- Rmy = powf(Rm, y);
- ry = powf(r, y);
- ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f;
+ x1 = xmid;
}
-
- const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy;
- const float f = Rmy - ry;
- const float num = f * (f * f) * (y * ryinv);
-
- return (10.0f * num) / (Rmy5 * M_PI_F);
- }
-}
-
-ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r)
-{
- return bssrdf_cubic_eval(radius, sharpness, r);
-}
-
-/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
-{
- /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
- * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
- * should not be too bad */
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- float x = 0.25f;
- int i;
-
- for (i = 0; i < max_iteration_count; i++) {
- float x2 = x * x;
- float x3 = x2 * x;
- float nx = (1.0f - x);
-
- float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
- float f_ = 20.0f * (x * nx) * (nx * nx);
-
- if (fabsf(f) < tolerance || f_ == 0.0f)
- break;
-
- x = saturate(x - f / f_);
}
- return x;
+ return xmid;
}
-ccl_device void bssrdf_cubic_sample(
- const float radius, const float sharpness, float xi, float *r, float *h)
+ccl_device void bssrdf_setup_radius(Bssrdf *bssrdf, const ClosureType type, const float eta)
{
- float Rm = radius;
- float r_ = bssrdf_cubic_quintic_root_find(xi);
-
- if (sharpness != 0.0f) {
- r_ = powf(r_, 1.0f + sharpness);
- Rm *= (1.0f + sharpness);
- }
-
- r_ *= Rm;
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Approximate Reflectance Profiles
- * http://graphics.pixar.com/library/ApproxBSSRDF/paper.pdf
- */
-
-/* This is a bit arbitrary, just need big enough radius so it matches
- * the mean free length, but still not too big so sampling is still
- * effective. Might need some further tweaks.
- */
-#define BURLEY_TRUNCATE 16.0f
-#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE)
-
-ccl_device_inline float bssrdf_burley_fitting(float A)
-{
- /* Diffuse surface transmission, equation (6). */
- return 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
-}
-
-/* Scale mean free path length so it gives similar looking result
- * to Cubic and Gaussian models.
- */
-ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r)
-{
- return 0.25f * M_1_PI_F * r;
-}
-
-ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf)
-{
- /* Mean free path length. */
- const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius);
- /* Surface albedo. */
- const float3 A = bssrdf->albedo;
- const float3 s = make_float3(
- bssrdf_burley_fitting(A.x), bssrdf_burley_fitting(A.y), bssrdf_burley_fitting(A.z));
-
- bssrdf->radius = l / s;
-}
-
-ccl_device float bssrdf_burley_eval(const float d, float r)
-{
- const float Rm = BURLEY_TRUNCATE * d;
-
- if (r >= Rm)
- return 0.0f;
-
- /* Burley reflectance profile, equation (3).
- *
- * NOTES:
- * - Surface albedo is already included into sc->weight, no need to
- * multiply by this term here.
- * - This is normalized diffuse model, so the equation is multiplied
- * by 2*pi, which also matches cdf().
- */
- float exp_r_3_d = expf(-r / (3.0f * d));
- float exp_r_d = exp_r_3_d * exp_r_3_d * exp_r_3_d;
- return (exp_r_d + exp_r_3_d) / (4.0f * d);
-}
-
-ccl_device float bssrdf_burley_pdf(const float d, float r)
-{
- return bssrdf_burley_eval(d, r) * (1.0f / BURLEY_TRUNCATE_CDF);
-}
-
-/* Find the radius for desired CDF value.
- * Returns scaled radius, meaning the result is to be scaled up by d.
- * Since there's no closed form solution we do Newton-Raphson method to find it.
- */
-ccl_device_forceinline float bssrdf_burley_root_find(float xi)
-{
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- /* Do initial guess based on manual curve fitting, this allows us to reduce
- * number of iterations to maximum 4 across the [0..1] range. We keep maximum
- * number of iteration higher just to be sure we didn't miss root in some
- * corner case.
- */
- float r;
- if (xi <= 0.9f) {
- r = expf(xi * xi * 2.4f) - 1.0f;
+ if (type == CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) {
+    /* Scale mean free path length so it gives a similar looking result to the
+     * older Cubic, Gaussian and Burley models. */
+ bssrdf->radius *= 0.25f * M_1_PI_F;
}
else {
- /* TODO(sergey): Some nicer curve fit is possible here. */
- r = 15.0f;
- }
- /* Solve against scaled radius. */
- for (int i = 0; i < max_iteration_count; i++) {
- float exp_r_3 = expf(-r / 3.0f);
- float exp_r = exp_r_3 * exp_r_3 * exp_r_3;
- float f = 1.0f - 0.25f * exp_r - 0.75f * exp_r_3 - xi;
- float f_ = 0.25f * exp_r + 0.25f * exp_r_3;
+ /* Adjust radius based on IOR and albedo. */
+ const float inv_eta = 1.0f / eta;
+ const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta;
+ const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) /
+ (1.0f - F_dr); /* From Jensen's Fdr ratio formula. */
- if (fabsf(f) < tolerance || f_ == 0.0f) {
- break;
- }
+ const float3 alpha_prime = make_float3(
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.x, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.y, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.z, fourthirdA));
- r = r - f / f_;
- if (r < 0.0f) {
- r = 0.0f;
- }
+ bssrdf->radius *= sqrt(3.0f * (one_float3() - alpha_prime));
}
- return r;
}
-ccl_device void bssrdf_burley_sample(const float d, float xi, float *r, float *h)
-{
- const float Rm = BURLEY_TRUNCATE * d;
- const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* None BSSRDF falloff
- *
- * Samples distributed over disk with no falloff, for reference. */
-
-ccl_device float bssrdf_none_eval(const float radius, float r)
-{
- const float Rm = radius;
- return (r < Rm) ? 1.0f : 0.0f;
-}
-
-ccl_device float bssrdf_none_pdf(const float radius, float r)
-{
- /* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */
- const float Rm = radius;
- const float area = (M_PI_F * Rm * Rm);
-
- return bssrdf_none_eval(radius, r) / area;
-}
-
-ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h)
-{
- /* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2
- * r = sqrt(xi)*Rm */
- const float Rm = radius;
- const float r_ = sqrtf(xi) * Rm;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Generic */
+/* Setup */
ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
{
@@ -342,7 +102,7 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL;
}
-ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
+ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type, const float ior)
{
int flag = 0;
int bssrdf_channels = 3;
@@ -371,7 +131,7 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
if (bssrdf_channels < 3) {
/* Add diffuse BSDF if any radius too small. */
#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
+ if (bssrdf->roughness != FLT_MAX) {
float roughness = bssrdf->roughness;
float3 N = bssrdf->N;
@@ -401,16 +161,9 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
/* Setup BSSRDF if radius is large enough. */
if (bssrdf_channels > 0) {
bssrdf->type = type;
- bssrdf->channels = bssrdf_channels;
- bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels;
- bssrdf->texture_blur = saturate(bssrdf->texture_blur);
- bssrdf->sharpness = saturate(bssrdf->sharpness);
+ bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf_channels;
- if (type == CLOSURE_BSSRDF_BURLEY_ID || type == CLOSURE_BSSRDF_PRINCIPLED_ID ||
- type == CLOSURE_BSSRDF_RANDOM_WALK_ID ||
- type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- bssrdf_burley_setup(bssrdf);
- }
+ bssrdf_setup_radius(bssrdf, type, ior);
flag |= SD_BSSRDF;
}
@@ -422,77 +175,4 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
return flag;
}
-ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float radius;
-
- /* Sample color channel and reuse random number. Only a subset of channels
- * may be used if their radius was too small to handle as BSSRDF. */
- xi *= bssrdf->channels;
-
- if (xi < 1.0f) {
- radius = (bssrdf->radius.x > 0.0f) ? bssrdf->radius.x :
- (bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else if (xi < 2.0f) {
- xi -= 1.0f;
- radius = (bssrdf->radius.x > 0.0f && bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else {
- xi -= 2.0f;
- radius = bssrdf->radius.z;
- }
-
- /* Sample BSSRDF. */
- if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- bssrdf_gaussian_sample(radius, xi, r, h);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) */
- bssrdf_burley_sample(radius, xi, r, h);
- }
-}
-
-ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r)
-{
- if (radius == 0.0f) {
- return 0.0f;
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- return bssrdf_gaussian_pdf(radius, r);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
- return bssrdf_burley_pdf(radius, r);
- }
-}
-
-ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
-
- return make_float3(bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r));
-}
-
-ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float3 pdf = bssrdf_eval(sc, r);
-
- return (pdf.x + pdf.y + pdf.z) / bssrdf->channels;
-}
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_BSSRDF_H__ */
diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h
index 911382e6865..a2519d97618 100644
--- a/intern/cycles/kernel/closure/emissive.h
+++ b/intern/cycles/kernel/closure/emissive.h
@@ -30,6 +30,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* BACKGROUND CLOSURE */
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 1430f712701..69959a3f21b 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __VOLUME_H__
-#define __VOLUME_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,21 +61,12 @@ ccl_device int volume_henyey_greenstein_setup(HenyeyGreensteinVolume *volume)
return SD_SCATTER;
}
-ccl_device bool volume_henyey_greenstein_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HenyeyGreensteinVolume *volume_a = (const HenyeyGreensteinVolume *)a;
- const HenyeyGreensteinVolume *volume_b = (const HenyeyGreensteinVolume *)b;
-
- return (volume_a->g == volume_b->g);
-}
-
-ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc,
+ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderVolumeClosure *svc,
const float3 I,
float3 omega_in,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer */
if (fabsf(g) < 1e-3f) {
@@ -122,7 +112,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pd
return dir;
}
-ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
+ccl_device int volume_henyey_greenstein_sample(const ShaderVolumeClosure *svc,
float3 I,
float3 dIdx,
float3 dIdy,
@@ -134,8 +124,7 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
float3 *domega_in_dy,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer and so is used negated */
*omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf);
@@ -153,17 +142,15 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
/* VOLUME CLOSURE */
ccl_device float3 volume_phase_eval(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float3 omega_in,
float *pdf)
{
- kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID);
-
- return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+ return volume_henyey_greenstein_eval_phase(svc, sd->I, omega_in, pdf);
}
ccl_device int volume_phase_sample(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float randu,
float randv,
float3 *eval,
@@ -171,31 +158,65 @@ ccl_device int volume_phase_sample(const ShaderData *sd,
differential3 *domega_in,
float *pdf)
{
- int label;
-
- switch (sc->type) {
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
- default:
- *eval = make_float3(0.0f, 0.0f, 0.0f);
- label = LABEL_NONE;
- break;
+ return volume_henyey_greenstein_sample(svc,
+ sd->I,
+ sd->dI.dx,
+ sd->dI.dy,
+ randu,
+ randv,
+ eval,
+ omega_in,
+ &domega_in->dx,
+ &domega_in->dy,
+ pdf);
+}
+
+/* Volume sampling utilities. */
+
+/* TODO: this value could be tweaked or turned into a probability to avoid
+ * unnecessary work in volumes and subsurface scattering. */
+#define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+ccl_device float3 volume_color_transmittance(float3 sigma, float t)
+{
+ return exp3(-sigma * t);
+}
+
+ccl_device float volume_channel_get(float3 value, int channel)
+{
+ return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
+}
+
+ccl_device int volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf)
+{
+ /* Sample color channel proportional to throughput and single scattering
+ * albedo, to significantly reduce noise with many bounces, following:
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+ float3 weights = fabs(throughput * albedo);
+ float sum_weights = weights.x + weights.y + weights.z;
+ float3 weights_pdf;
+
+ if (sum_weights > 0.0f) {
+ weights_pdf = weights / sum_weights;
}
+ else {
+ weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
+ }
+
+ *pdf = weights_pdf;
- return label;
+ /* OpenCL does not support -> on float3, so don't use pdf->x. */
+ if (rand < weights_pdf.x) {
+ return 0;
+ }
+ else if (rand < weights_pdf.x + weights_pdf.y) {
+ return 1;
+ }
+ else {
+ return 2;
+ }
}
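A usage sketch (editor's addition, not in the patch) of how the channel index and pdf returned by volume_sample_channel() typically drive single-channel distance sampling; sigma_t, albedo, throughput and the random numbers are assumed locals of the calling integrator code:

  float3 channel_pdf;
  const int channel = volume_sample_channel(albedo, throughput, rand_channel, &channel_pdf);
  /* Exponential free-flight sampling using the chosen channel's extinction coefficient. */
  const float sigma = volume_channel_get(sigma_t, channel);
  const float t = -logf(1.0f - rand_distance) / sigma;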
CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/device/cpu/compat.h
index 88f6a264a5a..bfd936c7bbd 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CPU_H__
-#define __KERNEL_COMPAT_CPU_H__
+#pragma once
#define __KERNEL_CPU__
@@ -27,14 +26,6 @@
# pragma GCC diagnostic ignored "-Wuninitialized"
#endif
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#include "util/util_half.h"
#include "util/util_math.h"
#include "util/util_simd.h"
@@ -43,15 +34,6 @@
#define ccl_addr_space
-#define ccl_local_id(d) 0
-#define ccl_global_id(d) (kg->global_id[d])
-
-#define ccl_local_size(d) 1
-#define ccl_global_size(d) (kg->global_size[d])
-
-#define ccl_group_id(d) ccl_global_id(d)
-#define ccl_num_groups(d) ccl_global_size(d)
-
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
*/
@@ -72,37 +54,11 @@ CCL_NAMESPACE_BEGIN
* simple arrays and after inlining fetch hopefully revert to being a simple
* pointer lookup. */
template<typename T> struct texture {
- ccl_always_inline const T &fetch(int index)
+ ccl_always_inline const T &fetch(int index) const
{
kernel_assert(index >= 0 && index < width);
return data[index];
}
-#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
- /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
- * compatibility with existing indices and data structures.
- */
- ccl_always_inline avxf fetch_avxf(const int index)
- {
- kernel_assert(index >= 0 && (index + 1) < width);
- ssef *ssef_data = (ssef *)data;
- ssef *ssef_node_data = &ssef_data[index];
- return _mm256_loadu_ps((float *)ssef_node_data);
- }
-#endif
-
-#ifdef __KERNEL_SSE2__
- ccl_always_inline ssef fetch_ssef(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssef *)data)[index];
- }
-
- ccl_always_inline ssei fetch_ssei(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssei *)data)[index];
- }
-#endif
T *data;
int width;
@@ -110,15 +66,6 @@ template<typename T> struct texture {
/* Macros to handle different memory storage on different devices */
-#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
-#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
-#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
-#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
-#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
-#define kernel_tex_array(tex) (kg->tex.data)
-
-#define kernel_data (kg->__data)
-
#ifdef __KERNEL_SSE2__
typedef vector3<sseb> sse3b;
typedef vector3<ssef> sse3f;
@@ -152,5 +99,3 @@ typedef vector3<avxf> avx3f;
#endif
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..98b036e269d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
+ * the kernel, to access constant data. These are all stored as "textures", but
+ * these are really just standard arrays. We can't actually use globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobals {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+ /* On the CPU, we also have the OSL globals here. Most data structures are shared
+ * with SVM; the difference is in the shaders and object/mesh attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
+
+CCL_NAMESPACE_END
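A brief illustration of how the KERNEL_TEX plumbing above expands (editor's sketch; the entry name is hypothetical):

/* Given a hypothetical entry in kernel/kernel_textures.h:
 *   KERNEL_TEX(float4, __example_data)
 * the KernelGlobals struct above gains the member
 *   texture<float4> __example_data;
 * and kernel_tex_fetch(__example_data, i) expands to kg->__example_data.fetch(i),
 * a plain bounds-checked array lookup on the CPU. */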
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/device/cpu/image.h
index 59b96c86c50..57e81ab186d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_CPU_IMAGE_H__
-#define __KERNEL_CPU_IMAGE_H__
+#pragma once
#ifdef WITH_NANOVDB
# define NANOVDB_USE_INTRINSICS
@@ -584,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator {
#undef SET_CUBIC_SPLINE_WEIGHTS
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -612,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -656,5 +655,3 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
} /* Namespace. */
CCL_NAMESPACE_END
-
-#endif // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index 8040bfb7b33..ac1cdf5fffe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -56,9 +56,9 @@
/* do nothing */
#endif
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index b907c6a2bac..ae2a841835a 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -14,50 +14,49 @@
* limitations under the License.
*/
-#ifndef __KERNEL_H__
-#define __KERNEL_H__
+#pragma once
/* CPU Kernel Interface */
-#include "kernel/kernel_types.h"
#include "util/util_types.h"
+#include "kernel/kernel_types.h"
+
CCL_NAMESPACE_BEGIN
#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
+struct IntegratorStateCPU;
struct KernelGlobals;
struct KernelData;
KernelGlobals *kernel_globals_create();
void kernel_globals_free(KernelGlobals *kg);
-void *kernel_osl_memory(KernelGlobals *kg);
-bool kernel_osl_use(KernelGlobals *kg);
+void *kernel_osl_memory(const KernelGlobals *kg);
+bool kernel_osl_use(const KernelGlobals *kg);
void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..81f328c710b
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
+#undef KERNEL_ARCH
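A worked expansion of the name-mangling macros (editor's sketch, not part of the patch): with KERNEL_ARCH defined as cpu_avx2,

KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);

/* expands, via KERNEL_FUNCTION_FULL_NAME and KERNEL_NAME_JOIN, to: */

void kernel_cpu_avx2_integrator_shade_surface(const KernelGlobals *ccl_restrict kg,
                                              IntegratorStateCPU *state,
                                              ccl_global float *render_buffer);

so each SIMD variant gets its own symbol from the same templated headers.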
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..1432abfd330
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that each particular .cpp file sets the needed optimization flags and
+ * simply includes this file, without worrying about copying the actual implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so
+ * that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#if 0
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+# ifdef __BAKING__
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif
+# endif /* KERNEL_STUB */
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
index 5f6b6800363..220768036ab 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -34,6 +34,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
index 97e8fc25140..90c05113cbe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -35,6 +35,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
index 26d7fd4de48..fb85ef5b0d0 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -29,6 +29,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
index 3f259aa4480..87baf04258a 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -31,6 +31,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
index 68bae8c07c6..bb421d58815 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -32,6 +32,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/device/cuda/compat.h
index ea3b78b7cef..665da43e1a1 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -14,20 +14,15 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CUDA_H__
-#define __KERNEL_COMPAT_CUDA_H__
+#pragma once
#define __KERNEL_GPU__
#define __KERNEL_CUDA__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
#endif
/* Manual definitions so we can compile without CUDA toolkit. */
@@ -38,8 +33,6 @@ typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -47,14 +40,7 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Qualifier wrappers for different names on different devices */
+/* Qualifiers */
#define ccl_device __device__ __inline__
#if __CUDA_ARCH__ < 500
@@ -68,104 +54,61 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local __shared__
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
-/* TODO(sergey): In theory we might use references with CUDA, however
- * performance impact yet to be investigated.
- */
-#define ccl_ref
#define ccl_align(n) __align__(n)
#define ccl_optional_struct_init
-#define ATTR_FALLTHROUGH
-
-#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH)
-
/* No assert supported for CUDA */
#define kernel_assert(cond)
-/* Types */
+/* GPU thread, block, grid size and index */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
-/* Work item functions */
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-ccl_device_inline uint ccl_local_id(uint d)
-{
- switch (d) {
- case 0:
- return threadIdx.x;
- case 1:
- return threadIdx.y;
- case 2:
- return threadIdx.z;
- default:
- return 0;
- }
-}
+/* GPU warp synchronization */
-#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
-ccl_device_inline uint ccl_local_size(uint d)
-{
- switch (d) {
- case 0:
- return blockDim.x;
- case 1:
- return blockDim.y;
- case 2:
- return blockDim.z;
- default:
- return 0;
- }
-}
+/* GPU texture objects */
-#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
-ccl_device_inline uint ccl_group_id(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
{
- switch (d) {
- case 0:
- return blockIdx.x;
- case 1:
- return blockIdx.y;
- case 2:
- return blockIdx.z;
- default:
- return 0;
- }
+ return tex2D<T>(texobj, x, y);
}
-ccl_device_inline uint ccl_num_groups(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
{
- switch (d) {
- case 0:
- return gridDim.x;
- case 1:
- return gridDim.y;
- case 2:
- return gridDim.z;
- default:
- return 0;
- }
+ return tex3D<T>(texobj, x, y, z);
}
-/* Textures */
-
-/* Use arrays for regular data. */
-#define kernel_tex_fetch(t, index) t[(index)]
-#define kernel_tex_array(t) (t)
-
-#define kernel_data __data
-
/* Use fast math functions */
#define cosf(x) __cosf(((float)(x)))
@@ -175,4 +118,18 @@ ccl_device_inline uint ccl_num_groups(uint d)
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))
-#endif /* __KERNEL_COMPAT_CUDA_H__ */
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
new file mode 100644
index 00000000000..46196dcdb51
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from CUDA occupancy calculator.
+ *
+ * Terminology
+ * - CUDA GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ * used by each thread limits the number of threads per block.
+ */
+
+/* 3.0 and 3.5 */
+#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 5.x, 6.x */
+#elif __CUDA_ARCH__ <= 699
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
+ * registers */
+# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
+# define GPU_KERNEL_MAX_REGISTERS 64
+# else
+# define GPU_KERNEL_MAX_REGISTERS 48
+# endif
+
+/* 7.x, 8.x */
+#elif __CUDA_ARCH__ <= 899
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 512
+# define GPU_KERNEL_MAX_REGISTERS 96
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads, \
+ GPU_MULTIPRESSOR_MAX_REGISTERS / \
+ (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+ GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
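A worked example of the launch-bounds arithmetic above (editor's note, not part of the patch): on a 7.x/8.x device,

/* ccl_gpu_kernel(512, 96) expands to
 *   extern "C" __global__ void __launch_bounds__(512, 65536 / (512 * 96))
 * i.e. __launch_bounds__(512, 1): at least one 512-thread block must remain
 * resident per multiprocessor, which caps each thread at 65536 / 512 = 128
 * registers; the 96-register target leaves the compiler some headroom. */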
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
new file mode 100644
index 00000000000..169047175f5
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/device/cuda/kernel.cu
index 84938b889e5..e26fe243642 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,15 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
+/* CUDA kernel entry points */
-#define KERNEL_NAME indirect_subsurface
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#ifdef __CUDA_ARCH__
+# include "kernel/device/cuda/compat.h"
+# include "kernel/device/cuda/config.h"
+# include "kernel/device/cuda/globals.h"
+
+# include "kernel/device/gpu/image.h"
+# include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/device/gpu/image.h
index 132653fa7ca..b015c78a8f5 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
@@ -61,9 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -81,15 +85,18 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f
float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
- return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) +
- cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1));
+ return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) +
+ cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1));
}
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
-ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+ccl_device_noinline T
+kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -117,10 +124,14 @@ ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x,
float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
- return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) +
- g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1)));
+ return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) +
+ g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1)));
}
#ifdef WITH_NANOVDB
@@ -157,7 +168,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
}
template<typename T>
-ccl_device_inline T kernel_tex_image_interp_nanovdb(
+ccl_device_noinline T kernel_tex_image_interp_nanovdb(
const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -178,7 +189,7 @@ ccl_device_inline T kernel_tex_image_interp_nanovdb(
}
#endif
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -190,8 +201,8 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
return kernel_tex_image_interp_bicubic<float4>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex2D<float4>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_2D<float4>(tex, x, y);
}
}
/* float, byte and half */
@@ -202,15 +213,15 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
f = kernel_tex_image_interp_bicubic<float>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex2D<float>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_2D<float>(tex, x, y);
}
return make_float4(f, f, f, 1.0f);
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -245,8 +256,8 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
return kernel_tex_image_interp_tricubic<float4>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex3D<float4>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z);
}
}
else {
@@ -256,10 +267,12 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
f = kernel_tex_image_interp_tricubic<float>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex3D<float>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z);
}
return make_float4(f, f, f, 1.0f);
}
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
new file mode 100644
index 00000000000..7b79c0aedfa
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -0,0 +1,843 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Common GPU kernels. */
+
+#include "kernel/device/gpu/parallel_active_index.h"
+#include "kernel/device/gpu/parallel_prefix_sum.h"
+#include "kernel/device/gpu/parallel_sorted_index.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_init_from_bake.h"
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_work_stealing.h"
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_reset(int num_states)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ if (state < num_states) {
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_closest(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_shadow(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_subsurface(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_volume_stack(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_background(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_background(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_light(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_light(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_shadow(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_shadow(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_volume(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_volume(NULL, state, render_buffer);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == 0) &&
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_sorted_paths_array(
+ int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+{
+ gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ?
+ INTEGRATOR_STATE(path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) &&
+ ((INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0));
+ });
+}
+
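+/* Compact the path states: every active state that lives beyond the compacted
+ * range is moved into the slot of a terminated state inside the range, using
+ * index arrays typically gathered by the kernels above. */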
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_state_move(to_state, from_state);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_prefix_sum(int *values, int num_values)
+{
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ uint *num_active_pixels)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / sw;
+ const int x = work_index - y * sw;
+
+ bool converged = true;
+
+ if (x < sw && y < sh) {
+ converged = kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_x(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int y = ccl_gpu_global_id_x();
+
+ if (y < sh) {
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_y(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int x = ccl_gpu_global_id_x();
+
+ if (x < sw) {
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+{
+ const int pixel_index = ccl_gpu_global_id_x();
+
+ if (pixel_index < num_pixels) {
+ kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Film.
+ */
+
+/* Common implementation for float destination. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
+ float *pixels,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int dst_offset,
+ int dst_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+ ccl_global float *pixel = pixels +
+ (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+}
+
+/* Common implementation for half4 destination and 4-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ const int x = render_pixel_index % width;
+ const int y = render_pixel_index / width;
+
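+  /* The destination is declared as uchar4 but actually holds half4 pixels. */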
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
+ float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+}
+
+/* Common implementation for half4 destination and 3-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+/* Common implementation for half4 destination and single channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float value;
+ processor(kfilm_convert, buffer, &value);
+
+ pixel_rgba[0] = value;
+ pixel_rgba[1] = value;
+ pixel_rgba[2] = value;
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+#define KERNEL_FILM_CONVERT_PROC(name) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
+
+#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
+ (const KernelFilmConvert kfilm_convert, \
+ float *pixels, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_common(&kfilm_convert, \
+ pixels, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ } \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
+ (const KernelFilmConvert kfilm_convert, \
+ uchar4 *rgba, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
+ rgba, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ }
+
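+/* Each KERNEL_FILM_CONVERT_DEFINE(variant, channels) invocation below expands
+ * into two kernels: kernel_gpu_film_convert_<variant>, which writes float
+ * pixels, and kernel_gpu_film_convert_<variant>_half_rgba, which writes half4
+ * pixels. Both forward the per-pixel work to film_get_pass_pixel_<variant>. */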
+KERNEL_FILM_CONVERT_DEFINE(depth, value)
+KERNEL_FILM_CONVERT_DEFINE(mist, value)
+KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
+KERNEL_FILM_CONVERT_DEFINE(float, value)
+
+KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
+KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
+
+KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
+KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
+KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
+KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
+
+#undef KERNEL_FILM_CONVERT_DEFINE
+#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
+#undef KERNEL_FILM_CONVERT_PROC
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+/* Displacement */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_displace_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* Background Shader Evaluation */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_background_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_preprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float *color_out = buffer + pass_denoised;
+ color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
+ color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
+ color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
+ const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+
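+  /* Guiding passes are accumulated over samples; normalize them to per-sample
+   * averages using the per-pixel sample count, or the overall number of
+   * samples when no sample-count pass is available. */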
+ float pixel_scale;
+ if (render_pass_sample_count == PASS_UNUSED) {
+ pixel_scale = 1.0f / num_samples;
+ }
+ else {
+ pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
+ }
+
+ /* Albedo pass. */
+ if (guiding_pass_albedo != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
+
+    const float *albedo_in = buffer + render_pass_denoising_albedo;
+    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+    albedo_out[0] = albedo_in[0] * pixel_scale;
+    albedo_out[1] = albedo_in[1] * pixel_scale;
+    albedo_out[2] = albedo_in[2] * pixel_scale;
+ }
+
+ /* Normal pass. */
+  if (guiding_pass_normal != PASS_UNUSED) {
+    kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
+
+ const float *normal_in = buffer + render_pass_denoising_normal;
+ float *normal_out = guiding_pixel + guiding_pass_normal;
+
+ normal_out[0] = normal_in[0] * pixel_scale;
+ normal_out[1] = normal_in[1] * pixel_scale;
+ normal_out[2] = normal_in[2] * pixel_scale;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
+{
+ kernel_assert(guiding_pass_albedo != PASS_UNUSED);
+
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = 0.5f;
+ albedo_out[1] = 0.5f;
+ albedo_out[2] = 0.5f;
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_postprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
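+  /* Scale the denoised color by the effective sample count so that it matches
+   * the accumulation convention of the other passes in the render buffer. */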
+ float pixel_scale;
+ if (pass_sample_count == PASS_UNUSED) {
+ pixel_scale = num_samples;
+ }
+ else {
+ pixel_scale = __float_as_uint(buffer[pass_sample_count]);
+ }
+
+ float *denoised_pixel = buffer + pass_denoised;
+
+ denoised_pixel[0] *= pixel_scale;
+ denoised_pixel[1] *= pixel_scale;
+ denoised_pixel[2] *= pixel_scale;
+
+ if (num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!use_compositing) {
+    /* Currently compositing passes are either 3-component (derived by dividing light passes)
+     * or have no transparency (shadow catcher). We implicitly rely on this, since it keeps
+     * the logic simple and avoids an extra memory allocation. */
+ const float *noisy_pixel = buffer + pass_noisy;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+    /* Assign zero, since this is the default alpha value for 3-component passes, and an
+     * opaque pixel for 4-component passes. */
+
+ denoised_pixel[3] = 0;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
+ uint *num_possible_splits)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ bool can_split = false;
+
+ if (state < num_states) {
+ can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ }
+}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
new file mode 100644
index 00000000000..85500bf4d07
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active.
+ *
+ * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */
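+/* For example, for states with active flags [1, 0, 0, 1, 1, 0, 1, 0] a single
+ * block appends the indices {0, 3, 4, 6} to the array and atomically adds 4 to
+ * num_indices. With multiple blocks, the relative order of indices produced by
+ * different blocks is not deterministic. */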
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename IsActiveOp>
+__device__ void gpu_parallel_active_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ IsActiveOp is_active_op)
+{
+ extern ccl_gpu_shared int warp_offset[];
+
+ const uint thread_index = ccl_gpu_thread_idx_x;
+ const uint thread_warp = thread_index % ccl_gpu_warp_size;
+
+ const uint warp_index = thread_index / ccl_gpu_warp_size;
+ const uint num_warps = blocksize / ccl_gpu_warp_size;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
new file mode 100644
index 00000000000..f609520b8b4
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel prefix sum.
+ *
+ * TODO: actually make this work in parallel.
+ *
+ * This is used for an array whose size is the number of shaders in the scene,
+ * which is usually not huge, so it might not be a significant bottleneck. */
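+/* For example, the values [3, 1, 4] are rewritten in place to the exclusive
+ * prefix sum [0, 3, 4]. */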
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+{
+ if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ return;
+ }
+
+ int offset = 0;
+ for (int i = 0; i < num_values; i++) {
+ const int new_offset = offset + values[i];
+ values[i] = offset;
+ offset = new_offset;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
new file mode 100644
index 00000000000..65b1990dbb8
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel sum of array input_data with size n into output_sum.
+ *
+ * Adapted from "Optimizing Parallel Reduction in CUDA", Mark Harris.
+ *
+ * This version adds multiple elements per thread sequentially. This reduces
+ * the overall cost of the algorithm while keeping the work complexity O(n) and
+ * the step complexity O(log n). (Brent's Theorem optimization) */
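+/* Shared memory requirement is blocksize * sizeof(OutputT). Each block writes
+ * a single partial sum to output_sum[block_index], so a final reduction over
+ * the per-block results is still required to obtain the total. */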
+
+#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
+__device__ void gpu_parallel_sum(
+ const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert)
+{
+ extern ccl_gpu_shared OutputT shared_data[];
+
+ const uint tid = ccl_gpu_thread_idx_x;
+ const uint gridsize = blocksize * ccl_gpu_grid_dim_x();
+
+ OutputT sum = zero;
+ for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) {
+ sum += convert(input_data[i]);
+ }
+ shared_data[tid] = sum;
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 512 && tid < 256) {
+ shared_data[tid] = sum = sum + shared_data[tid + 256];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 256 && tid < 128) {
+ shared_data[tid] = sum = sum + shared_data[tid + 128];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 128 && tid < 64) {
+ shared_data[tid] = sum = sum + shared_data[tid + 64];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 64 && tid < 32) {
+ shared_data[tid] = sum = sum + shared_data[tid + 32];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (tid < 32) {
+ for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
+      sum += ccl_gpu_shfl_down_sync(0xFFFFFFFF, sum, offset);
+ }
+ }
+
+ if (tid == 0) {
+ output_sum[ccl_gpu_block_idx_x] = sum;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
new file mode 100644
index 00000000000..99b35468517
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active and sorted by a given key. The prefix sum of the number of active
+ * states per key must have already been computed.
+ *
+ * TODO: there may be ways to optimize this to avoid this many atomic ops? */
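+/* For example, with per-state keys [1, 0, 1, 0] the caller first computes the
+ * exclusive prefix sum of the per-key counts, [0, 2]. States with key 0 are
+ * then written to indices[0..1] and states with key 1 to indices[2..3]; the
+ * order of states sharing a key depends on the atomic increments and is not
+ * deterministic. */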
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+
+template<uint blocksize, typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ int *key_prefix_sum,
+ GetKeyOp get_key_op)
+{
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
+ const int key = (state_index < num_states) ? get_key_op(state_index) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+
+ if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
+ const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
+ indices[index] = state_index;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/device/optix/compat.h
index 064c99ca100..4e255a135c6 100644
--- a/intern/cycles/kernel/kernel_compat_optix.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -15,14 +15,13 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_OPTIX_H__
-#define __KERNEL_COMPAT_OPTIX_H__
+#pragma once
#define OPTIX_DONT_INCLUDE_CUDA
#include <optix.h>
#define __KERNEL_GPU__
-#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too
+#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
#define __KERNEL_OPTIX__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
@@ -31,14 +30,14 @@
# define ATTR_FALLTHROUGH
#endif
+/* Manual definitions so we can compile without CUDA toolkit. */
+
#ifdef __CUDACC_RTC__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -46,21 +45,6 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#define ccl_device \
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
@@ -69,29 +53,75 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
-#define ccl_loop_no_unroll
#define ccl_restrict __restrict__
-#define ccl_ref
+#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
-// Zero initialize structs to help the compiler figure out scoping
+/* Zero initialize structs to help the compiler figure out scoping */
#define ccl_optional_struct_init = {}
-#define kernel_data __params.data // See kernel_globals.h
-#define kernel_tex_array(t) __params.t
-#define kernel_tex_fetch(t, index) __params.t[(index)]
+/* No assert supported for CUDA */
#define kernel_assert(cond)
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
/* Types */
#include "util/util_half.h"
#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPTIX_H__ */
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
new file mode 100644
index 00000000000..7d898ed5d91
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Launch parameters */
+struct KernelParamsOptiX {
+ /* Kernel arguments */
+ const int *path_index_array;
+ float *render_buffer;
+
+ /* Global scene data and textures */
+ KernelData data;
+#define KERNEL_TEX(type, name) const type *name;
+#include "kernel/kernel_textures.h"
+
+ /* Integrator state */
+ IntegratorStateGPU __integrator_state;
+};
+
+#ifdef __NVCC__
+extern "C" static __constant__ KernelParamsOptiX __params;
+#endif
+
+/* Abstraction macros */
+#define kernel_data __params.data
+#define kernel_tex_array(t) __params.t
+#define kernel_tex_fetch(t, index) __params.t[(index)]
+#define kernel_integrator_state __params.__integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 7f609eab474..c1e36febfc0 100644
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -16,14 +16,20 @@
*/
// clang-format off
-#include "kernel/kernel_compat_optix.h"
-#include "util/util_atomic.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_bake.h"
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
// clang-format on
template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
@@ -53,52 +59,36 @@ template<bool always = false> ccl_device_forceinline uint get_object_id()
return OBJECT_NONE;
}
-extern "C" __global__ void __raygen__kernel_optix_path_trace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
{
- KernelGlobals kg; // Allocate stack storage for common data
-
- const uint3 launch_index = optixGetLaunchIndex();
- // Keep threads for same pixel together to improve occupancy of warps
- uint pixel_offset = launch_index.x / __params.tile.num_samples;
- uint sample_offset = launch_index.x % __params.tile.num_samples;
-
- kernel_path_trace(&kg,
- __params.tile.buffer,
- __params.tile.start_sample + sample_offset,
- __params.tile.x + pixel_offset,
- __params.tile.y + launch_index.y,
- __params.tile.offset,
- __params.tile.stride);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_closest(nullptr, path_index);
}
-#ifdef __BAKING__
-extern "C" __global__ void __raygen__kernel_optix_bake()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_bake_evaluate(&kg,
- p.input,
- p.output,
- (ShaderEvalType)p.type,
- p.filter,
- p.sx + optixGetLaunchIndex().x,
- p.offset,
- p.sample);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_shadow(nullptr, path_index);
}
-#endif
-extern "C" __global__ void __raygen__kernel_optix_displace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_subsurface(nullptr, path_index);
}
-extern "C" __global__ void __raygen__kernel_optix_background()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_volume_stack(nullptr, path_index);
}
extern "C" __global__ void __miss__kernel_optix_miss()
@@ -179,54 +169,91 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
{
#ifdef __SHADOW_RECORD_ALL__
+ bool ignore_intersection = false;
+
const uint prim = optixGetPrimitiveIndex();
# ifdef __VISIBILITY_FLAG__
const uint visibility = optixGetPayload_4();
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
# endif
- // Offset into array with num_hits
- Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2();
- isect->t = optixGetRayTmax();
- isect->prim = prim;
- isect->object = get_object_id();
- isect->type = kernel_tex_fetch(__prim_type, prim);
-
+ float u = 0.0f, v = 0.0f;
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
- isect->u = 1.0f - barycentrics.y - barycentrics.x;
- isect->v = barycentrics.x;
+ u = 1.0f - barycentrics.y - barycentrics.x;
+ v = barycentrics.x;
}
# ifdef __HAIR__
else {
- const float u = __uint_as_float(optixGetAttribute_0());
- isect->u = u;
- isect->v = __uint_as_float(optixGetAttribute_1());
+ u = __uint_as_float(optixGetAttribute_0());
+ v = __uint_as_float(optixGetAttribute_1());
// Filter out curve endcaps
if (u == 0.0f || u == 1.0f) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
}
# endif
+ int num_hits = optixGetPayload_2();
+ int record_index = num_hits;
+ const int max_hits = optixGetPayload_3();
+
+ if (!ignore_intersection) {
+ optixSetPayload_2(num_hits + 1);
+ }
+
+ Intersection *const isect_array = get_payload_ptr_0<Intersection>();
+
# ifdef __TRANSPARENT_SHADOWS__
- // Detect if this surface has a shader with transparent shadows
- if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= optixGetPayload_3()) {
+ if (num_hits >= max_hits) {
+ /* If maximum number of hits reached, find a hit to replace. */
+ const int num_recorded_hits = min(max_hits, num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ if (optixGetRayTmax() >= max_recorded_t) {
+      /* Accept the hit, so that OptiX won't consider any further hits beyond the distance of
+       * the current hit. */
+ return;
+ }
+
+ record_index = max_recorded_hit;
+ }
# endif
- // This is an opaque hit or the hit limit has been reached, abort traversal
- optixSetPayload_5(true);
- return optixTerminateRay();
+
+ if (!ignore_intersection) {
+ Intersection *const isect = isect_array + record_index;
+ isect->u = u;
+ isect->v = v;
+ isect->t = optixGetRayTmax();
+ isect->prim = prim;
+ isect->object = get_object_id();
+ isect->type = kernel_tex_fetch(__prim_type, prim);
+
+# ifdef __TRANSPARENT_SHADOWS__
+ // Detect if this surface has a shader with transparent shadows
+ if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
+# endif
+ // If no transparent shadows, all light is blocked and we can stop immediately
+ optixSetPayload_5(true);
+ return optixTerminateRay();
# ifdef __TRANSPARENT_SHADOWS__
+ }
+# endif
}
- optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++
-
// Continue tracing
optixIgnoreIntersection();
-# endif
#endif
}
@@ -300,7 +327,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
if (isect.t != FLT_MAX)
isect.t *= len;
- if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
+ if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
optixReportIntersection(isect.t / len,
type & PRIMITIVE_ALL,
__float_as_int(isect.u), // Attribute_0
@@ -317,11 +344,4 @@ extern "C" __global__ void __intersection__curve_ribbon()
optix_intersection_curve(prim, type);
}
}
-
-extern "C" __global__ void __intersection__curve_all()
-{
- const uint prim = optixGetPrimitiveIndex();
- const uint type = kernel_tex_fetch(__prim_type, prim);
- optix_intersection_curve(prim, type);
-}
#endif
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
new file mode 100644
index 00000000000..bf787e29eaa
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2021, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copy of the regular kernels with an additional shader ray-tracing kernel that takes
+ * much longer to compile. This is only loaded when needed by the scene. */
+
+#include "kernel/device/optix/kernel.cu"
+#include "kernel/integrator/integrator_shade_surface.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
+}
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
deleted file mode 100644
index b067e53a8bf..00000000000
--- a/intern/cycles/kernel/filter/filter.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_H__
-#define __FILTER_H__
-
-/* CPU Filter Kernel Interface */
-
-#include "util/util_types.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
-#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
-#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
-
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-CCL_NAMESPACE_END
-
-#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
deleted file mode 100644
index 1c0ac5e2cb7..00000000000
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_DEFINES_H__
-#define __FILTER_DEFINES_H__
-
-#define DENOISE_FEATURES 11
-#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES)
-#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2)
-#define XTWY_SIZE (DENOISE_FEATURES + 1)
-
-#define DENOISE_MAX_FRAMES 16
-
-typedef struct TileInfo {
- int offsets[9];
- int strides[9];
- int x[4];
- int y[4];
- int from_render;
- int frames[DENOISE_MAX_FRAMES];
- int num_frames;
- /* TODO(lukas): CUDA doesn't have uint64_t... */
-#ifdef __KERNEL_OPENCL__
- ccl_global float *buffers[9];
-#else
- long long int buffers[9];
-#endif
-} TileInfo;
-
-#ifdef __KERNEL_OPENCL__
-# define CCL_FILTER_TILE_INFO \
- ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \
- ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \
- ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \
- ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \
- ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9
-# define CCL_FILTER_TILE_INFO_ARG \
- tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \
- tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9
-# define ccl_get_tile_buffer(id) \
- (id == 0 ? tile_buffer_1 : \
- id == 1 ? tile_buffer_2 : \
- id == 2 ? tile_buffer_3 : \
- id == 3 ? tile_buffer_4 : \
- id == 4 ? tile_buffer_5 : \
- id == 5 ? tile_buffer_6 : \
- id == 6 ? tile_buffer_7 : \
- id == 7 ? tile_buffer_8 : \
- tile_buffer_9)
-#else
-# ifdef __KERNEL_CUDA__
-# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info
-# else
-# define CCL_FILTER_TILE_INFO TileInfo *tile_info
-# endif
-# define ccl_get_tile_buffer(id) (tile_info->buffers[id])
-#endif
-
-#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
deleted file mode 100644
index 8a2af957146..00000000000
--- a/intern/cycles/kernel/filter/filter_features.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).+ * pixel_buffer always
- * points to the current pixel in the first pass. Repeat the loop for every secondary frame if
- * there are any. */
-#define FOR_PIXEL_WINDOW \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
-
-#define END_FOR_PIXEL_WINDOW \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *features,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- features[0] = pixel.x;
- features[1] = pixel.y;
- features[2] = fabsf(ccl_get_feature(buffer, 0));
- features[3] = ccl_get_feature(buffer, 1);
- features[4] = ccl_get_feature(buffer, 2);
- features[5] = ccl_get_feature(buffer, 3);
- features[6] = ccl_get_feature(buffer, 4);
- features[7] = ccl_get_feature(buffer, 5);
- features[8] = ccl_get_feature(buffer, 6);
- features[9] = ccl_get_feature(buffer, 7);
- if (use_time) {
- features[10] = pixel.z;
- }
- if (mean) {
- for (int i = 0; i < (use_time ? 11 : 10); i++) {
- features[i] -= mean[i];
- }
- }
-}
-
-ccl_device_inline void filter_get_feature_scales(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *scales,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabsf(pixel.x - mean[0]);
- scales[1] = fabsf(pixel.y - mean[1]);
- scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
- scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
- ccl_get_feature(buffer, 2) - mean[4],
- ccl_get_feature(buffer, 3) - mean[5]));
- scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
- scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
- ccl_get_feature(buffer, 6) - mean[8],
- ccl_get_feature(buffer, 7) - mean[9]));
- if (use_time) {
- scales[6] = fabsf(pixel.z - mean[10]);
- }
-}
-
-ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
-{
- scale[0] = 1.0f / max(scale[0], 0.01f);
- scale[1] = 1.0f / max(scale[1], 0.01f);
- scale[2] = 1.0f / max(scale[2], 0.01f);
- if (use_time) {
- scale[10] = 1.0f / max(scale[6], 0.01f);
- }
- scale[6] = 1.0f / max(scale[4], 0.01f);
- scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f);
- scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f);
-}
-
-ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
- int pass_stride)
-{
- return make_float3(
- ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
-}
-
-ccl_device_inline void design_row_add(float *design_row,
- int rank,
- const ccl_global float *ccl_restrict transform,
- int stride,
- int row,
- float feature,
- int transform_row_stride)
-{
- for (int i = 0; i < rank; i++) {
- design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature;
- }
-}
-
-/* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(
- int3 p_pixel,
- const ccl_global float *ccl_restrict p_buffer,
- int3 q_pixel,
- const ccl_global float *ccl_restrict q_buffer,
- int pass_stride,
- int rank,
- float *design_row,
- const ccl_global float *ccl_restrict transform,
- int stride,
- bool use_time)
-{
- int num_features = use_time ? 11 : 10;
-
- design_row[0] = 1.0f;
- math_vector_zero(design_row + 1, rank);
-
-#define DESIGN_ROW_ADD(I, F) \
- design_row_add(design_row, rank, transform, stride, I, F, num_features);
- DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
- DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
- DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
- DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
- DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
- DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
- DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
- DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
- DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
- DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
- if (use_time) {
- DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
- }
-#undef DESIGN_ROW_ADD
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
deleted file mode 100644
index 59d4ace2bef..00000000000
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
- * pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set
- * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW_SSE \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- float4 t4 = make_float4(pixel.z); \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- float4 y4 = make_float4(pixel.y); \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
- int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *features,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- int num_features = use_time ? 11 : 10;
-
- features[0] = x;
- features[1] = y;
- features[2] = fabs(ccl_get_feature_sse(0));
- features[3] = ccl_get_feature_sse(1);
- features[4] = ccl_get_feature_sse(2);
- features[5] = ccl_get_feature_sse(3);
- features[6] = ccl_get_feature_sse(4);
- features[7] = ccl_get_feature_sse(5);
- features[8] = ccl_get_feature_sse(6);
- features[9] = ccl_get_feature_sse(7);
- if (use_time) {
- features[10] = t;
- }
-
- if (mean) {
- for (int i = 0; i < num_features; i++) {
- features[i] = features[i] - mean[i];
- }
- }
- for (int i = 0; i < num_features; i++) {
- features[i] = mask(active_pixels, features[i]);
- }
-}
-
-ccl_device_inline void filter_get_feature_scales_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *scales,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabs(x - mean[0]);
- scales[1] = fabs(y - mean[1]);
- scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
- scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) +
- sqr(ccl_get_feature_sse(3) - mean[5]);
- scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
- scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) +
- sqr(ccl_get_feature_sse(7) - mean[9]);
- if (use_time) {
- scales[6] = fabs(t - mean[10]);
- }
-
- for (int i = 0; i < (use_time ? 7 : 6); i++)
- scales[i] = mask(active_pixels, scales[i]);
-}
-
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
-{
- scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
- scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
- scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
- if (use_time) {
- scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
- }
- scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
- scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
- scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
-}
-
-CCL_NAMESPACE_END
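
The SSE variant above walks the feature window four pixels at a time and relies on a lane mask (active_pixels) to neutralize lanes that fall past the window edge. Purely as an illustration of that masking idea, here is a small scalar C++ sketch; the helper names and feature values are made up and are not part of the Cycles kernels.

#include <cstdio>

/* Hypothetical stand-in for reading one feature value at a pixel. */
static float feature_at(int x, int y)
{
  return 0.1f * x + 0.01f * y;
}

/* Accumulate a feature over [low_x, high_x) in groups of 4, masking lanes past the
 * window edge so they contribute nothing, like mask(active_pixels, ...) above. */
static void accumulate_window(int low_x, int high_x, int y, float sums[4])
{
  for (int x = low_x; x < high_x; x += 4) {
    for (int lane = 0; lane < 4; lane++) {
      const bool active = (x + lane) < high_x;
      sums[lane] += active ? feature_at(x + lane, y) : 0.0f;
    }
  }
}

int main()
{
  float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
  accumulate_window(0, 10, 3, sums); /* window width not a multiple of 4 */
  std::printf("%f %f %f %f\n", sums[0], sums[1], sums[2], sums[3]);
  return 0;
}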
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
deleted file mode 100644
index 2ef03dc0a02..00000000000
--- a/intern/cycles/kernel/filter/filter_kernel.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_color.h"
-#include "util/util_math.h"
-#include "util/util_math_fast.h"
-#include "util/util_texture.h"
-
-#include "util/util_atomic.h"
-#include "util/util_math_matrix.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "kernel/filter/filter_features.h"
-#ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_features_sse.h"
-#endif
-
-#include "kernel/filter/filter_prefilter.h"
-
-#ifdef __KERNEL_GPU__
-# include "kernel/filter/filter_transform_gpu.h"
-#else
-# ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_transform_sse.h"
-# else
-# include "kernel/filter/filter_transform.h"
-# endif
-#endif
-
-#include "kernel/filter/filter_reconstruction.h"
-
-#ifdef __KERNEL_CPU__
-# include "kernel/filter/filter_nlm_cpu.h"
-#else
-# include "kernel/filter/filter_nlm_gpu.h"
-#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
deleted file mode 100644
index 24200c29203..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs))))
-#define load4_u(buf, ofs) load_float4((buf) + (ofs))
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx,
- int dy,
- const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- /* Strides need to be aligned to 16 bytes. */
- kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
-
- int aligned_lowx = rect.x & (~3);
- const int numChannels = (channel_offset > 0) ? 3 : 1;
- const float4 channel_fac = make_float4(1.0f / numChannels);
-
- for (int y = rect.y; y < rect.w; y++) {
- int idx_p = y * stride + aligned_lowx;
- int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset;
- for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
- float4 diff = make_float4(0.0f);
- float4 scale_fac;
- if (scale_image) {
- scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
- make_float4(0.25f),
- make_float4(4.0f));
- }
- else {
- scale_fac = make_float4(1.0f);
- }
- for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
- /* idx_p is guaranteed to be aligned, but idx_q isn't. */
- float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
- float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs);
- float4 cdiff = color_p - color_q;
- float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
- float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs);
- diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) /
- (make_float4(1e-8f) + k_2 * (var_p + var_q));
- }
- load4_a(difference_image, idx_p) = diff * channel_fac;
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- for (int y1 = low; y1 < high; y1++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x);
- }
- }
- float fac = 1.0f / (high - low);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) *= fac;
- }
- }
-}
-
-ccl_device_inline void nlm_blur_horizontal(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- }
-
- for (int dx = -f; dx <= f; dx++) {
- aligned_lowx = round_down(rect.x - min(0, dx), 4);
- int highx = rect.z - max(0, dx);
- int4 lowx4 = make_int4(rect.x - min(0, dx));
- int4 highx4 = make_int4(rect.z - max(0, dx));
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < highx; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= lowx4) & (x4 < highx4);
-
- float4 diff = load4_u(difference_image, y * stride + x + dx);
- load4_a(out_image, y * stride + x) += mask(active, diff);
- }
- }
- }
-
- aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
- float4 low = max(make_float4(rect.x), x4 - make_float4(f));
- float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1));
- load4_a(out_image, y * stride + x) *= rcp(high - low);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = fast_expf4(
- -max(load4_a(out_image, y * stride + x), make_float4(0.0f)));
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int dx,
- int dy,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
-
- float4 weight = load4_a(temp_image, idx_p);
- load4_a(accum_image, idx_p) += mask(active, weight);
-
- float4 val = load4_u(image, idx_q);
- if (channel_offset) {
- val += load4_u(image, idx_q + channel_offset);
- val += load4_u(image, idx_q + 2 * channel_offset);
- val *= 1.0f / 3.0f;
- }
-
- load4_a(out_image, idx_p) += mask(active, weight * val);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
- int dy,
- int t,
- const float *ccl_restrict
- difference_image,
- const float *ccl_restrict buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
- int4 clip_area = rect_clip(rect, filter_window);
- /* fx and fy are in filter-window-relative coordinates,
- * while x and y are in feature-window-relative coordinates. */
- for (int y = clip_area.y; y < clip_area.w; y++) {
- for (int x = clip_area.x; x < clip_area.z; x++) {
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- float *l_transform = transform + storage_ofs * TRANSFORM_SIZE;
- float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE;
- float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE;
- int *l_rank = rank + storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- 1,
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- l_transform,
- l_rank,
- weight,
- l_XtWX,
- l_XtWY,
- 0);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int4 rect,
- int w)
-{
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- out_image[y * w + x] /= accum_image[y * w + x];
- }
- }
-}
-
-#undef load4_a
-#undef load4_u
-
-CCL_NAMESPACE_END
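
The CPU kernels above compute non-local means weights in three steps: a per-pixel squared colour difference normalised by variance, a box blur of that difference over the patch, and an exponential falloff. A heavily simplified single-channel sketch of the same pipeline follows; it assumes the constant a from kernel_filter_nlm_calc_difference is 1, uses made-up types and helper names, and skips the alignment and bounds handling of the real code.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

struct Image {
  int w = 0, h = 0;
  std::vector<float> color, variance;
  float at(const std::vector<float> &v, int x, int y) const { return v[y * w + x]; }
};

/* Per-pixel difference term for offset (dx, dy), shaped like
 * kernel_filter_nlm_calc_difference with a = 1. Assumes all accesses stay in bounds. */
static float nlm_difference(const Image &img, int x, int y, int dx, int dy, float k2)
{
  const float cp = img.at(img.color, x, y);
  const float cq = img.at(img.color, x + dx, y + dy);
  const float vp = img.at(img.variance, x, y);
  const float vq = img.at(img.variance, x + dx, y + dy);
  const float d = cp - cq;
  return (d * d - (vp + std::min(vp, vq))) / (1e-8f + k2 * (vp + vq));
}

/* Average the difference over a (2f+1)^2 patch, then map it to a weight,
 * mirroring kernel_filter_nlm_blur plus kernel_filter_nlm_calc_weight. */
static float nlm_weight(const Image &img, int x, int y, int dx, int dy, int f, float k2)
{
  float sum = 0.0f;
  int count = 0;
  for (int j = -f; j <= f; j++) {
    for (int i = -f; i <= f; i++) {
      sum += nlm_difference(img, x + i, y + j, dx, dy, k2);
      count++;
    }
  }
  return std::exp(-std::max(sum / count, 0.0f));
}

int main()
{
  Image img;
  img.w = 8;
  img.h = 8;
  img.color.assign(img.w * img.h, 0.5f);
  img.variance.assign(img.w * img.h, 0.01f);
  img.color[3 * img.w + 3] = 0.6f; /* slightly brighter centre pixel */

  std::printf("NLM weight for offset (1, 0): %f\n", nlm_weight(img, 3, 3, 1, 0, 1, 0.25f));
  return 0;
}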
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
deleted file mode 100644
index 650c743f34f..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Determines pixel coordinates and offset for the current thread.
- * Returns whether the thread should do any work.
- *
- * All coordinates are relative to the denoising buffer!
- *
- * Window is the rect that should be processed.
- * co is filled with (x, y, dx, dy).
- */
-ccl_device_inline bool get_nlm_coords_window(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
-{
- /* Determine the pixel offset that this thread should apply. */
- int s = 2 * r + 1;
- int si = ccl_global_id(1);
- int sx = si % s;
- int sy = si / s;
- if (sy >= s) {
- return false;
- }
-
- /* Pixels still need to lie inside the denoising buffer after applying the offset,
- * so determine the area for which this is the case. */
- int dx = sx - r;
- int dy = sy - r;
-
- *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
-
- /* Find the intersection of the area that we want to process (window) and the area
- * that can be processed (rect) to get the final area for this offset. */
- int4 clip_area = rect_clip(window, *rect);
-
- /* If the radius is larger than one of the sides of the window,
- * there will be shifts for which there is no usable pixel at all. */
- if (!rect_is_valid(clip_area)) {
- return false;
- }
-
- /* Map the linear thread index to pixels inside the clip area. */
- int x, y;
- if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
- return false;
- }
-
- *co = make_int4(x, y, dx, dy);
-
- *ofs = (sy * s + sx) * stride;
-
- return true;
-}
-
-ccl_device_inline bool get_nlm_coords(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
-{
- return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(
- int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
- int numChannels = channel_offset ? 3 : 1;
-
- float diff = 0.0f;
- float scale_fac = 1.0f;
- if (scale_image) {
- scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
- }
-
- for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
- float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
- float pvar = variance_image[idx_p];
- float qvar = sqr(scale_fac) * variance_image[idx_q];
- diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
- }
- if (numChannels > 1) {
- diff *= 1.0f / numChannels;
- }
- difference_image[y * stride + x] = diff;
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int y1 = low; y1 < high; y1++) {
- sum += difference_image[y1 * stride + x];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = sum;
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict
- difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
- if (out_image) {
- atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
- float val = image[idx_q];
- if (channel_offset) {
- val += image[idx_q + channel_offset];
- val += image[idx_q + 2 * channel_offset];
- val *= 1.0f / 3.0f;
- }
- atomic_add_and_fetch_float(out_image + idx_p, sum * val);
- }
- else {
- accum_image[idx_p] = sum;
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(
- int x,
- int y,
- int dx,
- int dy,
- int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time,
- int localIdx)
-{
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- /* Reconstruction data is only stored for pixels inside the filter window,
- * so compute the pixel's index in there. */
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- transform += storage_ofs;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- rect_size(filter_window),
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- transform,
- rank,
- weight,
- XtWX,
- XtWY,
- localIdx);
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(int x,
- int y,
- ccl_global float *out_image,
- const ccl_global float *ccl_restrict
- accum_image,
- int stride)
-{
- out_image[y * stride + x] /= accum_image[y * stride + x];
-}
-
-CCL_NAMESPACE_END
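
In the GPU variant above, each offset (dx, dy) of the NLM search window is handled by its own slice of the dispatch: get_nlm_coords_window decodes the second global dimension into the offset. A tiny standalone sketch of just that index decoding, with an invented radius value, could look like this:

#include <cstdio>

int main()
{
  const int r = 2;         /* search radius (illustrative value) */
  const int s = 2 * r + 1; /* offsets per axis: -r .. +r */
  for (int si = 0; si < s * s; si++) {
    const int dx = si % s - r; /* column in the offset grid */
    const int dy = si / s - r; /* row in the offset grid */
    std::printf("slice %2d -> offset (%+d, %+d)\n", si, dx, dy);
  }
  return 0;
}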
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
deleted file mode 100644
index 97cecba190e..00000000000
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/**
- * First step of the shadow prefiltering, performs the shadow division and stores all data
- * in a nice and easy rectangular array that can be passed to the NLM filter.
- *
- * Calculates:
- * \param unfiltered: Contains the two half images of the shadow feature pass
- * \param sampleVariance: The sample-based variance calculated in the kernel.
- * Note: This calculation is biased in general,
- * and especially here since the variance of the ratio can only be approximated.
- * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy
- * (since it's essentially the buffer variance of the two variance halves)
- * \param bufferVariance: The buffer-based variance of the shadow feature.
- * Unbiased, but quite noisy.
- */
-ccl_device void kernel_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
-
- int offset = tile_info->offsets[tile];
- int stride = tile_info->strides[tile];
- const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer(
- tile);
- center_buffer += (y * stride + x + offset) * buffer_pass_stride;
- center_buffer += buffer_denoising_offset + 14;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
- unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
- unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
-
- float varA = center_buffer[2];
- float varB = center_buffer[5];
- int odd_sample = (sample + 1) / 2;
- int even_sample = sample / 2;
-
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample);
- varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample);
-
- varA /= max(odd_sample - 1, 1);
- varB /= max(even_sample - 1, 1);
-
- sampleVariance[idx] = 0.5f * (varA + varB) / sample;
- sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample);
- bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) *
- (unfilteredA[idx] - unfilteredB[idx]);
-}
-
-/* Load a regular feature from the render buffers into the denoise buffer.
- * Parameters:
- * - sample: The number of samples in the buffer, used to normalize the buffer.
- * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
- * - x, y: Current pixel
- * - mean, variance: Target denoise buffers.
- * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
- */
-ccl_device void kernel_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
- ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) +
- (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) *
- buffer_pass_stride +
- buffer_denoising_offset;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float val = scale * center_buffer[m_offset];
- mean[idx] = val;
-
- if (v_offset >= 0) {
- if (sample > 1) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- variance[idx] = max(
- 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1)));
- }
- else {
- /* Can't compute variance with a single sample, just set it very high. */
- variance[idx] = 1e10f;
- }
- }
-}
-
-ccl_device void kernel_filter_write_feature(int sample,
- int x,
- int y,
- int4 buffer_params,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 rect)
-{
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- combined_buffer[out_offset] = from[idx];
-}
-
-#define GET_COLOR(image) \
- make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride])
-#define SET_COLOR(image, color) \
- image[idx] = color.x; \
- image[idx + pass_stride] = color.y; \
- image[idx + 2 * pass_stride] = color.z
-
-ccl_device void kernel_filter_detect_outliers(int x,
- int y,
- ccl_global float *in,
- ccl_global float *variance_out,
- ccl_global float *depth,
- ccl_global float *image_out,
- int4 rect,
- int pass_stride)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- ccl_global float *image_in = in;
- ccl_global float *variance_in = in + 3 * pass_stride;
-
- int n = 0;
- float values[25];
- float pixel_variance, max_variance = 0.0f;
- for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) {
- for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) {
- int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x);
- float3 color = GET_COLOR(image_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- float L = average(color);
-
- /* Find the position of L. */
- int i;
- for (i = 0; i < n; i++) {
- if (values[i] > L)
- break;
- }
- /* Make space for L by shifting all following values to the right. */
- for (int j = n; j > i; j--) {
- values[j] = values[j - 1];
- }
- /* Insert L. */
- values[i] = L;
- n++;
-
- float3 pixel_var = GET_COLOR(variance_in);
- float var = average(pixel_var);
- if ((x1 == x) && (y1 == y)) {
- pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f :
- var;
- }
- else {
- max_variance = max(max_variance, var);
- }
- }
- }
-
- max_variance += 1e-4f;
-
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float3 color = GET_COLOR(image_in);
- float3 variance = GET_COLOR(variance_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- variance = max(variance, make_float3(0.0f, 0.0f, 0.0f));
-
- float L = average(color);
-
- float ref = 2.0f * values[(int)(n * 0.75f)];
-
- /* Slightly offset values to avoid false positives in (almost) black areas. */
- max_variance += 1e-5f;
- ref -= 1e-5f;
-
- if (L > ref) {
- /* The pixel appears to be an outlier.
- * However, it may just be a legitimate highlight. Therefore, check how likely it is that
- * the pixel should actually be at the reference value: if the reference lies within the
- * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very
- * unlikely that the pixel should be darker, which indicates a legitimate highlight.
- */
-
- if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
- depth[idx] = -depth[idx];
- color *= ref / L;
- variance = make_float3(max_variance, max_variance, max_variance);
- }
- else {
- float stddev = sqrtf(pixel_variance);
- if (L - 3 * stddev < ref) {
- /* The pixel is an outlier, so negate the depth value to mark it as one.
- * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM
- * weights. */
- depth[idx] = -depth[idx];
- float fac = ref / L;
- color *= fac;
- variance *= sqr(fac);
- }
- }
- }
-
- /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results.
- * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we
- * scale by the square of that (since we have variance instead of standard deviation). */
- color = color_highlight_compress(color, &variance);
-
- SET_COLOR(image_out, color);
- SET_COLOR(variance_out, variance);
-}
-
-#undef GET_COLOR
-#undef SET_COLOR
-
-/* Combine A/B buffers.
- * Calculates the combined mean and the buffer variance. */
-ccl_device void kernel_filter_combine_halves(int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 rect,
- int r)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- if (mean)
- mean[idx] = 0.5f * (a[idx] + b[idx]);
- if (variance) {
- if (r == 0)
- variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]);
- else {
- variance[idx] = 0.0f;
- float values[25];
- int numValues = 0;
- for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) {
- for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) {
- int pidx = (py - rect.y) * buffer_w + (px - rect.x);
- values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]);
- }
- }
- /* Insertion-sort the variances (fast enough for 25 elements). */
- for (int i = 1; i < numValues; i++) {
- float v = values[i];
- int j;
- for (j = i - 1; j >= 0 && values[j] > v; j--)
- values[j + 1] = values[j];
- values[j + 1] = v;
- }
- variance[idx] = values[(7 * numValues) / 8];
- }
- }
-}
-
-CCL_NAMESPACE_END
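
Both prefilter kernels above estimate feature variance from accumulated sums using the approximation noted in their comments: variance of the mean roughly equals (sum of x^2 - N * mean^2) / (N * (N - 1)). A small self-contained numeric check of that formula, with made-up sample values rather than real render buffer contents:

#include <algorithm>
#include <cstdio>

int main()
{
  const float samples[] = {0.9f, 1.1f, 1.0f, 1.2f, 0.8f};
  const int N = 5;

  float sum = 0.0f, sum_sq = 0.0f;
  for (int i = 0; i < N; i++) {
    sum += samples[i];
    sum_sq += samples[i] * samples[i];
  }

  const float mean = sum / N;
  /* Same shape as kernel_filter_get_feature: (sum of squares - N * mean^2) / (N * (N - 1)),
   * clamped to zero to guard against numerical noise. */
  const float var_of_mean = std::max(0.0f, (sum_sq - N * mean * mean) / (N * (N - 1.0f)));
  std::printf("mean %.4f, variance of the mean %.6f\n", mean, var_of_mean);
  return 0;
}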
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
deleted file mode 100644
index 17941689ad5..00000000000
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_filter_construct_gramian(int x,
- int y,
- int storage_stride,
- int dx,
- int dy,
- int t,
- int buffer_stride,
- int pass_stride,
- int frame_offset,
- bool use_time,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict
- transform,
- ccl_global int *rank,
- float weight,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int localIdx)
-{
- if (weight < 1e-3f) {
- return;
- }
-
- int p_offset = y * buffer_stride + x;
- int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset;
-
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1);
-#else
- float design_row[DENOISE_FEATURES + 1];
-#endif
-
- float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
-
- /* If the pixel was flagged as an outlier during prefiltering, skip it. */
- if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
- return;
- }
-
- filter_get_design_row_transform(make_int3(x, y, t),
- buffer + p_offset,
- make_int3(x + dx, y + dy, t),
- buffer + q_offset,
- pass_stride,
- *rank,
- design_row,
- transform,
- stride,
- use_time);
-
-#ifdef __KERNEL_GPU__
- math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride);
- math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride);
-#else
- math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight);
- math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color);
-#endif
-}
-
-ccl_device_inline void kernel_filter_finalize(int x,
- int y,
- ccl_global float *buffer,
- ccl_global int *rank,
- int storage_stride,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 buffer_params,
- int sample)
-{
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
- if (XtWX[0] < 1e-3f) {
- /* There is not enough information to determine a denoised result.
- * As a fallback, keep the original value of the pixel. */
- return;
- }
-
- /* The weighted average of pixel colors (essentially, the NLM-filtered image).
- * In case the solution of the linear model fails due to numerical issues or
- * returns nonsensical negative values, fall back to this value. */
- float3 mean_color = XtWY[0] / XtWX[0];
-
- math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);
-
- float3 final_color = XtWY[0];
- if (!isfinite3_safe(final_color) ||
- (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) {
- final_color = mean_color;
- }
-
- /* Clamp pixel value to positive values and reverse the highlight compression transform. */
- final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f)));
-
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
- if (buffer_params.w >= 0) {
- final_color *= sample;
- if (buffer_params.w > 0) {
- final_color.x += combined_buffer[buffer_params.w + 0];
- final_color.y += combined_buffer[buffer_params.w + 1];
- final_color.z += combined_buffer[buffer_params.w + 2];
- }
- }
- combined_buffer[0] = final_color.x;
- combined_buffer[1] = final_color.y;
- combined_buffer[2] = final_color.z;
-}
-
-CCL_NAMESPACE_END
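
The reconstruction kernels above fit a small weighted linear model per pixel: NLM weights fill the normal equations X^T W X and X^T W y, which are then solved, with a fallback when the system is unusable. The following toy example shows the same idea for one scalar output and a two-entry design row (constant term plus one feature); all numbers and names are illustrative and not taken from the real buffers.

#include <cmath>
#include <cstdio>

int main()
{
  /* Toy neighbourhood: one feature value, an observed colour and an NLM weight per pixel. */
  const float feature[] = {0.0f, 0.5f, 1.0f, 1.5f};
  const float color[] = {1.0f, 1.2f, 1.4f, 1.6f};
  const float weight[] = {1.0f, 0.8f, 0.6f, 0.4f};

  /* Accumulate X^T W X (symmetric 2x2) and X^T W y for design rows (1, feature). */
  float a00 = 0.0f, a01 = 0.0f, a11 = 0.0f, b0 = 0.0f, b1 = 0.0f;
  for (int i = 0; i < 4; i++) {
    const float w = weight[i], f = feature[i];
    a00 += w;
    a01 += w * f;
    a11 += w * f * f;
    b0 += w * color[i];
    b1 += w * f * color[i];
  }

  /* Solve the 2x2 system; fall back to the weighted mean if it is near-singular,
   * analogous to the mean_color fallback in kernel_filter_finalize. */
  const float det = a00 * a11 - a01 * a01;
  float c0, c1;
  if (std::fabs(det) < 1e-8f) {
    c0 = b0 / a00;
    c1 = 0.0f;
  }
  else {
    c0 = (b0 * a11 - b1 * a01) / det;
    c1 = (a00 * b1 - a01 * b0) / det;
  }
  std::printf("fitted model: %.4f + %.4f * feature\n", c0, c1);
  return 0;
}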
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
deleted file mode 100644
index 880a661214e..00000000000
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float features[DENOISE_FEATURES];
-
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < (*rank); i++) {
- math_vector_mul(transform + i * num_features, feature_scale, num_features);
- }
- math_matrix_transpose(transform, num_features, 1);
-}
-
-CCL_NAMESPACE_END
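
The transform construction above reduces the feature space with a PCA-style rank selection: after the eigendecomposition of the feature Gramian, dimensions are kept until the retained diagonal "energy" reaches the target derived from pca_threshold, and at least two dimensions are always kept. A minimal sketch of just that selection loop for the negative-threshold branch, using invented eigenvalues instead of a real decomposition:

#include <cstdio>

int main()
{
  /* Invented, already-sorted eigenvalues; the real code gets them from
   * math_matrix_jacobi_eigendecomposition of the feature Gramian. */
  const float eigenvalues[] = {5.0f, 2.0f, 0.8f, 0.15f, 0.04f, 0.01f};
  const int num_features = 6;
  const float pca_threshold = -0.25f; /* negative: the energy target is (1 + pca_threshold) of the total */

  float total = 0.0f;
  for (int i = 0; i < num_features; i++) {
    total += eigenvalues[i];
  }
  const float target = total * (1.0f - (-pca_threshold));

  int rank = 0;
  float retained = 0.0f;
  for (int i = 0; i < num_features; i++, rank++) {
    if (i >= 2 && retained >= target) {
      break;
    }
    retained += eigenvalues[i];
  }
  std::printf("keep %d of %d dimensions (%.1f%% of the energy)\n",
              rank, num_features, 100.0f * retained / total);
  return 0;
}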
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
deleted file mode 100644
index ec258a5212a..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- ccl_global float *transform,
- ccl_global int *rank,
- int radius,
- float pca_threshold,
- int transform_stride,
- int localIdx)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES;
-#else
- float features[DENOISE_FEATURES];
-#endif
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
- const ccl_global float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, transform_stride);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- for (int j = 0; j < (*rank); j++) {
- transform[(i * num_features + j) * transform_stride] *= feature_scale[i];
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
deleted file mode 100644
index 0304d990f9f..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float4 features[DENOISE_FEATURES];
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float4 feature_means[DENOISE_FEATURES];
- math_vector_zero_sse(feature_means, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add_sse(feature_means, num_features, features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float4 pixel_scale = make_float4(1.0f / num_pixels);
- for (int i = 0; i < num_features; i++) {
- feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
- }
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float4 feature_scale[DENOISE_FEATURES];
- math_vector_zero_sse(feature_scale, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_feature_scales_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max_sse(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- filter_calculate_scale_sse(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero_sse(feature_matrix_sse, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul_sse(features, num_features, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
-
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, 1);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 5ff4d5f7053..4de824cc277 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/geom/geom_attribute.h"
#include "kernel/geom/geom_object.h"
@@ -31,4 +33,5 @@
#include "kernel/geom/geom_curve_intersect.h"
#include "kernel/geom/geom_volume.h"
#include "kernel/geom/geom_primitive.h"
+#include "kernel/geom/geom_shader_data.h"
// clang-format on
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index b37797ac21b..9532a21fec7 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Attributes
@@ -25,9 +27,9 @@ CCL_NAMESPACE_BEGIN
* Lookup of attributes is different between OSL and SVM, as OSL is ustring
* based while for SVM we use integer ids. */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd);
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd);
-ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint attribute_primitive_type(const KernelGlobals *kg, const ShaderData *sd)
{
if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
return ATTR_PRIM_SUBD;
@@ -46,12 +48,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
/* Find attribute based on ID */
-ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_attribute_map_offset(const KernelGlobals *kg, int object)
{
return kernel_tex_fetch(__objects, object).attribute_map_offset;
}
-ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
+ccl_device_inline AttributeDescriptor find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id)
{
@@ -98,7 +100,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
/* Transform matrix attribute on meshes */
-ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg,
+ccl_device Transform primitive_attribute_matrix(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index b5a62a31ca9..a827a67ce7a 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve Primitive
@@ -25,8 +27,11 @@ CCL_NAMESPACE_BEGIN
/* Reading attributes on various curve elements */
-ccl_device float curve_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float curve_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
@@ -64,7 +69,7 @@ ccl_device float curve_attribute_float(
}
}
-ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
+ccl_device float2 curve_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -110,7 +115,7 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
+ccl_device float3 curve_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -152,7 +157,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
+ccl_device float4 curve_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
@@ -196,7 +201,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
/* Curve thickness */
-ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+ccl_device float curve_thickness(const KernelGlobals *kg, const ShaderData *sd)
{
float r = 0.0f;
@@ -224,7 +229,7 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
/* Curve location for motion pass, linear interpolation between keys and
* ignoring radius because we do the same for the motion keys */
-ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_motion_center_location(const KernelGlobals *kg, const ShaderData *sd)
{
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -240,7 +245,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
/* Curve tangent normal */
-ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_tangent_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 tgN = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index e25bf5b4660..213f3e62ee0 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve primitive intersection functions.
@@ -167,6 +169,7 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co
}
ccl_device bool curve_intersect_iterative(const float3 ray_dir,
+ float *ray_tfar,
const float dt,
const float4 curve[4],
float u,
@@ -230,7 +233,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
if (fabsf(f) < f_err && fabsf(g) < g_err) {
t += dt;
- if (!(0.0f <= t && t <= isect->t)) {
+ if (!(0.0f <= t && t <= *ray_tfar)) {
return false; /* Rejects NaNs */
}
if (!(u >= 0.0f && u <= 1.0f)) {
@@ -247,6 +250,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
}
/* Record intersection. */
+ *ray_tfar = t;
isect->t = t;
isect->u = u;
isect->v = 0.0f;
@@ -259,6 +263,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
ccl_device bool curve_intersect_recursive(const float3 ray_orig,
const float3 ray_dir,
+ float ray_tfar,
float4 curve[4],
Intersection *isect)
{
@@ -339,7 +344,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
}
/* Intersect with cap-planes. */
- float2 tp = make_float2(-dt, isect->t - dt);
+ float2 tp = make_float2(-dt, ray_tfar - dt);
tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
const float2 h0 = half_plane_intersect(
float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
@@ -402,19 +407,19 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
}
else {
recurse = true;
}
}
- if (valid1 && (tp1.x + dt <= isect->t)) {
+ if (valid1 && (tp1.x + dt <= ray_tfar)) {
const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
}
else {
recurse = true;
@@ -542,7 +547,7 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
ccl_device_inline bool ribbon_intersect(const float3 ray_org,
const float3 ray_dir,
- const float ray_tfar,
+ float ray_tfar,
const int N,
float4 curve[4],
Intersection *isect)
@@ -590,7 +595,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
/* Intersect quad. */
float vu, vv, vt;
- bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt);
+ bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt);
if (valid0) {
/* ignore self intersections */
@@ -604,6 +609,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
vv = 2.0f * vv - 1.0f;
/* Record intersection. */
+ ray_tfar = vt;
isect->t = vt;
isect->u = u + vu * step_size;
isect->v = vv;
@@ -619,10 +625,11 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
return false;
}
-ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
+ccl_device_forceinline bool curve_intersect(const KernelGlobals *kg,
Intersection *isect,
const float3 P,
const float3 dir,
+ const float tmax,
uint visibility,
int object,
int curveAddr,
@@ -672,7 +679,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* todo: adaptive number of subdivisions could help performance here. */
const int subdivisions = kernel_data.bvh.curve_subdivisions;
- if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) {
+ if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -682,7 +689,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
return false;
}
else {
- if (curve_intersect_recursive(P, dir, curve, isect)) {
+ if (curve_intersect_recursive(P, dir, tmax, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -693,28 +700,23 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
+ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float t = isect->t;
- float3 P = ray->P;
- float3 D = ray->D;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
D = normalize_len(D, &t);
}
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ int prim = kernel_tex_fetch(__prim_index, isect_prim);
float4 v00 = kernel_tex_fetch(__curves, prim);
int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -735,23 +737,20 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
}
- sd->u = isect->u;
-
P = P + D * t;
- const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u);
+ const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u);
const float3 dPdu = float4_to_float3(dPdu4);
if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* Rounded smooth normals for ribbons, to approximate thick curve shape. */
const float3 tangent = normalize(dPdu);
const float3 bitangent = normalize(cross(tangent, -D));
- const float sine = isect->v;
+ const float sine = sd->v;
const float cosine = safe_sqrtf(1.0f - sine * sine);
sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
sd->Ng = -D;
- sd->v = isect->v;
# if 0
/* This approximates the position and geometric normal of a thick curve too,
@@ -765,7 +764,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
/* Thick curves, compute normal using direction from inside the curve.
* This could be optimized by recording the normal in the intersection,
* however for Optix this would go beyond the size of the payload. */
- const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
+ const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
const float3 Ng = normalize(P - P_inside);
sd->N = Ng;
@@ -779,13 +778,8 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
sd->dPdv = cross(dPdu, sd->Ng);
# endif
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
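
The hunks above replace reads of isect->t with an explicit ray_tfar/tmax parameter, which is tightened whenever ribbon_intersect() records a closer hit. A minimal standalone C++ sketch of that pattern; segment_hit() and intersect_segments() are hypothetical helpers for illustration, not Cycles code:

#include <cfloat>

struct Hit {
  float t = FLT_MAX;
  float u = 0.0f;
};

/* Hypothetical per-segment test: reports a hit only if it lies closer than
 * the current ray extent. */
static bool segment_hit(float u, float ray_tfar, float *t_hit)
{
  const float t = 1.0f + u; /* placeholder hit distance */
  if (t >= ray_tfar) {
    return false;
  }
  *t_hit = t;
  return true;
}

static bool intersect_segments(float ray_tfar, Hit *hit)
{
  bool found = false;
  for (int i = 0; i < 4; i++) {
    float t;
    if (segment_hit(i * 0.25f, ray_tfar, &t)) {
      /* Shrink ray_tfar so later segments are tested against the closest
       * hit found so far, mirroring the ray_tfar updates above. */
      ray_tfar = t;
      hit->t = t;
      hit->u = i * 0.25f;
      found = true;
    }
  }
  return found;
}
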
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 0f66f4af755..5294da03145 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Motion Curve Primitive
@@ -25,7 +27,7 @@ CCL_NAMESPACE_BEGIN
#ifdef __HAIR__
-ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_curve_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -50,7 +52,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step_linear(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -78,7 +80,7 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
/* return 2 curve key locations */
ccl_device_inline void motion_curve_keys_linear(
- KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
+ const KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
{
/* get motion info */
int numsteps, numkeys;
@@ -105,7 +107,7 @@ ccl_device_inline void motion_curve_keys_linear(
keys[1] = (1.0f - t) * keys[1] + t * next_keys[1];
}
-ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -138,7 +140,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
}
/* return 2 curve key locations */
-ccl_device_inline void motion_curve_keys(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys(const KernelGlobals *kg,
int object,
int prim,
float time,
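
The motion key fetches above blend two motion steps around the sample time (see the "(1.0f - t) * keys[1] + t * next_keys[1]" line in the hunk). A simplified standalone sketch of the step selection and blend; motion_step_blend() and blend_key() are illustrative names and the step indexing is an assumption based on the surrounding code, not a literal copy of the kernel:

#include <algorithm>

/* Pick the motion interval and blend factor for a time in [0, 1], with
 * numsteps motion steps on either side of the center step. */
static void motion_step_blend(float time, int numsteps, int *step, float *t)
{
  const int maxstep = numsteps * 2;
  *step = std::min((int)(time * (float)maxstep), maxstep - 1);
  *t = time * (float)maxstep - (float)(*step);
}

/* Blend one key between the chosen step and the next one. */
static float blend_key(float key, float next_key, float t)
{
  return (1.0f - t) * key + t * next_key;
}
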
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 53d6b92dd7e..eb4a39e062b 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -25,11 +25,13 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Time interpolation of vertex positions and normals */
-ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -49,7 +51,7 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_verts_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -76,7 +78,7 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
}
}
-ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_normals_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -104,7 +106,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
}
ccl_device_inline void motion_triangle_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
{
/* get motion info */
int numsteps, numverts;
@@ -134,7 +136,7 @@ ccl_device_inline void motion_triangle_vertices(
}
ccl_device_inline float3 motion_triangle_smooth_normal(
- KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
+ const KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
{
/* get motion info */
int numsteps, numverts;
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index 859d919f0bb..ec7e4b07d76 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Refine triangle intersection to more precise hit point. For rays that travel
@@ -32,23 +34,21 @@ CCL_NAMESPACE_BEGIN
* a closer distance.
*/
-ccl_device_inline float3 motion_triangle_refine(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+ccl_device_inline float3 motion_triangle_refine(const KernelGlobals *kg,
+ ShaderData *sd,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
+ float3 verts[3])
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -70,13 +70,8 @@ ccl_device_inline float3 motion_triangle_refine(
/* Compute refined position. */
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -86,7 +81,7 @@ ccl_device_inline float3 motion_triangle_refine(
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space
+/* Same as above, except that t is assumed to be in object space
* for instancing.
*/
@@ -97,27 +92,22 @@ ccl_device_noinline
ccl_device_inline
# endif
float3
- motion_triangle_refine_local(KernelGlobals *kg,
+ motion_triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
float3 verts[3])
{
# ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return motion_triangle_refine(kg, sd, isect, ray, verts);
+ /* t is always in world space with OptiX. */
+ return motion_triangle_refine(kg, sd, P, D, t, isect_object, isect_prim, verts);
# else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
# ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -138,13 +128,8 @@ ccl_device_inline
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -160,10 +145,11 @@ ccl_device_inline
* time and do a ray intersection with the resulting triangle.
*/
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
float time,
uint visibility,
int object,
@@ -179,7 +165,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
(ssef *)verts,
#else
@@ -215,7 +201,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
* Returns whether traversal should be stopped.
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
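
motion_triangle_refine() and motion_triangle_refine_local() now take the ray position, direction and hit distance as separate P/D/t arguments instead of Intersection and Ray pointers, and motion_triangle_intersect() tests against an explicit tmax. A hedged standalone sketch of the transform, advance, transform-back refinement pattern those functions use; the stub transforms and refine_hit() are placeholders, and the sketch omits the normalize_len() rescaling of t that the kernel performs:

struct float3 {
  float x, y, z;
};

struct Transform {
  float m[12]; /* 3x4 matrix in the real kernel */
};

/* Stubs standing in for transform_point()/transform_direction(). */
static float3 transform_point_stub(const Transform & /*tfm*/, const float3 &P)
{
  return P;
}

static float3 transform_direction_stub(const Transform & /*tfm*/, const float3 &D)
{
  return D;
}

static float3 refine_hit(float3 P, float3 D, float t,
                         const Transform &world_to_object,
                         const Transform &object_to_world,
                         bool instanced)
{
  if (instanced) {
    /* Work in object space, as in motion_triangle_refine(). */
    P = transform_point_stub(world_to_object, P);
    D = transform_direction_stub(world_to_object, D);
  }
  /* Advance to the refined hit distance. */
  P.x += D.x * t;
  P.y += D.y * t;
  P.z += D.z * t;
  if (instanced) {
    /* Back to world space for shading. */
    P = transform_point_stub(object_to_world, P);
  }
  return P;
}
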
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 7a91f8041f7..85c4f0ca522 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Setup of motion triangle specific parts of ShaderData, moved into this one
@@ -32,8 +34,14 @@ CCL_NAMESPACE_BEGIN
* normals */
/* return 3 triangle vertex normals */
-ccl_device_noinline void motion_triangle_shader_setup(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool is_local)
+ccl_device_noinline void motion_triangle_shader_setup(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 P,
+ const float3 D,
+ const float ray_t,
+ const int isect_object,
+ const int isect_prim,
+ bool is_local)
{
/* Get shader. */
sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
@@ -63,12 +71,12 @@ ccl_device_noinline void motion_triangle_shader_setup(
/* Compute refined position. */
#ifdef __BVH_LOCAL__
if (is_local) {
- sd->P = motion_triangle_refine_local(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine_local(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
else
#endif /* __BVH_LOCAL__*/
{
- sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
/* Compute face normal. */
float3 Ng;
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index fe73335a335..7d6ad7b4fe3 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -22,6 +22,8 @@
* directly primitives in the BVH with world space locations applied, and the object
* ID is looked up afterwards. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Object attributes, for now a fixed size and contents */
@@ -35,7 +37,7 @@ enum ObjectVectorTransform { OBJECT_PASS_MOTION_PRE = 0, OBJECT_PASS_MOTION_POST
/* Object to world space transformation */
-ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform(const KernelGlobals *kg,
int object,
enum ObjectTransform type)
{
@@ -49,7 +51,7 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
/* Lamp to world space transformation */
-ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse)
+ccl_device_inline Transform lamp_fetch_transform(const KernelGlobals *kg, int lamp, bool inverse)
{
if (inverse) {
return kernel_tex_fetch(__lights, lamp).itfm;
@@ -61,7 +63,7 @@ ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bo
/* Object to world space transformation for motion vectors */
-ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_motion_pass_transform(const KernelGlobals *kg,
int object,
enum ObjectVectorTransform type)
{
@@ -72,7 +74,7 @@ ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg
/* Motion blurred object transformations */
#ifdef __OBJECT_MOTION__
-ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion(const KernelGlobals *kg,
int object,
float time)
{
@@ -86,7 +88,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
return tfm;
}
-ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion_test(const KernelGlobals *kg,
int object,
float time,
Transform *itfm)
@@ -111,45 +113,79 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
}
#endif
+/* Get transform matrix for shading point. */
+
+ccl_device_inline Transform object_get_transform(const KernelGlobals *kg, const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_tfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#endif
+}
+
+ccl_device_inline Transform object_get_inverse_transform(const KernelGlobals *kg,
+ const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_itfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+}
/* Transform position from object to world space */
-ccl_device_inline void object_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_tfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_tfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform position from world to object space */
-ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_itfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_itfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform normal from world to object space */
-ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_normal_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *N)
{
#ifdef __OBJECT_MOTION__
- if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
- *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm_motion, *N));
+ }
+ return;
}
-#else
+#endif
+
if (sd->object != OBJECT_NONE) {
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
@@ -158,65 +194,79 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
*N = normalize(transform_direction_transposed(&tfm, *N));
}
-#endif
}
/* Transform normal from object to world space */
-ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
+ccl_device_inline void object_normal_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *N)
{
#ifdef __OBJECT_MOTION__
- *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm_motion, *N));
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
-#endif
}
/* Transform direction vector from object to world space */
-ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
+ccl_device_inline void object_dir_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_tfm, *D);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_tfm_motion, *D);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*D = transform_direction(&tfm, *D);
-#endif
}
/* Transform direction vector from world to object space */
-ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_dir_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_itfm, *D);
-#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- *D = transform_direction(&tfm, *D);
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_itfm_motion, *D);
+ return;
+ }
#endif
+
+ const Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+ *D = transform_direction(&tfm, *D);
}
/* Object center position */
-ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline float3 object_location(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
#ifdef __OBJECT_MOTION__
- return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ return make_float3(sd->ob_tfm_motion.x.w, sd->ob_tfm_motion.y.w, sd->ob_tfm_motion.z.w);
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-#endif
}
/* Color of the object */
-ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_color(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -227,7 +277,7 @@ ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
/* Pass ID number of object */
-ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_pass_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -237,7 +287,7 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
/* Per lamp random number for shader variation */
-ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
+ccl_device_inline float lamp_random_number(const KernelGlobals *kg, int lamp)
{
if (lamp == LAMP_NONE)
return 0.0f;
@@ -247,7 +297,7 @@ ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
/* Per object random number for shader variation */
-ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
+ccl_device_inline float object_random_number(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -257,7 +307,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
/* Particle ID from which this object was generated */
-ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
+ccl_device_inline int object_particle_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -267,7 +317,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
/* Generated texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_generated(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -279,7 +329,7 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
/* UV texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_uv(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -291,7 +341,7 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
/* Information about mesh for motion blurred triangles and curves */
ccl_device_inline void object_motion_info(
- KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
+ const KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
{
if (numkeys) {
*numkeys = kernel_tex_fetch(__objects, object).numkeys;
@@ -305,7 +355,7 @@ ccl_device_inline void object_motion_info(
/* Offset to an objects patch map */
-ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_patch_map_offset(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -315,7 +365,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
/* Volume step size */
-ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_density(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return 1.0f;
@@ -324,7 +374,7 @@ ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).volume_density;
}
-ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_step_size(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return kernel_data.background.volume_step_size;
@@ -335,14 +385,14 @@ ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
/* Pass ID for shader */
-ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
+ccl_device int shader_pass_id(const KernelGlobals *kg, const ShaderData *sd)
{
return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
}
/* Cryptomatte ID */
-ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -350,7 +400,7 @@ ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).cryptomatte_object;
}
-ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_asset_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -360,42 +410,42 @@ ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int objec
/* Particle data from which object was instanced */
-ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
+ccl_device_inline uint particle_index(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).index;
}
-ccl_device float particle_age(KernelGlobals *kg, int particle)
+ccl_device float particle_age(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).age;
}
-ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
+ccl_device float particle_lifetime(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).lifetime;
}
-ccl_device float particle_size(KernelGlobals *kg, int particle)
+ccl_device float particle_size(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).size;
}
-ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
+ccl_device float4 particle_rotation(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).rotation;
}
-ccl_device float3 particle_location(KernelGlobals *kg, int particle)
+ccl_device float3 particle_location(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).location);
}
-ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity);
}
-ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_angular_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity);
}
@@ -418,7 +468,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
/* Transform ray into object space to enter static object in BVH */
ccl_device_inline float bvh_instance_push(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir)
{
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -428,17 +478,18 @@ ccl_device_inline float bvh_instance_push(
*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit static object in BVH. */
-ccl_device_inline float bvh_instance_pop(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ccl_device_inline float bvh_instance_pop(const KernelGlobals *kg,
+ int object,
+ const Ray *ray,
+ float3 *P,
+ float3 *dir,
+ float3 *idir,
+ float t)
{
if (t != FLT_MAX) {
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -454,7 +505,7 @@ ccl_device_inline float bvh_instance_pop(
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -473,13 +524,12 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
#ifdef __OBJECT_MOTION__
/* Transform ray into object space to enter motion blurred object in BVH */
-ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
float3 *dir,
float3 *idir,
- float t,
Transform *itfm)
{
object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -490,16 +540,12 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit motion blurred object in BVH. */
-ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_pop(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -521,7 +567,7 @@ ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_motion_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -538,48 +584,11 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
#endif
-/* TODO(sergey): This is only for until we've got OpenCL 2.0
- * on all devices we consider supported. It'll be replaced with
- * generic address space.
- */
+/* TODO: This can be removed once we know whether any devices will require
+ * explicit address space qualifiers for this case. */
-#ifdef __KERNEL_OPENCL__
-ccl_device_inline void object_position_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *P)
-{
- float3 private_P = *P;
- object_position_transform(kg, sd, &private_P);
- *P = private_P;
-}
-
-ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *D)
-{
- float3 private_D = *D;
- object_dir_transform(kg, sd, &private_D);
- *D = private_D;
-}
-
-ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *N)
-{
- float3 private_N = *N;
- object_normal_transform(kg, sd, &private_N);
- *N = private_N;
-}
-#endif
-
-#ifndef __KERNEL_OPENCL__
-# define object_position_transform_auto object_position_transform
-# define object_dir_transform_auto object_dir_transform
-# define object_normal_transform_auto object_normal_transform
-#else
-# define object_position_transform_auto object_position_transform_addrspace
-# define object_dir_transform_auto object_dir_transform_addrspace
-# define object_normal_transform_auto object_normal_transform_addrspace
-#endif
+#define object_position_transform_auto object_position_transform
+#define object_dir_transform_auto object_dir_transform
+#define object_normal_transform_auto object_normal_transform
CCL_NAMESPACE_END
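
The new object_get_transform() and object_get_inverse_transform() helpers above replace the compile-time #ifdef __OBJECT_MOTION__ choice with a per-object runtime check of SD_OBJECT_MOTION, so objects without motion blur in a motion-enabled kernel fall back to the static transform tables. The bvh_instance_push()/bvh_instance_motion_push() hunks make a related simplification: they now return the direction length, leaving it to the caller to scale whatever distances it tracks. A small standalone sketch of the runtime transform selection; the _stub helper and the ShaderDataLike type are placeholders, not kernel code:

struct Transform {
  float m[12];
};

enum {
  SD_OBJECT_MOTION = (1 << 0), /* flag value is illustrative only */
};

struct ShaderDataLike {
  int object;
  int object_flag;
  Transform ob_tfm_motion; /* valid only when SD_OBJECT_MOTION is set */
};

/* Stand-in for object_fetch_transform(kg, object, OBJECT_TRANSFORM). */
static Transform fetch_static_transform_stub(int /*object*/)
{
  return Transform{};
}

static Transform get_object_transform(const ShaderDataLike &sd)
{
  /* Motion-blurred instances carry their interpolated transform in
   * ShaderData; everything else reads the static object transform. */
  return (sd.object_flag & SD_OBJECT_MOTION) ? sd.ob_tfm_motion :
                                               fetch_static_transform_stub(sd.object);
}
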
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 9c1768f05db..ce0fc15f196 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -24,6 +24,8 @@
* language governing permissions and limitations under the Apache License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
typedef struct PatchHandle {
@@ -60,7 +62,7 @@ ccl_device_inline int patch_map_resolve_quadrant(float median, float *u, float *
/* retrieve PatchHandle from patch coords */
ccl_device_inline PatchHandle
-patch_map_find_patch(KernelGlobals *kg, int object, int patch, float u, float v)
+patch_map_find_patch(const KernelGlobals *kg, int object, int patch, float u, float v)
{
PatchHandle handle;
@@ -191,7 +193,7 @@ ccl_device_inline void patch_eval_normalize_coords(uint patch_bits, float *u, fl
/* retrieve patch control indices */
-ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
+ccl_device_inline int patch_eval_indices(const KernelGlobals *kg,
const PatchHandle *handle,
int channel,
int indices[PATCH_MAX_CONTROL_VERTS])
@@ -208,7 +210,7 @@ ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
/* evaluate patch basis functions */
-ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
+ccl_device_inline void patch_eval_basis(const KernelGlobals *kg,
const PatchHandle *handle,
float u,
float v,
@@ -247,7 +249,7 @@ ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
/* generic function for evaluating indices and weights from patch coords */
-ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
+ccl_device_inline int patch_eval_control_verts(const KernelGlobals *kg,
int object,
int patch,
float u,
@@ -269,7 +271,7 @@ ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
/* functions for evaluating attributes on patches */
-ccl_device float patch_eval_float(KernelGlobals *kg,
+ccl_device float patch_eval_float(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -306,7 +308,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg,
return val;
}
-ccl_device float2 patch_eval_float2(KernelGlobals *kg,
+ccl_device float2 patch_eval_float2(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -343,7 +345,7 @@ ccl_device float2 patch_eval_float2(KernelGlobals *kg,
return val;
}
-ccl_device float3 patch_eval_float3(KernelGlobals *kg,
+ccl_device float3 patch_eval_float3(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -380,7 +382,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_float4(KernelGlobals *kg,
+ccl_device float4 patch_eval_float4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -417,7 +419,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_uchar4(KernelGlobals *kg,
+ccl_device float4 patch_eval_uchar4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index aeb044c9ad3..ba31b12e817 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -19,6 +19,10 @@
* Generic functions to look up mesh, curve and volume primitive attributes for
* shading and render passes. */
+#pragma once
+
+#include "kernel/kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Surface Attributes
@@ -27,8 +31,11 @@ CCL_NAMESPACE_BEGIN
* attributes for performance, mainly for GPU performance to avoid bringing in
* heavy volume interpolation code. */
-ccl_device_inline float primitive_surface_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_inline float primitive_surface_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -50,7 +57,7 @@ ccl_device_inline float primitive_surface_attribute_float(
}
}
-ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
+ccl_device_inline float2 primitive_surface_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -76,7 +83,7 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_surface_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -102,11 +109,11 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_surface_attribute_float4(KernelGlobals *kg,
- const ShaderData *sd,
- const AttributeDescriptor desc,
- float4 *dx,
- float4 *dy)
+ccl_device_forceinline float4 primitive_surface_attribute_float4(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -141,7 +148,7 @@ ccl_device_inline bool primitive_is_volume_attribute(const ShaderData *sd,
return sd->type == PRIMITIVE_VOLUME;
}
-ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
+ccl_device_inline float primitive_volume_attribute_float(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -153,7 +160,7 @@ ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_volume_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -165,7 +172,7 @@ ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
+ccl_device_inline float4 primitive_volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -180,7 +187,7 @@ ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
/* Default UV coordinate */
-ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 primitive_uv(const KernelGlobals *kg, const ShaderData *sd)
{
const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV);
@@ -193,7 +200,7 @@ ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
/* Ptex coordinates */
-ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
+ccl_device bool primitive_ptex(const KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
{
/* storing ptex data as attributes is not memory efficient but simple for tests */
const AttributeDescriptor desc_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID);
@@ -213,7 +220,7 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
/* Surface tangent */
-ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 primitive_tangent(const KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
if (sd->type & PRIMITIVE_ALL_CURVE)
@@ -245,7 +252,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
/* Motion vector for motion pass */
-ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float4 primitive_motion_vector(const KernelGlobals *kg, const ShaderData *sd)
{
/* center position */
float3 center;
diff --git a/intern/cycles/kernel/geom/geom_shader_data.h b/intern/cycles/kernel/geom/geom_shader_data.h
new file mode 100644
index 00000000000..fb2cb5cb1ea
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_shader_data.h
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Functions to initialize ShaderData.
+ *
+ * The setup can be from an incoming ray, an intersection or a sampled position. */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* ShaderData setup from incoming ray */
+
+#ifdef __OBJECT_MOTION__
+ccl_device void shader_setup_object_transforms(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ float time)
+{
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time);
+ sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion);
+ }
+}
+#endif
+
+/* TODO: break this up if it helps reduce register pressure, loading data from
+ * global memory only as we write it into ShaderData. */
+ccl_device_inline void shader_setup_from_ray(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray,
+ const Intersection *ccl_restrict isect)
+{
+ /* Read intersection data into shader globals.
+ *
+ * TODO: this is redundant, could potentially remove some of this from
+ * ShaderData but would need to ensure that it also works for shadow
+ * shader evaluation. */
+ sd->u = isect->u;
+ sd->v = isect->v;
+ sd->ray_length = isect->t;
+ sd->type = isect->type;
+ sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
+ isect->object;
+ sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
+ sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+ sd->lamp = LAMP_NONE;
+ sd->flag = 0;
+
+ /* Read matrices and time. */
+ sd->time = ray->time;
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, ray->time);
+#endif
+
+ /* Read ray data into shader globals. */
+ sd->I = -ray->D;
+
+#ifdef __HAIR__
+ if (sd->type & PRIMITIVE_ALL_CURVE) {
+ /* curve */
+ curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ }
+ else
+#endif
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* static triangle */
+ float3 Ng = triangle_normal(kg, sd);
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+
+ /* vectors */
+ sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ sd->Ng = Ng;
+ sd->N = Ng;
+
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL)
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+#endif
+ }
+ else {
+ /* motion triangle */
+ motion_triangle_shader_setup(
+ kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
+ }
+
+ sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+
+ if (isect->object != OBJECT_NONE) {
+ /* instance transform */
+ object_normal_transform_auto(kg, sd, &sd->N);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+#ifdef __DPDU__
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+#endif
+ }
+
+ /* backfacing test */
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ differential_transfer_compact(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, sd->ray_length);
+ differential_incoming_compact(&sd->dI, ray->D, ray->dD);
+ differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+#endif
+}
+
+/* ShaderData setup from position sampled on mesh */
+
+ccl_device_inline void shader_setup_from_sample(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 P,
+ const float3 Ng,
+ const float3 I,
+ int shader,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float t,
+ float time,
+ bool object_space,
+ int lamp)
+{
+ /* vectors */
+ sd->P = P;
+ sd->N = Ng;
+ sd->Ng = Ng;
+ sd->I = I;
+ sd->shader = shader;
+ if (prim != PRIM_NONE)
+ sd->type = PRIMITIVE_TRIANGLE;
+ else if (lamp != LAMP_NONE)
+ sd->type = PRIMITIVE_LAMP;
+ else
+ sd->type = PRIMITIVE_NONE;
+
+ /* primitive */
+ sd->object = object;
+ sd->lamp = LAMP_NONE;
+ /* Currently no access to bvh prim index for strand sd->prim. */
+ sd->prim = prim;
+ sd->u = u;
+ sd->v = v;
+ sd->time = time;
+ sd->ray_length = t;
+
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ if (sd->object != OBJECT_NONE) {
+ sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, time);
+#endif
+ }
+ else if (lamp != LAMP_NONE) {
+ sd->lamp = lamp;
+ }
+
+ /* transform into world space */
+ if (object_space) {
+ object_position_transform_auto(kg, sd, &sd->P);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+ sd->N = sd->Ng;
+ object_dir_transform_auto(kg, sd, &sd->I);
+ }
+
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL) {
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_normal_transform_auto(kg, sd, &sd->N);
+ }
+ }
+
+ /* dPdu/dPdv */
+#ifdef __DPDU__
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+ }
+#endif
+ }
+ else {
+#ifdef __DPDU__
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+ }
+
+ /* backfacing test */
+ if (sd->prim != PRIM_NONE) {
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* no ray differentials here yet */
+ sd->dP = differential3_zero();
+ sd->dI = differential3_zero();
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup for displacement */
+
+ccl_device void shader_setup_from_displace(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ int object,
+ int prim,
+ float u,
+ float v)
+{
+ float3 P, Ng, I = zero_float3();
+ int shader;
+
+ triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
+
+ /* force smooth shading for displacement */
+ shader |= SHADER_SMOOTH_NORMAL;
+
+ shader_setup_from_sample(
+ kg,
+ sd,
+ P,
+ Ng,
+ I,
+ shader,
+ object,
+ prim,
+ u,
+ v,
+ 0.0f,
+ 0.5f,
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
+ LAMP_NONE);
+}
+
+/* ShaderData setup from ray into background */
+
+ccl_device_inline void shader_setup_from_background(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 ray_P,
+ const float3 ray_D,
+ const float ray_time)
+{
+ /* for NDC coordinates */
+ sd->ray_P = ray_P;
+
+ /* vectors */
+ sd->P = ray_D;
+ sd->N = -ray_D;
+ sd->Ng = -ray_D;
+ sd->I = -ray_D;
+ sd->shader = kernel_data.background.surface_shader;
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ sd->time = ray_time;
+ sd->ray_length = 0.0f;
+
+ sd->object = OBJECT_NONE;
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO: ray->dP */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup from point inside volume */
+
+#ifdef __VOLUME__
+ccl_device_inline void shader_setup_from_volume(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray)
+{
+
+ /* vectors */
+ sd->P = ray->P;
+ sd->N = -ray->D;
+ sd->Ng = -ray->D;
+ sd->I = -ray->D;
+ sd->shader = SHADER_NONE;
+ sd->flag = 0;
+ sd->object_flag = 0;
+ sd->time = ray->time;
+ sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
+
+ sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->type = PRIMITIVE_VOLUME;
+
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+# ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+# endif
+
+# ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO ray->dD */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+# endif
+
+ /* for NDC coordinates */
+ sd->ray_P = ray->P;
+ sd->ray_dP = ray->dP;
+}
+#endif /* __VOLUME__ */
+
+CCL_NAMESPACE_END
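
Both shader_setup_from_ray() and shader_setup_from_sample() in the new header end with the same backfacing handling. A self-contained sketch of just that step, with simplified stand-in types rather than the actual ShaderData layout:

struct float3 {
  float x, y, z;
};

static float dot(const float3 &a, const float3 &b)
{
  return a.x * b.x + a.y * b.y + a.z * b.z;
}

static float3 negate(const float3 &v)
{
  return {-v.x, -v.y, -v.z};
}

struct ShadingPoint {
  float3 Ng; /* geometric normal */
  float3 N;  /* shading normal */
  float3 I;  /* direction towards the viewer (-ray direction) */
  bool backfacing;
};

static void apply_backfacing(ShadingPoint &sd)
{
  /* If the geometric normal faces away from the viewer, flag the hit as
   * backfacing and flip both normals; the kernel also flips dPdu/dPdv. */
  sd.backfacing = dot(sd.Ng, sd.I) < 0.0f;
  if (sd.backfacing) {
    sd.Ng = negate(sd.Ng);
    sd.N = negate(sd.N);
  }
}
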
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 9eceb996926..877b2ece15b 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -16,18 +16,20 @@
/* Functions for retrieving attributes on triangles produced from subdivision meshes */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Patch index for triangle, -1 if not subdivision triangle */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd)
{
return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
}
/* UV coords of triangle within patch */
-ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
+ccl_device_inline void subd_triangle_patch_uv(const KernelGlobals *kg,
const ShaderData *sd,
float2 uv[3])
{
@@ -40,7 +42,7 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
/* Vertex indices of patch */
-ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch)
+ccl_device_inline uint4 subd_triangle_patch_indices(const KernelGlobals *kg, int patch)
{
uint4 indices;
@@ -54,21 +56,23 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch
/* Originating face for patch */
-ccl_device_inline uint subd_triangle_patch_face(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_face(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 4);
}
/* Number of corners on originating face */
-ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_num_corners(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 5) & 0xffff;
}
/* Indices of the four corners that are used by the patch */
-ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, int corners[4])
+ccl_device_inline void subd_triangle_patch_corners(const KernelGlobals *kg,
+ int patch,
+ int corners[4])
{
uint4 data;
@@ -99,8 +103,11 @@ ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch,
/* Reading attributes on various subdivision triangle elements */
-ccl_device_noinline float subd_triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_noinline float subd_triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
int patch = subd_triangle_patch(kg, sd);
@@ -235,7 +242,7 @@ ccl_device_noinline float subd_triangle_attribute_float(
}
}
-ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
+ccl_device_noinline float2 subd_triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -378,7 +385,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
+ccl_device_noinline float3 subd_triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -520,7 +527,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg,
+ccl_device_noinline float4 subd_triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index ff7909ca425..910fb122c6d 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -20,10 +20,12 @@
* ray intersection we use a precomputed triangle storage to accelerate
* intersection at the cost of more memory usage */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Normal on triangle. */
-ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 triangle_normal(const KernelGlobals *kg, ShaderData *sd)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
@@ -41,8 +43,14 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
}
/* Point and normal on triangle. */
-ccl_device_inline void triangle_point_normal(
- KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+ccl_device_inline void triangle_point_normal(const KernelGlobals *kg,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float3 *P,
+ float3 *Ng,
+ int *shader)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -67,7 +75,7 @@ ccl_device_inline void triangle_point_normal(
/* Triangle vertex locations */
-ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
+ccl_device_inline void triangle_vertices(const KernelGlobals *kg, int prim, float3 P[3])
{
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w + 0));
@@ -77,7 +85,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Triangle vertex locations and vertex normals */
-ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
+ccl_device_inline void triangle_vertices_and_normals(const KernelGlobals *kg,
int prim,
float3 P[3],
float3 N[3])
@@ -94,7 +102,7 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
/* Interpolate smooth vertex normal from vertices */
ccl_device_inline float3
-triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
+triangle_smooth_normal(const KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -108,7 +116,7 @@ triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
}
ccl_device_inline float3 triangle_smooth_normal_unnormalized(
- KernelGlobals *kg, ShaderData *sd, float3 Ng, int prim, float u, float v)
+ const KernelGlobals *kg, const ShaderData *sd, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -130,7 +138,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
/* Ray differentials on triangle */
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
+ccl_device_inline void triangle_dPdudv(const KernelGlobals *kg,
int prim,
ccl_addr_space float3 *dPdu,
ccl_addr_space float3 *dPdv)
@@ -148,8 +156,11 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
/* Reading attributes on various triangle elements */
-ccl_device float triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION | ATTR_ELEMENT_CORNER)) {
float f0, f1, f2;
@@ -195,7 +206,7 @@ ccl_device float triangle_attribute_float(
}
}
-ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
+ccl_device float2 triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -245,7 +256,7 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
+ccl_device float3 triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -295,7 +306,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 triangle_attribute_float4(KernelGlobals *kg,
+ccl_device float4 triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index b0cce274b94..30b77ebd2eb 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -20,12 +20,17 @@
* intersection at the cost of more memory usage.
*/
+#pragma once
+
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
uint visibility,
int object,
int prim_addr)
@@ -41,7 +46,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
ssef_verts,
#else
@@ -78,7 +83,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
@@ -192,25 +197,20 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
* http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf
*/
-ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -219,7 +219,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * t;
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -239,13 +239,8 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * rt;
}
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -255,28 +250,23 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space for
+/* Same as above, except that t is assumed to be in object space for
* instancing.
*/
-ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
#ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return triangle_refine(kg, sd, isect, ray);
+ /* t is always in world space with OptiX. */
+ return triangle_refine(kg, sd, P, D, t, isect_object, isect_prim);
#else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -286,7 +276,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
P = P + D * t;
# ifdef __INTERSECTION_REFINE__
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -307,13 +297,8 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
}
# endif /* __INTERSECTION_REFINE__ */
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
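
The refine functions above post-process a hit produced by the ray/triangle test described in the paper linked in this file. For comparison, here is a self-contained sketch of the classic Möller-Trumbore test from that paper; it is not the SSE/Woop-style code Cycles actually uses, and the epsilon and tmax handling is simplified:

    #include <cmath>
    #include <cstdio>

    struct Vec3 { float x, y, z; };
    static Vec3 sub(Vec3 a, Vec3 b) { return {a.x - b.x, a.y - b.y, a.z - b.z}; }
    static Vec3 cross(Vec3 a, Vec3 b) {
      return {a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x};
    }
    static float dot(Vec3 a, Vec3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }

    /* Möller-Trumbore ray/triangle test: returns true on hit and writes t, u, v. */
    static bool ray_triangle(Vec3 P, Vec3 D, Vec3 v0, Vec3 v1, Vec3 v2,
                             float tmax, float *t, float *u, float *v)
    {
      const Vec3 e1 = sub(v1, v0), e2 = sub(v2, v0);
      const Vec3 pvec = cross(D, e2);
      const float det = dot(e1, pvec);
      if (fabsf(det) < 1e-8f) return false; /* Ray is (nearly) parallel to the triangle. */
      const float inv_det = 1.0f / det;
      const Vec3 tvec = sub(P, v0);
      *u = dot(tvec, pvec) * inv_det;
      if (*u < 0.0f || *u > 1.0f) return false;
      const Vec3 qvec = cross(tvec, e1);
      *v = dot(D, qvec) * inv_det;
      if (*v < 0.0f || *u + *v > 1.0f) return false;
      *t = dot(e2, qvec) * inv_det;
      return *t > 0.0f && *t < tmax;
    }

    int main()
    {
      float t, u, v;
      const bool hit = ray_triangle({0, 0, -1}, {0, 0, 1},
                                    {-1, -1, 0}, {1, -1, 0}, {0, 1, 0},
                                    1e30f, &t, &u, &v);
      printf("hit=%d t=%g u=%g v=%g\n", hit, t, u, v); /* hit=1 t=1 u=0.25 v=0.5 */
      return 0;
    }
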
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 809b76245ba..2bcd7e56b5f 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -23,13 +23,15 @@
* 3D voxel textures can be assigned as attributes per mesh, which means the
* same shader can be used for volume objects with different densities, etc. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__
/* Return position normalized to 0..1 in mesh bounds */
-ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
+ccl_device_inline float3 volume_normalized_position(const KernelGlobals *kg,
const ShaderData *sd,
float3 P)
{
@@ -68,7 +70,7 @@ ccl_device float3 volume_attribute_value_to_float3(const float4 value)
}
}
-ccl_device float4 volume_attribute_float4(KernelGlobals *kg,
+ccl_device float4 volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
new file mode 100644
index 00000000000..4898ff936c6
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/geom/geom.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* This helps with anti-aliasing, but it is not a real solution since it does not
+ * anti-alias the geometry itself; it is better than nothing, though, so it was kept. */
+ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
+{
+ /* Use mirror repeat (like an OpenGL texture) so that a barycentric coordinate
+ * that goes past the edge of the triangle is not always clamped to the same
+ * value, which would give ugly patterns. */
+ u /= max;
+ float fu = floorf(u);
+ u = u - fu;
+
+ return ((((int)fu) & 1) ? 1.0f - u : u) * max;
+}
+
+/* Return false to indicate that this pixel is finished.
+ * Used by the CPU implementation to avoid sampling a pixel for further samples
+ * once it is known that the pixel has converged. */
+ccl_device bool integrator_init_from_bake(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Always count the sample, even if the camera sample will reject the ray. */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Setup render buffers. */
+ const int index = INTEGRATOR_STATE(path, render_pixel_index);
+ const int pass_stride = kernel_data.film.pass_stride;
+ render_buffer += index * pass_stride;
+
+ ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive;
+ ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential;
+
+ const int seed = __float_as_uint(primitive[0]);
+ int prim = __float_as_uint(primitive[1]);
+ if (prim == -1) {
+ return false;
+ }
+
+ prim += kernel_data.bake.tri_offset;
+
+ /* Random number generator. */
+ const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
+
+ float filter_x, filter_y;
+ if (sample == 0) {
+ filter_x = filter_y = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Barycentric UV with sub-pixel offset. */
+ float u = primitive[2];
+ float v = primitive[3];
+
+ float dudx = differential[0];
+ float dudy = differential[1];
+ float dvdx = differential[2];
+ float dvdy = differential[3];
+
+ if (sample > 0) {
+ u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
+ v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
+ 1.0f - u);
+ }
+
+ /* Position and normal on triangle. */
+ float3 P, Ng;
+ int shader;
+ triangle_point_normal(kg, kernel_data.bake.object_index, prim, u, v, &P, &Ng, &shader);
+ shader &= SHADER_MASK;
+
+ if (kernel_data.film.pass_background != PASS_UNUSED) {
+ /* Environment baking. */
+
+ /* Setup and write ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = zero_float3();
+ ray.D = normalize(P);
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup next kernel to execute. */
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else {
+ /* Surface baking. */
+
+ /* Setup ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = P + Ng;
+ ray.D = -Ng;
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+
+ /* Setup differentials. */
+ float3 dPdu, dPdv;
+ triangle_dPdudv(kg, prim, &dPdu, &dPdv);
+ differential3 dP;
+ dP.dx = dPdu * dudx + dPdv * dvdx;
+ dP.dy = dPdu * dudy + dPdv * dvdy;
+ ray.dP = differential_make_compact(dP);
+ ray.dD = differential_zero_compact();
+
+ /* Write ray. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup and write intersection. */
+ Intersection isect ccl_optional_struct_init;
+ isect.object = kernel_data.bake.object_index;
+ isect.prim = prim;
+ isect.u = u;
+ isect.v = v;
+ isect.t = 1.0f;
+ isect.type = PRIMITIVE_TRIANGLE;
+#ifdef __EMBREE__
+ isect.Ng = Ng;
+#endif
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Setup next kernel to execute. */
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
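
The mirror-repeat clamp used above for the baking sub-pixel jitter can be exercised on its own. A minimal standalone copy of the same mapping, with a few example values worked out in the comments:

    #include <cmath>
    #include <cstdio>

    /* Same mapping as bake_clamp_mirror_repeat: wrap u into [0, max] with mirroring. */
    static float clamp_mirror_repeat(float u, float max)
    {
      u /= max;
      const float fu = floorf(u);
      u = u - fu;
      return ((((int)fu) & 1) ? 1.0f - u : u) * max;
    }

    int main()
    {
      /* Values inside the range pass through; values outside reflect back in. */
      printf("%g\n", clamp_mirror_repeat(0.30f, 1.0f));  /* 0.30 */
      printf("%g\n", clamp_mirror_repeat(1.25f, 1.0f));  /* 0.75 */
      printf("%g\n", clamp_mirror_repeat(-0.25f, 1.0f)); /* 0.25 */
      return 0;
    }
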
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_camera.h b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
new file mode 100644
index 00000000000..58e7bde4c94
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_camera_sample(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y,
+ const uint rng_hash,
+ Ray *ray)
+{
+ /* Filter sampling. */
+ float filter_u, filter_v;
+
+ if (sample == 0) {
+ filter_u = 0.5f;
+ filter_v = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v);
+ }
+
+ /* Depth of field sampling. */
+ float lens_u = 0.0f, lens_v = 0.0f;
+ if (kernel_data.cam.aperturesize > 0.0f) {
+ path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v);
+ }
+
+ /* Motion blur time sampling. */
+ float time = 0.0f;
+#ifdef __CAMERA_MOTION__
+ if (kernel_data.cam.shuttertime != -1.0f)
+ time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME);
+#endif
+
+ /* Generate camera ray. */
+ camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
+
+/* Return false to indicate that this pixel is finished.
+ * Used by the CPU implementation to avoid sampling a pixel for further samples
+ * once it is known that the pixel has converged. */
+ccl_device bool integrator_init_from_camera(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Count the sample and get an effective sample for this pixel.
+ *
+ * This logic makes it possible both to count the actual number of samples per pixel, and to
+ * add samples to this pixel after it has converged because samples were scheduled for other
+ * pixels (in which case `scheduled_sample` will differ from the actual number of samples in
+ * this pixel). */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Initialize random number seed for path. */
+ const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
+
+ {
+ /* Generate camera ray. */
+ Ray ray;
+ integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
+ if (ray.t == 0.0f) {
+ return true;
+ }
+
+ /* Write camera ray to state. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Continue with intersect_closest kernel, optionally initializing volume
+ * stack before that if the camera may be inside a volume. */
+ if (kernel_data.cam.is_inside_volume) {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+ }
+ else {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
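
Both init kernels use the same convention for pixel-filter offsets: the first sample is pinned to the pixel center and later samples are jittered inside the pixel. A rough sketch of that idea outside the kernel, using std::mt19937 purely as a stand-in for the correlated samples produced by path_rng_2D:

    #include <cstdio>
    #include <random>

    /* First sample lands on the pixel center; later samples jitter inside the pixel.
     * std::mt19937 is only a stand-in for the kernel's sample pattern. */
    static void filter_offset(int sample, std::mt19937 &rng, float *fx, float *fy)
    {
      if (sample == 0) {
        *fx = *fy = 0.5f;
        return;
      }
      std::uniform_real_distribution<float> dist(0.0f, 1.0f);
      *fx = dist(rng);
      *fy = dist(rng);
    }

    int main()
    {
      std::mt19937 rng(42);
      for (int sample = 0; sample < 4; sample++) {
        float fx, fy;
        filter_offset(sample, rng, &fx, &fy);
        printf("sample %d: %.3f %.3f\n", sample, fx, fy);
      }
      return 0;
    }
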
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
new file mode 100644
index 00000000000..34ca6814534
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/bvh/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+template<uint32_t current_kernel>
+ccl_device_forceinline bool integrator_intersect_terminate(INTEGRATOR_STATE_ARGS,
+ const int shader_flags)
+{
+
+ /* Optional AO bounce termination.
+ * We continue evaluating emissive/transparent surfaces and volumes, similar
+ * to direct lighting. Only if we know there are none can we terminate the
+ * path immediately. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ if (shader_flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* We perform path termination in this kernel to avoid launching shade_surface
+ * and evaluating the shader when not needed. Only for emission and transparent
+ * surfaces in front of emission do we need to evaluate the shader, since we
+ * perform MIS as part of indirect rays. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = path_state_continuation_probability(INTEGRATOR_STATE_PASS, path_flag);
+
+ if (probability != 1.0f) {
+ const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
+
+ if (probability == 0.0f || terminate >= probability) {
+ if (shader_flags & SD_HAS_EMISSION) {
+ /* Mark path to be terminated right after shader evaluation on the surface. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ /* TODO: only do this for emissive volumes. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IN_NEXT_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/* Note that current_kernel is a template value since making this a variable
+ * leads to poor performance with CUDA atomics. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_shader_next_kernel(
+ INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect,
+ const int shader,
+ const int shader_flags)
+{
+ /* Note on scheduling.
+ *
+ * When there is no shadow catcher split, the scheduling is simple: schedule surface shading with
+ * or without raytrace support, depending on the shader used.
+ *
+ * When there is a shadow catcher split, the general idea is to have the following configuration:
+ *
+ * - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
+ * will trace shadow catcher object.
+ *
+ * - When no alpha-over of the approximate shadow catcher is needed, schedule surface shading for
+ * the matte ray.
+ *
+ * - Otherwise schedule background shading kernel, so that we have a background to alpha-over
+ * on. The background kernel will then schedule surface shading for the matte ray.
+ *
+ * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
+ * the matte path. */
+
+ const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
+ (kernel_data.film.pass_ao != PASS_UNUSED));
+
+ if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_NEXT_SORTED(
+ current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+
+#ifdef __SHADOW_CATCHER__
+ const int object_flags = intersection_get_object_flags(kg, isect);
+ if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags)) {
+ if (kernel_data.film.use_approximate_shadow_catcher && !kernel_data.background.transparent) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+ }
+#endif
+}
+
+ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+ kernel_assert(ray.t != 0.0f);
+
+ const uint visibility = path_state_ray_visibility(INTEGRATOR_STATE_PASS);
+ const int last_isect_prim = INTEGRATOR_STATE(isect, prim);
+ const int last_isect_object = INTEGRATOR_STATE(isect, object);
+
+ /* Trick to use short AO rays to approximate indirect light at the end of the path. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+
+ const int last_object = last_isect_object != OBJECT_NONE ?
+ last_isect_object :
+ kernel_tex_fetch(__prim_object, last_isect_prim);
+ const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
+ if (object_ao_distance != 0.0f) {
+ ray.t = object_ao_distance;
+ }
+ }
+
+ /* Scene Intersection. */
+ Intersection isect ccl_optional_struct_init;
+ bool hit = scene_intersect(kg, &ray, visibility, &isect);
+
+ /* TODO: remove this and do it in the various intersection functions instead. */
+ if (!hit) {
+ isect.prim = PRIM_NONE;
+ }
+
+ /* Light intersection for MIS. */
+ if (kernel_data.integrator.use_lamp_mis) {
+ /* NOTE: if we make lights visible to camera rays, we'll need to initialize
+ * these in the path_state_init. */
+ const int last_type = INTEGRATOR_STATE(isect, type);
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ hit = lights_intersect(
+ kg, &ray, &isect, last_isect_prim, last_isect_object, last_type, path_flag) ||
+ hit;
+ }
+
+ /* Write intersection result into global integrator state memory. */
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+#ifdef __VOLUME__
+ if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
+ const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
+ const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ /* Continue with volume kernel if we are inside a volume, regardless
+ * if we hit anything. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+ return;
+ }
+#endif
+
+ if (hit) {
+ /* Hit a surface, continue with light or surface kernel. */
+ if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ }
+ }
+ else {
+ /* Nothing hit, continue with background kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
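
The probabilistic termination in integrator_intersect_terminate is a form of Russian roulette: paths continue with some probability and, elsewhere in the integrator, the surviving throughput is divided by that probability so the estimate stays unbiased. A minimal sketch of the plain technique, detached from the integrator state machinery (the probability value below is made up):

    #include <cstdio>
    #include <random>

    /* Plain Russian roulette: terminate with probability (1 - p), otherwise
     * boost the surviving throughput by 1 / p to keep the estimator unbiased. */
    static bool russian_roulette(float probability, float rand01, float *throughput)
    {
      if (probability == 0.0f || rand01 >= probability) {
        return true; /* Terminate the path. */
      }
      *throughput /= probability;
      return false; /* Continue the path. */
    }

    int main()
    {
      std::mt19937 rng(7);
      std::uniform_real_distribution<float> dist(0.0f, 1.0f);
      float throughput = 0.25f;
      const float probability = 0.5f; /* Illustrative; Cycles derives it from the path state. */
      const bool terminate = russian_roulette(probability, dist(rng), &throughput);
      printf("terminate=%d throughput=%g\n", terminate, throughput);
      return 0;
    }
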
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
new file mode 100644
index 00000000000..5bd9cfda4a4
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Visibility for the shadow ray. */
+ccl_device_forceinline uint integrate_intersect_shadow_visibility(INTEGRATOR_STATE_CONST_ARGS)
+{
+ uint visibility = PATH_RAY_SHADOW;
+
+#ifdef __SHADOW_CATCHER__
+ const uint32_t path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+#endif
+
+ return visibility;
+}
+
+ccl_device bool integrate_intersect_shadow_opaque(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ /* Mask which picks only the opaque visibility bits from `visibility`.
+ * Calculate the mask at compile time: the visibility will either be the high bits for the
+ * shadow catcher objects, or the low bits for the regular objects (there is no need to check
+ * the path state here again). */
+ constexpr const uint opaque_mask = SHADOW_CATCHER_VISIBILITY_SHIFT(PATH_RAY_SHADOW_OPAQUE) |
+ PATH_RAY_SHADOW_OPAQUE;
+
+ Intersection isect;
+ const bool opaque_hit = scene_intersect(kg, ray, visibility & opaque_mask, &isect);
+
+ if (!opaque_hit) {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+
+ccl_device_forceinline int integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_CONST_ARGS)
+{
+ const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+ const int transparent_bounce = INTEGRATOR_STATE(shadow_path, transparent_bounce);
+
+ return max(transparent_max_bounce - transparent_bounce - 1, 0);
+}
+
+#ifdef __TRANSPARENT_SHADOWS__
+ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE];
+
+ /* Limit the number of hits to the maximum transparent bounces allowed and to the space we
+ * have available in the integrator state. */
+ const uint max_transparent_hits = integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_PASS);
+ const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE);
+ uint num_hits = 0;
+ bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits);
+
+ /* If the number of hits exceeds the transparent bounce limit, treat the shadow as opaque. */
+ if (num_hits > max_transparent_hits) {
+ opaque_hit = true;
+ }
+
+ if (!opaque_hit) {
+ uint num_recorded_hits = min(num_hits, max_hits);
+
+ if (num_recorded_hits > 0) {
+ sort_intersections(isect, num_recorded_hits);
+
+ /* Write intersection result into global integrator state memory. */
+ for (int hit = 0; hit < num_recorded_hits; hit++) {
+ integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit);
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = num_hits;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+#endif
+
+ccl_device void integrator_intersect_shadow(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Compute visibility. */
+ const uint visibility = integrate_intersect_shadow_visibility(INTEGRATOR_STATE_PASS);
+
+#ifdef __TRANSPARENT_SHADOWS__
+ /* TODO: compile different kernels depending on this? Conditional trace calls
+ * are especially bad for OptiX. */
+ const bool opaque_hit =
+ (kernel_data.integrator.transparent_shadows) ?
+ integrate_intersect_shadow_transparent(INTEGRATOR_STATE_PASS, &ray, visibility) :
+ integrate_intersect_shadow_opaque(INTEGRATOR_STATE_PASS, &ray, visibility);
+#else
+ const bool opaque_hit = integrate_intersect_shadow_opaque(
+ INTEGRATOR_STATE_PASS, &ray, visibility);
+#endif
+
+ if (opaque_hit) {
+ /* Hit an opaque surface, shadow path ends here. */
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return;
+ }
+ else {
+ /* Hit nothing or transparent surfaces, continue to shadow kernel
+ * for shading and render buffer output.
+ *
+ * TODO: could also write to render buffer directly if no transparent shadows?
+ * Could save a kernel execution for the common case. */
+ INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
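
The transparent-shadow path records up to a bounded number of hits and sorts them front to back before shading. A standalone sketch of that record-then-sort step, using std::sort and a hypothetical Hit struct in place of sort_intersections and Intersection:

    #include <algorithm>
    #include <cstdio>

    struct Hit { float t; int prim; };

    int main()
    {
      /* Hits as returned by traversal, not necessarily ordered along the ray. */
      Hit hits[] = {{4.0f, 2}, {1.5f, 0}, {2.75f, 1}};
      const int num_recorded_hits = 3;

      /* Sort front to back so transparency is accumulated in ray order. */
      std::sort(hits, hits + num_recorded_hits,
                [](const Hit &a, const Hit &b) { return a.t < b.t; });

      for (int i = 0; i < num_recorded_hits; i++) {
        printf("hit %d: t=%g prim=%d\n", i, hits[i].t, hits[i].prim);
      }
      return 0;
    }
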
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
index c10ecc426c6..7c090952dc7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
+++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,16 +14,23 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
+#pragma once
-__kernel void kernel_ocl_path_trace_state_buffer_size(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- uint num_threads,
- ccl_global uint64_t *size)
+#include "kernel/integrator/integrator_subsurface.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS)
{
- ((KernelGlobals*)kg)->data = data;
- *size = split_data_buffer_size((KernelGlobals*)kg, num_threads);
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SUBSURFACE);
+
+#ifdef __SUBSURFACE__
+ if (subsurface_scatter(INTEGRATOR_STATE_PASS)) {
+ return;
+ }
+#endif
+
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
new file mode 100644
index 00000000000..60d8a8e3e54
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_ARGS,
+ const float3 from_P,
+ const float3 to_P)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ kernel_assert(kernel_data.integrator.use_volumes);
+
+ Ray volume_ray ccl_optional_struct_init;
+ volume_ray.P = from_P;
+ volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
+ if (num_hits > 0) {
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+ }
+ }
+#else
+ Intersection isect;
+ int step = 0;
+ while (step < 2 * VOLUME_STACK_SIZE &&
+ scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ if (volume_ray.t != FLT_MAX) {
+ volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t);
+ }
+ ++step;
+ }
+#endif
+}
+
+ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ Ray volume_ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &volume_ray);
+ volume_ray.t = FLT_MAX;
+
+ const uint visibility = (INTEGRATOR_STATE(path, flag) & PATH_RAY_ALL_VISIBILITY);
+ int stack_index = 0, enclosed_index = 0;
+
+ /* Write background shader. */
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ stack_index++;
+ }
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
+ if (num_hits > 0) {
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+ /* If the ray exited a volume that it never entered, it means
+ * the camera is inside that volume. */
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+ /* If the ray from the camera enters a volume, that volume should
+ * not be added to the stack when the ray exits it. */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+ }
+ }
+#else
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ int step = 0;
+
+ while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
+ step < 2 * VOLUME_STACK_SIZE) {
+ Intersection isect;
+ if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
+ break;
+ }
+
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+ /* If the ray exited a volume that it never entered, it means
+ * the camera is inside that volume. */
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+ /* If the ray from the camera enters a volume, that volume should
+ * not be added to the stack when the ray exits it. */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ ++step;
+ }
+#endif
+
+ /* Write terminator. */
+ const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+}
+
+CCL_NAMESPACE_END
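
The loop above seeds the volume stack by walking the camera ray: a backfacing hit on a volume the ray never entered implies the camera started inside it. A much simplified sketch of that bookkeeping, with a hypothetical Event record standing in for Intersection/ShaderData:

    #include <cstdio>
    #include <vector>

    /* Hypothetical stand-in for a boundary crossing found along the camera ray. */
    struct Event { int object; bool backfacing; };

    int main()
    {
      /* Ordered crossings along the ray. Object 7 is exited without ever being
       * entered, so the camera must start inside it. */
      const std::vector<Event> events = {{3, false}, {3, true}, {7, true}};

      std::vector<int> entered; /* Volumes entered along the ray. */
      std::vector<int> stack;   /* Volumes the camera starts inside. */

      for (const Event &e : events) {
        if (!e.backfacing) {
          entered.push_back(e.object);
          continue;
        }
        bool was_entered = false;
        for (int obj : entered) {
          if (obj == e.object) was_entered = true;
        }
        if (!was_entered) stack.push_back(e.object);
      }

      for (int obj : stack) printf("camera inside volume object %d\n", obj);
      return 0;
    }
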
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
new file mode 100644
index 00000000000..91363ea1c7f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_megakernel(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Each kernel indicates the next kernel to execute, so here we simply
+ * have to check what that kernel is and execute it.
+ *
+ * TODO: investigate if we can use device-side enqueue for GPUs to avoid
+ * having to compile this big kernel. */
+ while (true) {
+ if (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ /* First handle any shadow paths before we potentially create more shadow paths. */
+ switch (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ integrator_intersect_shadow(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ integrator_shade_shadow(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else if (INTEGRATOR_STATE(path, queued_kernel)) {
+ /* Then handle regular path kernels. */
+ switch (INTEGRATOR_STATE(path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ integrator_intersect_closest(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ integrator_shade_background(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ integrator_shade_surface(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ integrator_shade_volume(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ integrator_shade_surface_raytrace(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ integrator_shade_light(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ integrator_intersect_subsurface(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ integrator_intersect_volume_stack(INTEGRATOR_STATE_PASS);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else {
+ break;
+ }
+ }
+}
+
+CCL_NAMESPACE_END
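
The megakernel is essentially a driver loop over a small state machine: every kernel records which kernel should run next, and the loop dispatches until nothing is queued. Stripped of the rendering specifics, the control flow looks roughly like this (the kernel names and transitions below are invented for illustration):

    #include <cstdio>

    enum Kernel { KERNEL_NONE = 0, KERNEL_INTERSECT, KERNEL_SHADE, KERNEL_BACKGROUND };

    struct State { Kernel queued; int bounce; };

    /* Each handler does its work and queues the next kernel (or none to finish). */
    static void run_kernel(State *state)
    {
      switch (state->queued) {
        case KERNEL_INTERSECT:
          state->queued = (state->bounce < 2) ? KERNEL_SHADE : KERNEL_BACKGROUND;
          break;
        case KERNEL_SHADE:
          state->bounce++;
          state->queued = KERNEL_INTERSECT;
          break;
        case KERNEL_BACKGROUND:
          state->queued = KERNEL_NONE; /* Path finished. */
          break;
        default:
          break;
      }
    }

    int main()
    {
      State state = {KERNEL_INTERSECT, 0};
      while (state.queued != KERNEL_NONE) {
        printf("dispatch kernel %d (bounce %d)\n", state.queued, state.bounce);
        run_kernel(&state);
      }
      return 0;
    }
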
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
new file mode 100644
index 00000000000..3e4cc837e9b
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float3 integrator_eval_background_shader(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+#ifdef __BACKGROUND__
+ const int shader = kernel_data.background.surface_shader;
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ /* Use visibility flag to skip lights. */
+ if (shader & SHADER_EXCLUDE_ANY) {
+ if (((shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((shader & SHADER_EXCLUDE_GLOSSY) && ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
+ ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return zero_float3();
+ }
+
+ /* Use fast constant background color if available. */
+ float3 L = zero_float3();
+ if (!shader_constant_emission_eval(kg, shader, &L)) {
+ /* Evaluate background shader. */
+
+ /* TODO: does aliasing like this break automatic SoA in CUDA?
+ * Should we instead store closures separate from ShaderData? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
+ shader_setup_from_background(kg,
+ emission_sd,
+ INTEGRATOR_STATE(ray, P),
+ INTEGRATOR_STATE(ray, D),
+ INTEGRATOR_STATE(ray, time));
+
+ PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+ PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
+
+ L = shader_background_eval(emission_sd);
+ }
+
+ /* Background MIS weights. */
+# ifdef __BACKGROUND_MIS__
+ /* Check if background light exists or if we should skip pdf. */
+ if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
+ const float3 ray_P = INTEGRATOR_STATE(ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);
+
+ /* multiple importance sampling, get background light pdf for ray
+ * direction, and compute weight with respect to BSDF pdf */
+ const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
+ const float mis_weight = power_heuristic(mis_ray_pdf, pdf);
+
+ L *= mis_weight;
+ }
+# endif
+
+ return L;
+#else
+ return make_float3(0.8f, 0.8f, 0.8f);
+#endif
+}
+
+ccl_device_inline void integrate_background(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Accumulate transparency for transparent background. We can skip background
+ * shader evaluation unless a background pass is used. */
+ bool eval_background = true;
+ float transparent = 0.0f;
+
+ const bool is_transparent_background_ray = kernel_data.background.transparent &&
+ (INTEGRATOR_STATE(path, flag) &
+ PATH_RAY_TRANSPARENT_BACKGROUND);
+
+ if (is_transparent_background_ray) {
+ transparent = average(INTEGRATOR_STATE(path, throughput));
+
+#ifdef __PASSES__
+ eval_background = (kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND));
+#else
+ eval_background = false;
+#endif
+ }
+
+ /* Evaluate background shader. */
+ float3 L = (eval_background) ?
+ integrator_eval_background_shader(INTEGRATOR_STATE_PASS, render_buffer) :
+ zero_float3();
+
+ /* When using the AO bounces approximation, adjust the background
+ * shader intensity with the AO factor. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ L *= kernel_data.integrator.ao_bounces_factor;
+ }
+
+ /* Write to render buffer. */
+ kernel_accum_background(
+ INTEGRATOR_STATE_PASS, L, transparent, is_transparent_background_ray, render_buffer);
+}
+
+ccl_device_inline void integrate_distant_lights(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+ LightSample ls ccl_optional_struct_init;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ if (light_sample_from_distant_ray(kg, ray_D, lamp, &ls)) {
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* multiple importance sampling, get regular light pdf,
+ * and compute weight with respect to BSDF pdf */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+ }
+ }
+}
+
+ccl_device void integrator_shade_background(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);
+
+ /* TODO: unify these in a single loop to only have a single shader evaluation call. */
+ integrate_distant_lights(INTEGRATOR_STATE_PASS, render_buffer);
+ integrate_background(INTEGRATOR_STATE_PASS, render_buffer);
+
+#ifdef __SHADOW_CATCHER__
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+
+ const int isect_prim = INTEGRATOR_STATE(isect, prim);
+ const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim);
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ shader);
+ }
+ return;
+ }
+#endif
+
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+}
+
+CCL_NAMESPACE_END
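
The MIS weights applied to background and distant-light hits use the power heuristic; in Cycles, power_heuristic(a, b) is the usual Veach form a^2 / (a^2 + b^2). A small numeric sketch with made-up pdf values:

    #include <cstdio>

    /* Veach power heuristic with exponent 2. */
    static float power_heuristic(float a, float b)
    {
      return (a * a) / (a * a + b * b);
    }

    int main()
    {
      /* Illustrative pdf values only: the ray was generated by BSDF sampling with
       * pdf 0.8, and light sampling could also have produced it with pdf 0.2, so
       * most of the weight goes to the BSDF sampling strategy. */
      const float bsdf_pdf = 0.8f;
      const float light_pdf = 0.2f;
      const float mis_weight = power_heuristic(bsdf_pdf, light_pdf);
      printf("mis_weight = %.3f\n", mis_weight); /* ~0.941 */
      return 0;
    }
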
diff --git a/intern/cycles/kernel/integrator/integrator_shade_light.h b/intern/cycles/kernel/integrator/integrator_shade_light.h
new file mode 100644
index 00000000000..05b530f9665
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_light.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_light(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Setup light sample. */
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ float3 ray_P = INTEGRATOR_STATE(ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+
+ /* Advance ray beyond light. */
+ /* TODO: can we make this more numerically robust to avoid reintersecting the
+ * same light in some cases? */
+ const float3 new_ray_P = ray_offset(ray_P + ray_D * isect.t, ray_D);
+ INTEGRATOR_STATE_WRITE(ray, P) = new_ray_P;
+ INTEGRATOR_STATE_WRITE(ray, t) -= isect.t;
+
+ /* Set position to where the BSDF was sampled, for correct MIS PDF. */
+ const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);
+ ray_P -= ray_D * mis_ray_t;
+ isect.t += mis_ray_t;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = isect.t;
+
+ LightSample ls ccl_optional_struct_init;
+ const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls);
+
+ if (!use_light_sample) {
+ return;
+ }
+
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* multiple importance sampling, get regular light pdf,
+ * and compute weight with respect to BSDF pdf */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+}
+
+ccl_device void integrator_shade_light(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);
+
+ integrate_light(INTEGRATOR_STATE_PASS, render_buffer);
+
+ /* TODO: we could get stuck in an infinite loop if there are precision issues
+ * and the same light is hit again.
+ *
+ * As a workaround, count this as a transparent bounce. It makes some sense
+ * to interpret lights as transparent surfaces (and support making them opaque),
+ * but this needs to be revisited. */
+ uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;
+
+ if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+
+ /* TODO: in some cases we could continue directly to SHADE_BACKGROUND, but
+ * that optimization is probably not practical if we add lights to
+ * scene geometry. */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
new file mode 100644
index 00000000000..fd3c3ae1653
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_shade_volume.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits)
+{
+ return num_hits >= INTEGRATOR_SHADOW_ISECT_SIZE;
+}
+
+#ifdef __TRANSPARENT_SHADOWS__
+ccl_device_inline float3 integrate_transparent_surface_shadow(INTEGRATOR_STATE_ARGS, const int hit)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);
+
+ /* TODO: does aliasing like this break automatic SoA in CUDA?
+ * Should we instead store closures separate from ShaderData?
+ *
+ * TODO: is it better to declare this outside the loop or keep it local
+ * so the compiler can see there is no dependency between iterations? */
+ ShaderDataTinyStorage shadow_sd_storage;
+ ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);
+
+ /* Setup shader data at surface. */
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_shadow_isect(INTEGRATOR_STATE_PASS, &isect, hit);
+
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ shader_setup_from_ray(kg, shadow_sd, &ray, &isect);
+
+ /* Evaluate shader. */
+ if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+ INTEGRATOR_STATE_PASS, shadow_sd, NULL, PATH_RAY_SHADOW);
+ }
+
+# ifdef __VOLUME__
+ /* Exit/enter volume. */
+ shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, shadow_sd);
+# endif
+
+ /* Compute transparency from closures. */
+ return shader_bsdf_transparency(kg, shadow_sd);
+}
+
+# ifdef __VOLUME__
+ccl_device_inline void integrate_transparent_volume_shadow(INTEGRATOR_STATE_ARGS,
+ const int hit,
+ const int num_recorded_hits,
+ float3 *ccl_restrict throughput)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME);
+
+ /* TODO: deduplicate with surface, or does it not matter for memory usage? */
+ ShaderDataTinyStorage shadow_sd_storage;
+ ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);
+
+ /* Setup shader data. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Modify ray position and length to match current segment. */
+ const float start_t = (hit == 0) ? 0.0f : INTEGRATOR_STATE_ARRAY(shadow_isect, hit - 1, t);
+ const float end_t = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(shadow_isect, hit, t) :
+ ray.t;
+ ray.P += start_t * ray.D;
+ ray.t = end_t - start_t;
+
+ shader_setup_from_volume(kg, shadow_sd, &ray);
+
+ const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ volume_shadow_heterogeneous(INTEGRATOR_STATE_PASS, &ray, shadow_sd, throughput, step_size);
+}
+# endif
+
+ccl_device_inline bool integrate_transparent_shadow(INTEGRATOR_STATE_ARGS, const int num_hits)
+{
+ /* Accumulate shadow for transparent surfaces. */
+ const int num_recorded_hits = min(num_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+
+ for (int hit = 0; hit < num_recorded_hits + 1; hit++) {
+ /* Volume shaders. */
+ if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) {
+# ifdef __VOLUME__
+ if (!integrator_state_shadow_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ float3 throughput = INTEGRATOR_STATE(shadow_path, throughput);
+ integrate_transparent_volume_shadow(
+ INTEGRATOR_STATE_PASS, hit, num_recorded_hits, &throughput);
+ if (is_zero(throughput)) {
+ return true;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+ }
+# endif
+ }
+
+ /* Surface shaders. */
+ if (hit < num_recorded_hits) {
+ const float3 shadow = integrate_transparent_surface_shadow(INTEGRATOR_STATE_PASS, hit);
+ const float3 throughput = INTEGRATOR_STATE(shadow_path, throughput) * shadow;
+ if (is_zero(throughput)) {
+ return true;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) += 1;
+ }
+
+ /* Note that we do not need to check max_transparent_bounce here; the number
+ * of intersections is already limited and made opaque in the
+ * INTERSECT_SHADOW kernel. */
+ }
+
+ if (shadow_intersections_has_remaining(num_hits)) {
+ /* There are more hits than we could record due to memory usage;
+ * adjust the ray to intersect again from the last hit. */
+ const float last_hit_t = INTEGRATOR_STATE_ARRAY(shadow_isect, num_recorded_hits - 1, t);
+ const float3 ray_P = INTEGRATOR_STATE(shadow_ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(shadow_ray, D);
+ INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray_offset(ray_P + last_hit_t * ray_D, ray_D);
+ INTEGRATOR_STATE_WRITE(shadow_ray, t) -= last_hit_t;
+ }
+
+ return false;
+}
+#endif /* __TRANSPARENT_SHADOWS__ */
+
+ccl_device void integrator_shade_shadow(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP);
+ const int num_hits = INTEGRATOR_STATE(shadow_path, num_hits);
+
+#ifdef __TRANSPARENT_SHADOWS__
+ /* Evaluate transparent shadows. */
+ const bool opaque = integrate_transparent_shadow(INTEGRATOR_STATE_PASS, num_hits);
+ if (opaque) {
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+#endif
+
+ if (shadow_intersections_has_remaining(num_hits)) {
+ /* More intersections to find, continue shadow ray. */
+ INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return;
+ }
+ else {
+ kernel_accum_light(INTEGRATOR_STATE_PASS, render_buffer);
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
new file mode 100644
index 00000000000..73b7cad32be
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_subsurface.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_forceinline void integrate_surface_shader_setup(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd)
+{
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ shader_setup_from_ray(kg, sd, &ray, &isect);
+}
+
+#ifdef __HOLDOUT__
+ccl_device_forceinline bool integrate_surface_holdout(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Write holdout transparency to render buffer and stop if fully holdout. */
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
+ (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
+ const float3 holdout_weight = shader_holdout_apply(kg, sd);
+ if (kernel_data.background.transparent) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float transparent = average(holdout_weight * throughput);
+ kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
+ }
+ if (isequal_float3(holdout_weight, one_float3())) {
+ return false;
+ }
+ }
+
+ return true;
+}
+#endif /* __HOLDOUT__ */
+
+#ifdef __EMISSION__
+ccl_device_forceinline void integrate_surface_emission(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *sd,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ /* Evaluate emissive closure. */
+ float3 L = shader_emissive_eval(sd);
+
+# ifdef __HAIR__
+ if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
+ (sd->type & PRIMITIVE_ALL_TRIANGLE))
+# else
+ if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+# endif
+ {
+ const float bsdf_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float t = sd->ray_length + INTEGRATOR_STATE(path, mis_ray_t);
+
+ /* Multiple importance sampling, get triangle light pdf,
+ * and compute weight with respect to BSDF pdf. */
+ float pdf = triangle_light_pdf(kg, sd, t);
+ float mis_weight = power_heuristic(bsdf_pdf, pdf);
+
+ L *= mis_weight;
+ }
+
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, L, render_buffer);
+}
+#endif /* __EMISSION__ */
+
+#ifdef __EMISSION__
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state)
+{
+ /* Test if there is a light or BSDF that needs direct light. */
+ if (!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) {
+ return;
+ }
+
+ /* Sample position on a light. */
+ LightSample ls ccl_optional_struct_init;
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ if (!light_distribution_sample_from_position(
+ kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) {
+ return;
+ }
+ }
+
+ kernel_assert(ls.pdf != 0.0f);
+
+ /* Evaluate light shader.
+ *
+ * TODO: can we reuse sd memory? In theory we can move this after
+ * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
+ * the light shader. This could also move to its own kernel, for
+ * non-constant light sources. */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ const float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, &ls, sd->time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* Evaluate BSDF. */
+ const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D);
+
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader);
+ bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);
+
+ if (ls.shader & SHADER_USE_MIS) {
+ const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+ bsdf_eval_mul(&bsdf_eval, mis_weight);
+ }
+
+ /* Path termination. */
+ const float terminate = path_state_rng_light_termination(kg, rng_state);
+ if (light_sample_terminate(kg, &ls, &bsdf_eval, terminate)) {
+ return;
+ }
+
+ /* Create shadow ray. */
+ Ray ray ccl_optional_struct_init;
+ light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray);
+ const bool is_light = light_sample_is_light(&ls);
+
+ /* Copy volume stack and enter/exit volume. */
+ integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
+
+ if (is_transmission) {
+# ifdef __VOLUME__
+ shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, sd);
+# endif
+ }
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
+ shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
+ shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
+ const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
+ INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
+ }
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+}
+#endif
+
+/* Path tracing: bounce off or through surface with new direction. */
+ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state)
+{
+ /* Sample BSDF or BSSRDF. */
+ if (!(sd->flag & (SD_BSDF | SD_BSSRDF))) {
+ return LABEL_NONE;
+ }
+
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u);
+
+#ifdef __SUBSURFACE__
+ /* BSSRDF closure, we schedule subsurface intersection kernel. */
+ if (CLOSURE_IS_BSSRDF(sc->type)) {
+ return subsurface_bounce(INTEGRATOR_STATE_PASS, sd, sc);
+ }
+#endif
+
+ /* BSDF closure, sample direction. */
+ float bsdf_pdf;
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ float3 bsdf_omega_in ccl_optional_struct_init;
+ differential3 bsdf_domega_in ccl_optional_struct_init;
+ int label;
+
+ label = shader_bsdf_sample_closure(
+ kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+
+ if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
+ return LABEL_NONE;
+ }
+
+ /* Setup ray. Note that clipping works through transparent bounces. */
+ INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
+ INTEGRATOR_STATE_WRITE(ray, D) = normalize(bsdf_omega_in);
+ INTEGRATOR_STATE_WRITE(ray, t) = (label & LABEL_TRANSPARENT) ?
+ INTEGRATOR_STATE(ray, t) - sd->ray_length :
+ FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(bsdf_domega_in);
+#endif
+
+ /* Update throughput. */
+ float3 throughput = INTEGRATOR_STATE(path, throughput);
+ throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path,
+ diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
+ }
+ }
+
+ /* Update path state */
+ if (label & LABEL_TRANSPARENT) {
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = bsdf_pdf;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(bsdf_pdf,
+ INTEGRATOR_STATE(path, min_ray_pdf));
+ }
+
+ path_state_next(INTEGRATOR_STATE_PASS, label);
+ return label;
+}
+
+#ifdef __VOLUME__
+ccl_device_forceinline int integrate_surface_volume_only_bounce(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd)
+{
+ if (!path_state_volume_next(INTEGRATOR_STATE_PASS)) {
+ return LABEL_NONE;
+ }
+
+ /* Setup ray position, direction stays unchanged. */
+ INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, -sd->Ng);
+
+ /* Clipping works through transparent. */
+ INTEGRATOR_STATE_WRITE(ray, t) -= sd->ray_length;
+
+# ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+# endif
+
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
+
+ return LABEL_TRANSMIT | LABEL_TRANSPARENT;
+}
+#endif
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+ccl_device_forceinline void integrate_surface_ao_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ ccl_global float *ccl_restrict render_buffer)
+{
+# ifdef __KERNEL_OPTIX__
+ optixDirectCall<void>(2, INTEGRATOR_STATE_PASS, sd, rng_state, render_buffer);
+}
+
+extern "C" __device__ void __direct_callable__ao_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ ccl_global float *ccl_restrict render_buffer)
+{
+# endif /* __KERNEL_OPTIX__ */
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+ const float3 ao_N = shader_bsdf_ao_normal(kg, sd);
+ float3 ao_D;
+ float ao_pdf;
+ sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+ if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+ Ray ray ccl_optional_struct_init;
+ ray.P = ray_offset(sd->P, sd->Ng);
+ ray.D = ao_D;
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+ ray.time = sd->time;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+
+ Intersection isect ccl_optional_struct_init;
+ if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
+ }
+ }
+}
+#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
+
+template<uint node_feature_mask>
+ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+
+{
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_SURFACE_SETUP);
+
+ /* Setup shader data. */
+ ShaderData sd;
+ integrate_surface_shader_setup(INTEGRATOR_STATE_PASS, &sd);
+ PROFILING_SHADER(sd.object, sd.shader);
+
+ int continue_path_label = 0;
+
+ /* Skip most work for volume bounding surface. */
+#ifdef __VOLUME__
+ if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+#endif
+
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+#ifdef __SUBSURFACE__
+ /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
+ if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
+#endif
+ {
+ /* Evaluate shader. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
+ shader_eval_surface<node_feature_mask>(
+ INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
+ }
+ }
+
+#ifdef __SUBSURFACE__
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+ /* When coming from inside subsurface scattering, setup a diffuse
+ * closure to perform lighting at the exit point. */
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SUBSURFACE;
+ subsurface_shader_data_setup(INTEGRATOR_STATE_PASS, &sd);
+ }
+#endif
+
+ shader_prepare_surface_closures(INTEGRATOR_STATE_PASS, &sd);
+
+#ifdef __HOLDOUT__
+ /* Evaluate holdout. */
+ if (!integrate_surface_holdout(INTEGRATOR_STATE_PASS, &sd, render_buffer)) {
+ return false;
+ }
+#endif
+
+#ifdef __EMISSION__
+ /* Write emission. */
+ if (sd.flag & SD_EMISSION) {
+ integrate_surface_emission(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+ }
+#endif
+
+#ifdef __PASSES__
+ /* Write render passes. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES);
+ kernel_write_data_passes(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* Perform path termination. Most paths have already been terminated in
+ * the intersect_closest kernel; this is just for emission and for dividing
+ * throughput by the probability at the right moment. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
+ 0.0f :
+ path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+ path_flag);
+ if (probability == 0.0f) {
+ return false;
+ }
+ else if (probability != 1.0f) {
+ INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+ }
+
+#ifdef __DENOISING_FEATURES__
+ kernel_write_denoising_features_surface(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+#ifdef __SHADOW_CATCHER__
+ kernel_write_shadow_catcher_bounce_data(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+ /* Direct light. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
+ integrate_surface_direct_light(INTEGRATOR_STATE_PASS, &sd, &rng_state);
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+ /* Ambient occlusion pass. */
+ if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
+ if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+ (INTEGRATOR_STATE(path, flag) & PATH_RAY_CAMERA)) {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+ integrate_surface_ao_pass(INTEGRATOR_STATE_PASS, &sd, &rng_state, render_buffer);
+ }
+ }
+#endif
+
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+ continue_path_label = integrate_surface_bsdf_bssrdf_bounce(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state);
+#ifdef __VOLUME__
+ }
+ else {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+ continue_path_label = integrate_surface_volume_only_bounce(INTEGRATOR_STATE_PASS, &sd);
+ }
+
+ if (continue_path_label & LABEL_TRANSMIT) {
+ /* Enter/Exit volume. */
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, &sd);
+ }
+#endif
+
+ return continue_path_label != 0;
+}
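[Editor's note] On the path-termination step inside integrate_surface above: dividing the surviving throughput by the continuation probability is exactly what keeps Russian roulette unbiased. A one-line restatement (not taken from the commit), with T the throughput before the test and p the probability returned by path_state_continuation_probability:

\[
\mathbb{E}[T'] \;=\; p \cdot \frac{T}{p} \;+\; (1 - p)\cdot 0 \;=\; T .
\]

Terminating with probability 1 - p and rescaling survivors therefore leaves the estimator's expectation unchanged.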
+
+template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE,
+ int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
+ccl_device_forceinline void integrator_shade_surface(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ if (integrate_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, render_buffer)) {
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+ INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+ }
+ else {
+ kernel_assert(INTEGRATOR_STATE(ray, t) != 0.0f);
+ INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(current_kernel);
+ }
+}
+
+ccl_device_forceinline void integrator_shade_surface_raytrace(
+ INTEGRATOR_STATE_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+ integrator_shade_surface<KERNEL_FEATURE_NODE_MASK_SURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE>(INTEGRATOR_STATE_PASS,
+ render_buffer);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
new file mode 100644
index 00000000000..4a864b1e6ce
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -0,0 +1,1015 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME__
+
+/* Events for probabilistic scattering */
+
+typedef enum VolumeIntegrateEvent {
+ VOLUME_PATH_SCATTERED = 0,
+ VOLUME_PATH_ATTENUATED = 1,
+ VOLUME_PATH_MISSED = 2
+} VolumeIntegrateEvent;
+
+typedef struct VolumeIntegrateResult {
+ /* Throughput and offset for direct light scattering. */
+ bool direct_scatter;
+ float3 direct_throughput;
+ float direct_t;
+ ShaderVolumePhases direct_phases;
+
+ /* Throughput and offset for indirect light scattering. */
+ bool indirect_scatter;
+ float3 indirect_throughput;
+ float indirect_t;
+ ShaderVolumePhases indirect_phases;
+} VolumeIntegrateResult;
+
+/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
+ * and precision issues.
+ * todo: this value could be tweaked or turned into a probability to avoid unnecessary
+ * work in volumes and subsurface scattering. */
+# define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+/* Volume shader properties
+ *
+ * extinction coefficient = absorption coefficient + scattering coefficient
+ * sigma_t = sigma_a + sigma_s */
+
+typedef struct VolumeShaderCoefficients {
+ float3 sigma_t;
+ float3 sigma_s;
+ float3 emission;
+} VolumeShaderCoefficients;
+
+/* Evaluate shader to get extinction coefficient at P. */
+ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict extinction)
+{
+ shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ if (!(sd->flag & SD_EXTINCTION)) {
+ return false;
+ }
+
+ const float density = object_volume_density(kg, sd->object);
+ *extinction = sd->closure_transparent_extinction * density;
+ return true;
+}
+
+/* Evaluate shader to get absorption, scattering and emission at P. */
+ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict sd,
+ VolumeShaderCoefficients *coeff)
+{
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
+ return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
+ return false;
+ }
+
+ coeff->sigma_s = zero_float3();
+ coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
+ coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
+
+ if (sd->flag & SD_SCATTER) {
+ for (int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if (CLOSURE_IS_VOLUME(sc->type)) {
+ coeff->sigma_s += sc->weight;
+ }
+ }
+ }
+
+ const float density = object_volume_density(kg, sd->object);
+ coeff->sigma_s *= density;
+ coeff->sigma_t *= density;
+ coeff->emission *= density;
+
+ return true;
+}
+
+ccl_device_forceinline void volume_step_init(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ const float object_step_size,
+ float t,
+ float *step_size,
+ float *step_shade_offset,
+ float *steps_offset,
+ int *max_steps)
+{
+ if (object_step_size == FLT_MAX) {
+ /* Homogeneous volume. */
+ *step_size = t;
+ *step_shade_offset = 0.0f;
+ *steps_offset = 1.0f;
+ *max_steps = 1;
+ }
+ else {
+ /* Heterogeneous volume. */
+ *max_steps = kernel_data.integrator.volume_max_steps;
+ float step = min(object_step_size, t);
+
+ /* If the segment does not fit in max_steps steps, enlarge the step size. */
+ if (t > *max_steps * step) {
+ step = t / (float)*max_steps;
+ }
+
+ *step_size = step;
+
+ /* Perform shading at this offset within a step, to integrate
+ * over the entire step segment. */
+ *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4);
+
+ /* Shift the starting point of all segments by this random amount to avoid
+ * banding artifacts from the volume bounding shape. */
+ *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3);
+ }
+}
+
+/* Volume Shadows
+ *
+ * These functions are used to attenuate shadow rays to lights. Both absorption
+ * and scattering will block light, represented by the extinction coefficient. */
+
+# if 0
+/* Homogeneous volume: assume shader evaluation at the start gives
+ * the extinction coefficient for the entire line segment. */
+ccl_device void volume_shadow_homogeneous(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict throughput)
+{
+ float3 sigma_t = zero_float3();
+
+ if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+ *throughput *= volume_color_transmittance(sigma_t, ray->t);
+ }
+}
+# endif
+
+/* Heterogeneous volume: integrate stepping through the volume until we
+ * reach the end, get absorbed entirely, or run out of iterations. */
+ccl_device void volume_shadow_heterogeneous(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict throughput,
+ const float object_step_size)
+{
+ /* Load random number state. */
+ RNGState rng_state;
+ shadow_path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ float3 tp = *throughput;
+
+ /* Prepare for stepping.
+ * For shadows we do not offset all segments, since the starting point is
+ * already a random distance inside the volume. It also appears to create
+ * banding artifacts for unknown reasons. */
+ int max_steps;
+ float step_size, step_shade_offset, unused;
+ volume_step_init(kg,
+ &rng_state,
+ object_step_size,
+ ray->t,
+ &step_size,
+ &step_shade_offset,
+ &unused,
+ &max_steps);
+ const float steps_offset = 1.0f;
+
+ /* compute extinction at the start */
+ float t = 0.0f;
+
+ float3 sum = zero_float3();
+
+ for (int i = 0; i < max_steps; i++) {
+ /* advance to new position */
+ float new_t = min(ray->t, (i + steps_offset) * step_size);
+ float dt = new_t - t;
+
+ float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
+ float3 sigma_t = zero_float3();
+
+ /* compute attenuation over segment */
+ sd->P = new_P;
+ if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+ /* Compute expf() only for every Nth step, to save some calculations
+ * because exp(a)*exp(b) = exp(a+b); also do a quick VOLUME_THROUGHPUT_EPSILON
+ * check then. */
+ sum += (-sigma_t * dt);
+ if ((i & 0x07) == 0) { /* TODO: other interval? */
+ tp = *throughput * exp3(sum);
+
+ /* stop if nearly all light is blocked */
+ if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
+ tp.z < VOLUME_THROUGHPUT_EPSILON)
+ break;
+ }
+ }
+
+ /* stop if at the end of the volume */
+ t = new_t;
+ if (t == ray->t) {
+ /* Update throughput in case we haven't done it above */
+ tp = *throughput * exp3(sum);
+ break;
+ }
+ }
+
+ *throughput = tp;
+}
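[Editor's note] The deferred exp3() in the loop above is valid because transmittance factors multiply, i.e. the optical depths add; evaluating the exponential only every eighth step merely delays the early-out test, it does not change the accumulated result:

\[
T \;=\; \prod_i e^{-\sigma_{t,i}\,\Delta t_i} \;=\; \exp\!\Big(-\sum_i \sigma_{t,i}\,\Delta t_i\Big).
\]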
+
+/* Equi-angular sampling as in:
+ * "Importance Sampling Techniques for Path Tracing in Participating Media" */
+
+ccl_device float volume_equiangular_sample(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float xi,
+ float *pdf)
+{
+ const float t = ray->t;
+ const float delta = dot((light_P - ray->P), ray->D);
+ const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ *pdf = 0.0f;
+ return 0.0f;
+ }
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
+ if (UNLIKELY(theta_b == theta_a)) {
+ *pdf = 0.0f;
+ return 0.0f;
+ }
+ *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+ return min(t, delta + t_); /* min is only for float precision errors */
+}
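[Editor's note] As a reading aid for volume_equiangular_sample above: with D the distance from the light to the ray and t' = sample_t - delta the signed offset from the closest point on the ray, sampling the subtended angle uniformly and mapping it through the tangent gives the pdf used in the code. This is the standard equi-angular construction, restated here rather than taken from the commit:

\[
t' = D\tan\theta,\qquad \theta = (1-\xi)\,\theta_a + \xi\,\theta_b
\;\;\Longrightarrow\;\;
p(t') = \frac{D}{(\theta_b - \theta_a)\,\big(D^2 + t'^2\big)} .
\]

The returned distance is delta + t', clamped to the ray length to absorb float precision errors.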
+
+ccl_device float volume_equiangular_pdf(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float sample_t)
+{
+ const float delta = dot((light_P - ray->P), ray->D);
+ const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ return 0.0f;
+ }
+
+ const float t = ray->t;
+ const float t_ = sample_t - delta;
+
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ if (UNLIKELY(theta_b == theta_a)) {
+ return 0.0f;
+ }
+
+ const float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+ return pdf;
+}
+
+ccl_device float volume_equiangular_cdf(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float sample_t)
+{
+ float delta = dot((light_P - ray->P), ray->D);
+ float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ return 0.0f;
+ }
+
+ const float t = ray->t;
+ const float t_ = sample_t - delta;
+
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ if (UNLIKELY(theta_b == theta_a)) {
+ return 0.0f;
+ }
+
+ const float theta_sample = atan2f(t_, D);
+ const float cdf = (theta_sample - theta_a) / (theta_b - theta_a);
+
+ return cdf;
+}
+
+/* Distance sampling */
+
+ccl_device float volume_distance_sample(
+ float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
+{
+ /* xi is in [0, 1[ so log(0) should never happen; division by zero is
+ * avoided because sample_sigma_t > 0 when SD_SCATTER is set. */
+ float sample_sigma_t = volume_channel_get(sigma_t, channel);
+ float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+ float sample_transmittance = volume_channel_get(full_transmittance, channel);
+
+ float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
+
+ *transmittance = volume_color_transmittance(sigma_t, sample_t);
+ *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
+
+ /* TODO: optimization: when taken together with the hit/miss decision,
+ * the full_transmittance cancels out and xi does not
+ * need to be remapped. */
+
+ return sample_t;
+}
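[Editor's note] For context, volume_distance_sample above can be read as inverting the CDF of the transmittance restricted to [0, max_t] for the chosen color channel. Under that reading (a restatement, not part of the commit), with sigma the per-channel extinction:

\[
F(t) = \frac{1 - e^{-\sigma t}}{1 - e^{-\sigma t_{\max}}},\qquad
t = -\frac{\ln\!\big(1 - \xi\,(1 - e^{-\sigma t_{\max}})\big)}{\sigma},\qquad
p(t) = \frac{\sigma\, e^{-\sigma t}}{1 - e^{-\sigma t_{\max}}},
\]

which is the per-channel value computed by the safe_divide_color expression.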
+
+ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
+{
+ float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+ float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
+
+ return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
+}
+
+/* Emission */
+
+ccl_device float3 volume_emission_integrate(VolumeShaderCoefficients *coeff,
+ int closure_flag,
+ float3 transmittance,
+ float t)
+{
+ /* Integral of E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t)) / sigma_t;
+ * this goes to E * t as sigma_t goes to zero.
+ *
+ * TODO: we should use an epsilon to avoid precision issues near zero sigma_t. */
+ float3 emission = coeff->emission;
+
+ if (closure_flag & SD_EXTINCTION) {
+ float3 sigma_t = coeff->sigma_t;
+
+ emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
+ emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
+ emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
+ }
+ else
+ emission *= t;
+
+ return emission;
+}
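[Editor's note] The closed form used in volume_emission_integrate above, written out; the limit explains the fallback to E * t when the extinction is zero:

\[
\int_0^{t} E\, e^{-\sigma_t s}\, \mathrm{d}s \;=\; E\,\frac{1 - e^{-\sigma_t t}}{\sigma_t}
\;\xrightarrow{\;\sigma_t \to 0\;}\; E\,t .
\]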
+
+/* Volume Integration */
+
+typedef struct VolumeIntegrateState {
+ /* Volume segment extents. */
+ float start_t;
+ float end_t;
+
+ /* True if the volume is absorption-only up to this point, and no probabilistic
+ * scattering or termination has been used yet. */
+ bool absorption_only;
+
+ /* Random numbers for scattering. */
+ float rscatter;
+ float rphase;
+
+ /* Multiple importance sampling. */
+ VolumeSampleMethod direct_sample_method;
+ bool use_mis;
+ float distance_pdf;
+ float equiangular_pdf;
+} VolumeIntegrateState;
+
+ccl_device_forceinline void volume_integrate_step_scattering(
+ const ShaderData *sd,
+ const Ray *ray,
+ const float3 equiangular_light_P,
+ const VolumeShaderCoefficients &ccl_restrict coeff,
+ const float3 transmittance,
+ VolumeIntegrateState &ccl_restrict vstate,
+ VolumeIntegrateResult &ccl_restrict result)
+{
+ /* Pick a random color channel; we use the Veach one-sample
+ * model with the balance heuristic for the channels. */
+ const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+ float3 channel_pdf;
+ const int channel = volume_sample_channel(
+ albedo, result.indirect_throughput, vstate.rphase, &channel_pdf);
+
+ /* Equiangular sampling for direct lighting. */
+ if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
+ if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) {
+ const float new_dt = result.direct_t - vstate.start_t;
+ const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+
+ result.direct_scatter = true;
+ result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf;
+ shader_copy_volume_phases(&result.direct_phases, sd);
+
+ /* Multiple importance sampling. */
+ if (vstate.use_mis) {
+ const float distance_pdf = vstate.distance_pdf *
+ dot(channel_pdf, coeff.sigma_t * new_transmittance);
+ const float mis_weight = 2.0f * power_heuristic(vstate.equiangular_pdf, distance_pdf);
+ result.direct_throughput *= mis_weight;
+ }
+ }
+ else {
+ result.direct_throughput *= transmittance;
+ vstate.distance_pdf *= dot(channel_pdf, transmittance);
+ }
+ }
+
+ /* Distance sampling for indirect and optional direct lighting. */
+ if (!result.indirect_scatter) {
+ /* decide if we will scatter or continue */
+ const float sample_transmittance = volume_channel_get(transmittance, channel);
+
+ if (1.0f - vstate.rscatter >= sample_transmittance) {
+ /* compute sampling distance */
+ const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel);
+ const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t;
+ const float new_t = vstate.start_t + new_dt;
+
+ /* transmittance and pdf */
+ const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+ const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);
+
+ /* throughput */
+ result.indirect_scatter = true;
+ result.indirect_t = new_t;
+ result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
+ shader_copy_volume_phases(&result.indirect_phases, sd);
+
+ if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+ /* If using distance sampling for direct light, just copy parameters
+ * of indirect light since we scatter at the same point then. */
+ result.direct_scatter = true;
+ result.direct_t = result.indirect_t;
+ result.direct_throughput = result.indirect_throughput;
+ shader_copy_volume_phases(&result.direct_phases, sd);
+
+ /* Multiple importance sampling. */
+ if (vstate.use_mis) {
+ const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
+ const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
+ equiangular_pdf);
+ result.direct_throughput *= 2.0f * mis_weight;
+ }
+ }
+ }
+ else {
+ /* throughput */
+ const float pdf = dot(channel_pdf, transmittance);
+ result.indirect_throughput *= transmittance / pdf;
+ if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+ vstate.distance_pdf *= pdf;
+ }
+
+ /* Remap rscatter so we can reuse it and keep things stratified. */
+ vstate.rscatter = 1.0f - (1.0f - vstate.rscatter) / sample_transmittance;
+ }
+ }
+}
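[Editor's note] On the 2.0f factors in the MIS weights above: when vstate.use_mis is set, volume_integrate_heterogeneous (below) selects distance or equi-angular sampling with probability 1/2 each by splitting rscatter, so this is one-sample MIS and the estimator divides by that selection probability. Restated (not from the commit), for a sample drawn from strategy i in {distance, equiangular} with power-heuristic weight w_i:

\[
\langle I \rangle \;=\; \frac{w_i\, f(x)}{\tfrac{1}{2}\, p_i(x)} \;=\; \frac{2\, w_i\, f(x)}{p_i(x)},
\qquad w_i = \frac{p_i^2}{p_d^2 + p_e^2} .
\]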
+
+/* Heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. This probabilistically scatters or transmits through the volume,
+ * for path tracing where we don't want to branch. */
+ccl_device_forceinline void volume_integrate_heterogeneous(
+ INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ const RNGState *rng_state,
+ ccl_global float *ccl_restrict render_buffer,
+ const float object_step_size,
+ const VolumeSampleMethod direct_sample_method,
+ const float3 equiangular_light_P,
+ VolumeIntegrateResult &result)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INTEGRATE);
+
+ /* Prepare for stepping.
+ * Using a different step offset for the first step avoids banding artifacts. */
+ int max_steps;
+ float step_size, step_shade_offset, steps_offset;
+ volume_step_init(kg,
+ rng_state,
+ object_step_size,
+ ray->t,
+ &step_size,
+ &step_shade_offset,
+ &steps_offset,
+ &max_steps);
+
+ /* Initialize volume integration state. */
+ VolumeIntegrateState vstate ccl_optional_struct_init;
+ vstate.start_t = 0.0f;
+ vstate.end_t = 0.0f;
+ vstate.absorption_only = true;
+ vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE);
+ vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL);
+
+ /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */
+ vstate.direct_sample_method = direct_sample_method;
+ vstate.use_mis = (direct_sample_method == VOLUME_SAMPLE_MIS);
+ if (vstate.use_mis) {
+ if (vstate.rscatter < 0.5f) {
+ vstate.rscatter *= 2.0f;
+ vstate.direct_sample_method = VOLUME_SAMPLE_DISTANCE;
+ }
+ else {
+ vstate.rscatter = (vstate.rscatter - 0.5f) * 2.0f;
+ vstate.direct_sample_method = VOLUME_SAMPLE_EQUIANGULAR;
+ }
+ }
+ vstate.equiangular_pdf = 0.0f;
+ vstate.distance_pdf = 1.0f;
+
+ /* Initialize volume integration result. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ result.direct_throughput = throughput;
+ result.indirect_throughput = throughput;
+
+ /* Equiangular sampling: compute distance and PDF in advance. */
+ if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) {
+ result.direct_t = volume_equiangular_sample(
+ ray, equiangular_light_P, vstate.rscatter, &vstate.equiangular_pdf);
+ }
+
+# ifdef __DENOISING_FEATURES__
+ const bool write_denoising_features = (INTEGRATOR_STATE(path, flag) &
+ PATH_RAY_DENOISING_FEATURES);
+ float3 accum_albedo = zero_float3();
+# endif
+ float3 accum_emission = zero_float3();
+
+ for (int i = 0; i < max_steps; i++) {
+ /* Advance to new position */
+ vstate.end_t = min(ray->t, (i + steps_offset) * step_size);
+ const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset;
+ sd->P = ray->P + ray->D * shade_t;
+
+ /* compute segment */
+ VolumeShaderCoefficients coeff ccl_optional_struct_init;
+ if (volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &coeff)) {
+ const int closure_flag = sd->flag;
+
+ /* Evaluate transmittance over segment. */
+ const float dt = (vstate.end_t - vstate.start_t);
+ const float3 transmittance = (closure_flag & SD_EXTINCTION) ?
+ volume_color_transmittance(coeff.sigma_t, dt) :
+ one_float3();
+
+ /* Emission. */
+ if (closure_flag & SD_EMISSION) {
+ /* Only write emission before indirect light scatter position, since we terminate
+ * stepping at that point if we have already found a direct light scatter position. */
+ if (!result.indirect_scatter) {
+ const float3 emission = volume_emission_integrate(
+ &coeff, closure_flag, transmittance, dt);
+ accum_emission += emission;
+ }
+ }
+
+ if (closure_flag & SD_EXTINCTION) {
+ if ((closure_flag & SD_SCATTER) || !vstate.absorption_only) {
+# ifdef __DENOISING_FEATURES__
+ /* Accumulate albedo for denoising features. */
+ if (write_denoising_features && (closure_flag & SD_SCATTER)) {
+ const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+ accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance);
+ }
+# endif
+
+ /* Scattering and absorption. */
+ volume_integrate_step_scattering(
+ sd, ray, equiangular_light_P, coeff, transmittance, vstate, result);
+ }
+ else {
+ /* Absorption only. */
+ result.indirect_throughput *= transmittance;
+ result.direct_throughput *= transmittance;
+ }
+
+ /* Stop if nearly all light blocked. */
+ if (!result.indirect_scatter) {
+ if (max3(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+ result.indirect_throughput = zero_float3();
+ break;
+ }
+ }
+ else if (!result.direct_scatter) {
+ if (max3(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+ break;
+ }
+ }
+ }
+
+ /* If we have scattering data for both direct and indirect, we're done. */
+ if (result.direct_scatter && result.indirect_scatter) {
+ break;
+ }
+ }
+
+ /* Stop if at the end of the volume. */
+ vstate.start_t = vstate.end_t;
+ if (vstate.start_t == ray->t) {
+ break;
+ }
+ }
+
+ /* Write accumulated emission. */
+ if (!is_zero(accum_emission)) {
+ kernel_accum_emission(
+ INTEGRATOR_STATE_PASS, result.indirect_throughput, accum_emission, render_buffer);
+ }
+
+# ifdef __DENOISING_FEATURES__
+ /* Write denoising features. */
+ if (write_denoising_features) {
+ kernel_write_denoising_features_volume(
+ INTEGRATOR_STATE_PASS, accum_albedo, result.indirect_scatter, render_buffer);
+ }
+# endif /* __DENOISING_FEATURES__ */
+}
+
+# ifdef __EMISSION__
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+ccl_device_forceinline bool integrate_volume_sample_light(INTEGRATOR_STATE_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ LightSample *ccl_restrict ls)
+{
+ /* Test if there is a light or BSDF that needs direct light. */
+ if (!kernel_data.integrator.use_direct_light) {
+ return false;
+ }
+
+ /* Sample position on a light. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ light_distribution_sample_from_volume_segment(
+ kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls);
+
+ if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+ return false;
+ }
+
+ return true;
+}
+
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ const float3 P,
+ const ShaderVolumePhases *ccl_restrict
+ phases,
+ const float3 throughput,
+ LightSample *ccl_restrict ls)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT);
+
+ if (!kernel_data.integrator.use_direct_light) {
+ return;
+ }
+
+ /* Sample position on the same light again, now from the shading
+ * point where we scattered.
+ *
+ * TODO: decorrelate random numbers and use light_sample_new_position to
+ * avoid resampling the CDF. */
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ if (!light_distribution_sample_from_position(
+ kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) {
+ return;
+ }
+ }
+
+ /* Evaluate light shader.
+ *
+ * TODO: can we reuse sd memory? In theory we can move this after
+ * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
+ * the light shader. This could also move to its own kernel, for
+ * non-constant light sources. */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ const float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, ls, sd->time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* Evaluate phase function. */
+ BsdfEval phase_eval ccl_optional_struct_init;
+ const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
+
+ if (ls->shader & SHADER_USE_MIS) {
+ float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+ bsdf_eval_mul(&phase_eval, mis_weight);
+ }
+
+ bsdf_eval_mul3(&phase_eval, light_eval / ls->pdf);
+
+ /* Path termination. */
+ const float terminate = path_state_rng_light_termination(kg, rng_state);
+ if (light_sample_terminate(kg, ls, &phase_eval, terminate)) {
+ return;
+ }
+
+ /* Create shadow ray. */
+ Ray ray ccl_optional_struct_init;
+ light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray);
+ const bool is_light = light_sample_is_light(ls);
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
+ shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
+ shadow_flag |= PATH_RAY_VOLUME_PASS;
+ const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ one_float3() :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput_phase;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
+ INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
+ }
+
+ integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+}
+# endif
+
+/* Path tracing: scatter in new direction using phase function */
+ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state,
+ const ShaderVolumePhases *phases)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT);
+
+ float phase_u, phase_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v);
+
+ /* Phase closure, sample direction. */
+ float phase_pdf;
+ BsdfEval phase_eval ccl_optional_struct_init;
+ float3 phase_omega_in ccl_optional_struct_init;
+ differential3 phase_domega_in ccl_optional_struct_init;
+
+ const int label = shader_volume_phase_sample(kg,
+ sd,
+ phases,
+ phase_u,
+ phase_v,
+ &phase_eval,
+ &phase_omega_in,
+ &phase_domega_in,
+ &phase_pdf);
+
+ if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
+ return false;
+ }
+
+ /* Setup ray. */
+ INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = normalize(phase_omega_in);
+ INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+
+# ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(phase_domega_in);
+# endif
+
+ /* Update throughput. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf;
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput_phase;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
+
+ /* Update path state */
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(phase_pdf,
+ INTEGRATOR_STATE(path, min_ray_pdf));
+
+ path_state_next(INTEGRATOR_STATE_PASS, label);
+ return true;
+}
+
+/* Get the volume attenuation and emission over the line segment defined by
+ * the ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. Distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device VolumeIntegrateEvent volume_integrate(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ShaderData sd;
+ shader_setup_from_volume(kg, &sd, ray);
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* Sample light ahead of volume stepping, for equiangular sampling. */
+ /* TODO: distant lights are ignored now, but could instead use even distribution. */
+ LightSample ls ccl_optional_struct_init;
+ const bool need_light_sample = !(INTEGRATOR_STATE(path, flag) & PATH_RAY_TERMINATE);
+ const bool have_equiangular_sample = need_light_sample &&
+ integrate_volume_sample_light(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &ls) &&
+ (ls.t != FLT_MAX);
+
+ VolumeSampleMethod direct_sample_method = (have_equiangular_sample) ?
+ volume_stack_sample_method(INTEGRATOR_STATE_PASS) :
+ VOLUME_SAMPLE_DISTANCE;
+
+ /* Step through volume. */
+ const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
+ return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ /* TODO: expensive to zero closures? */
+ VolumeIntegrateResult result = {};
+ volume_integrate_heterogeneous(INTEGRATOR_STATE_PASS,
+ ray,
+ &sd,
+ &rng_state,
+ render_buffer,
+ step_size,
+ direct_sample_method,
+ ls.P,
+ result);
+
+ /* Perform path termination. The intersect_closest kernel will have already marked this
+ * path to be terminated; that causes the shader evaluation to leave out any scattering
+ * closures, but emission and absorption are still handled for multiple importance sampling. */
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ?
+ 0.0f :
+ path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+ path_flag);
+ if (probability == 0.0f) {
+ return VOLUME_PATH_MISSED;
+ }
+
+ /* Direct light. */
+ if (result.direct_scatter) {
+ const float3 direct_P = ray->P + result.direct_t * ray->D;
+ result.direct_throughput /= probability;
+ integrate_volume_direct_light(INTEGRATOR_STATE_PASS,
+ &sd,
+ &rng_state,
+ direct_P,
+ &result.direct_phases,
+ result.direct_throughput,
+ &ls);
+ }
+
+ /* Indirect light.
+ *
+ * Only divide throughput by probability if we scatter. For the attenuation
+ * case the next surface will already do this division. */
+ if (result.indirect_scatter) {
+ result.indirect_throughput /= probability;
+ }
+ INTEGRATOR_STATE_WRITE(path, throughput) = result.indirect_throughput;
+
+ if (result.indirect_scatter) {
+ sd.P = ray->P + result.indirect_t * ray->D;
+
+ if (integrate_volume_phase_scatter(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &result.indirect_phases)) {
+ return VOLUME_PATH_SCATTERED;
+ }
+ else {
+ return VOLUME_PATH_MISSED;
+ }
+ }
+ else {
+ return VOLUME_PATH_ATTENUATED;
+ }
+}
+
+#endif
+
+ccl_device void integrator_shade_volume(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_SETUP);
+
+#ifdef __VOLUME__
+ /* Setup shader data. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Set ray length to current segment. */
+ ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;
+
+ /* Clean volume stack for background rays. */
+ if (isect.prim == PRIM_NONE) {
+ volume_stack_clean(INTEGRATOR_STATE_PASS);
+ }
+
+ VolumeIntegrateEvent event = volume_integrate(INTEGRATOR_STATE_PASS, &ray, render_buffer);
+
+ if (event == VOLUME_PATH_SCATTERED) {
+ /* Queue intersect_closest kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ else if (event == VOLUME_PATH_MISSED) {
+ /* End path. */
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ return;
+ }
+ else {
+ /* Continue to background, light or surface. */
+ if (isect.prim == PRIM_NONE) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+ else if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ }
+#endif /* __VOLUME__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
new file mode 100644
index 00000000000..8cef9cf31e2
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Integrator State
+ *
+ * This file defines the data structures that define the state of a path. Any state that is
+ * preserved and passed between kernel executions is part of this.
+ *
+ * The size of this state must be kept as small as possible, to reduce cache misses and keep memory
+ * usage under control on GPUs that may execute millions of kernels.
+ *
+ * Memory may be allocated and passed along in different ways depending on the device. There may
+ * be a scalar layout, or AoS or SoA layout for batches. The state may be passed along as a pointer
+ * to every kernel, or the pointer may exist at program scope or in constant memory. To abstract
+ * these differences between devices and experiment with different layouts, macros are used.
+ *
+ * INTEGRATOR_STATE_ARGS: prepend to argument definitions for every function that accesses
+ * path state.
+ * INTEGRATOR_STATE_CONST_ARGS: same as INTEGRATOR_STATE_ARGS, when state is read-only
+ * INTEGRATOR_STATE_PASS: use to pass along state to other functions that access it.
+ *
+ * INTEGRATOR_STATE(x, y): read nested struct member x.y of IntegratorState
+ * INTEGRATOR_STATE_WRITE(x, y): write to nested struct member x.y of IntegratorState
+ *
+ * INTEGRATOR_STATE_ARRAY(x, index, y): read x[index].y
+ * INTEGRATOR_STATE_ARRAY_WRITE(x, index, y): write x[index].y
+ *
+ * INTEGRATOR_STATE_COPY(to_x, from_x): copy contents of one nested struct to another
+ *
+ * INTEGRATOR_STATE_IS_NULL: test if any integrator state is available, for shader evaluation
+ * INTEGRATOR_STATE_PASS_NULL: use to pass empty state to other functions.
+ *
+ * NOTE: if we end up with a device that passes no arguments, the leading comma will be a
+ * problem. We could solve it with more macros if we encounter it, but that is rather ugly,
+ * so postpone for now.
+ */
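+
+/* For example (purely illustrative), a device function that advances the bounce counter
+ * through these macros could look like:
+ *
+ *   ccl_device void example_next_bounce(INTEGRATOR_STATE_ARGS)
+ *   {
+ *     const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+ *     INTEGRATOR_STATE_WRITE(path, bounce) = bounce + 1;
+ *   }
+ */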
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_types.h"
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Constants
+ *
+ * TODO: these could be made dynamic depending on the features used in the scene. */
+
+#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
+#define INTEGRATOR_SHADOW_ISECT_SIZE 4
+
+/* Data structures */
+
+/* Integrator State
+ *
+ * CPU rendering path state with AoS layout. */
+typedef struct IntegratorStateCPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+} IntegratorStateCPU;
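+
+/* For illustration, with the members declared in integrator_state_template.h the expansion
+ * above yields nested structs roughly of the form:
+ *
+ *   struct { float3 P; float3 D; float t; float time; float dP; float dD; } ray;
+ *   struct { int object; int shader; } volume_stack[INTEGRATOR_VOLUME_STACK_SIZE];
+ */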
+
+/* Path Queue
+ *
+ * Keep track of which kernels are queued to be executed next in the path
+ * for GPU rendering. */
+typedef struct IntegratorQueueCounter {
+ int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM];
+} IntegratorQueueCounter;
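+
+/* For example, num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] holds the number of paths
+ * that currently have the shade_surface kernel queued. */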
+
+/* Integrator State GPU
+ *
+ * GPU rendering path state with SoA layout. */
+typedef struct IntegratorStateGPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+
+ /* Count number of queued kernels. */
+ IntegratorQueueCounter *queue_counter;
+
+ /* Count number of kernels queued for specific shaders. */
+ int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
+
+ /* Index of path which will be used by a next shadow catcher split. */
+ int *next_shadow_catcher_path_index;
+} IntegratorStateGPU;
+
+/* Abstraction
+ *
+ * Macros to access data structures on different devices.
+ *
+ * Note that there is a special access function for the shadow catcher state. It is only
+ * meant to be used from a kernel which operates on a "main" path. Attempting to use the
+ * shadow catcher accessors from a kernel which operates on a shadow catcher state will
+ * cause bad memory access. */
+
+#ifdef __KERNEL_CPU__
+
+/* Scalar access on CPU. */
+
+typedef IntegratorStateCPU *ccl_restrict IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ const IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, NULL
+# define INTEGRATOR_STATE_IS_NULL (state == NULL)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct.member)
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) (state->nested_struct.member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct[array_index].member)
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ ((state)->nested_struct[array_index].member)
+
+#else /* __KERNEL_CPU__ */
+
+/* Array access on GPU with Structure-of-Arrays. */
+
+typedef int IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, -1
+# define INTEGRATOR_STATE_IS_NULL (state == -1)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ kernel_integrator_state.nested_struct.member[state]
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) INTEGRATOR_STATE(nested_struct, member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ kernel_integrator_state.nested_struct[array_index].member[state]
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member)
+
+#endif /* __KERNEL_CPU__ */
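+
+/* For instance, INTEGRATOR_STATE(path, bounce) expands to roughly state->path.bounce with the
+ * scalar CPU layout above, and to kernel_integrator_state.path.bounce[state] with the SoA GPU
+ * layout. */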
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
new file mode 100644
index 00000000000..8477efd7b66
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_types.h"
+#include "util/util_atomic.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Control Flow
+ *
+ * Utilities for control flow between kernels. The implementation may differ per device
+ * or even be handled on the host side. To abstract such differences, to allow experimenting
+ * with different implementations, and for debugging, this is wrapped in macros.
+ *
+ * There is a main path for regular path tracing from the camera. Shadow rays for next
+ * event estimation branch off from this into their own path, which may be computed in
+ * parallel while the main path continues.
+ *
+ * Each kernel on the main path must call exactly one of these functions, and may not call
+ * it more than once:
+ *
+ * INTEGRATOR_PATH_INIT(next_kernel)
+ * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
+ * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ *
+ * The _SORTED variants additionally take a shader sort key, so that paths queued for the
+ * same kernel can be grouped by shader.
+ *
+ * For the shadow path similar functions are used (INTEGRATOR_SHADOW_PATH_*), and again each
+ * shadow kernel must call one of them, and only once.
+ */
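+
+/* For example (purely illustrative), a kernel on the main path that either continues with the
+ * closest-hit intersection or ends the path would do:
+ *
+ *   if (continue_path) {
+ *     INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ *                          DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ *   }
+ *   else {
+ *     INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE);
+ *   }
+ */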
+
+#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(path, queued_kernel) == 0)
+#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0)
+
+#ifdef __KERNEL_GPU__
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+
+#else
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ }
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ (void)current_kernel; \
+ }
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
new file mode 100644
index 00000000000..41dd1bfcdbf
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -0,0 +1,163 @@
+
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/************************************ Path State *****************************/
+
+KERNEL_STRUCT_BEGIN(path)
+/* Index of a pixel within the device render buffer where this path will write its result.
+ * To get the actual offset within the buffer, the value needs to be multiplied by
+ * `kernel_data.film.pass_stride`, i.e. buffer_offset = render_pixel_index * pass_stride.
+ *
+ * The multiplication is deferred so that the state can use a 32-bit integer. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
+/* Current sample number. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current diffuse ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current glossy ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transmission ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume bounds ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel value of the currently queued kernel, 0 when no kernel is queued.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* Random number generator seed. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING)
+/* Random number dimension offset. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_offset, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Multiple importance sampling
+ * The PDF of BSDF sampling at the last scatter point, and distance to the
+ * last scatter point minus the last ray segment. This distance lets us
+ * compute the complete distance through transparent surfaces and volumes. */
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING)
+/* Filter glossy. */
+KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Denoising. */
+KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+/* Shader sorting. */
+/* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(path)
+
+/************************************** Ray ***********************************/
+
+KERNEL_STRUCT_BEGIN(ray)
+KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(ray)
+
+/*************************** Intersection result ******************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(isect)
+KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_MEMBER(isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(isect)
+
+/*************** Subsurface closure state for subsurface kernel ***************/
+
+KERNEL_STRUCT_BEGIN(subsurface)
+KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, roughness, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_END(subsurface)
+
+/********************************** Volume Stack ******************************/
+
+KERNEL_STRUCT_BEGIN(volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+
+/********************************* Shadow Path State **************************/
+
+KERNEL_STRUCT_BEGIN(shadow_path)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel value of the currently queued kernel, 0 when no kernel is queued.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput for shadow pass. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Number of intersections found by ray-tracing. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_path)
+
+/********************************** Shadow Ray *******************************/
+
+KERNEL_STRUCT_BEGIN(shadow_ray)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_ray)
+
+/*********************** Shadow Intersection result **************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(shadow_isect)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE)
+
+/**************************** Shadow Volume Stack *****************************/
+
+KERNEL_STRUCT_BEGIN(shadow_volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
new file mode 100644
index 00000000000..cdf412fe22f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/kernel_differential.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray */
+
+ccl_device_forceinline void integrator_state_write_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(ray, dP) = ray->dP;
+ INTEGRATOR_STATE_WRITE(ray, dD) = ray->dD;
+}
+
+ccl_device_forceinline void integrator_state_read_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(ray, P);
+ ray->D = INTEGRATOR_STATE(ray, D);
+ ray->t = INTEGRATOR_STATE(ray, t);
+ ray->time = INTEGRATOR_STATE(ray, time);
+ ray->dP = INTEGRATOR_STATE(ray, dP);
+ ray->dD = INTEGRATOR_STATE(ray, dD);
+}
+
+/* Shadow Ray */
+
+ccl_device_forceinline void integrator_state_write_shadow_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(shadow_ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(shadow_ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(shadow_ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(shadow_ray, dP) = ray->dP;
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(shadow_ray, P);
+ ray->D = INTEGRATOR_STATE(shadow_ray, D);
+ ray->t = INTEGRATOR_STATE(shadow_ray, t);
+ ray->time = INTEGRATOR_STATE(shadow_ray, time);
+ ray->dP = INTEGRATOR_STATE(shadow_ray, dP);
+ ray->dD = differential_zero_compact();
+}
+
+/* Intersection */
+
+ccl_device_forceinline void integrator_state_write_isect(INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect)
+{
+ INTEGRATOR_STATE_WRITE(isect, t) = isect->t;
+ INTEGRATOR_STATE_WRITE(isect, u) = isect->u;
+ INTEGRATOR_STATE_WRITE(isect, v) = isect->v;
+ INTEGRATOR_STATE_WRITE(isect, object) = isect->object;
+ INTEGRATOR_STATE_WRITE(isect, prim) = isect->prim;
+ INTEGRATOR_STATE_WRITE(isect, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_WRITE(isect, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect)
+{
+ isect->prim = INTEGRATOR_STATE(isect, prim);
+ isect->object = INTEGRATOR_STATE(isect, object);
+ isect->type = INTEGRATOR_STATE(isect, type);
+ isect->u = INTEGRATOR_STATE(isect, u);
+ isect->v = INTEGRATOR_STATE(isect, v);
+ isect->t = INTEGRATOR_STATE(isect, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE(isect, Ng);
+#endif
+}
+
+ccl_device_forceinline VolumeStack integrator_state_read_volume_stack(INTEGRATOR_STATE_CONST_ARGS,
+ int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline void integrator_state_write_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, shader) = entry.shader;
+}
+
+ccl_device_forceinline bool integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+/* Shadow Intersection */
+
+ccl_device_forceinline void integrator_state_write_shadow_isect(
+ INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect, const int index)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, t) = isect->t;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, u) = isect->u;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, v) = isect->v;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, object) = isect->object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, prim) = isect->prim;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect,
+ const int index)
+{
+ isect->prim = INTEGRATOR_STATE_ARRAY(shadow_isect, index, prim);
+ isect->object = INTEGRATOR_STATE_ARRAY(shadow_isect, index, object);
+ isect->type = INTEGRATOR_STATE_ARRAY(shadow_isect, index, type);
+ isect->u = INTEGRATOR_STATE_ARRAY(shadow_isect, index, u);
+ isect->v = INTEGRATOR_STATE_ARRAY(shadow_isect, index, v);
+ isect->t = INTEGRATOR_STATE_ARRAY(shadow_isect, index, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE_ARRAY(shadow_isect, index, Ng);
+#endif
+}
+
+ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) {
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, object);
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, shader);
+ }
+ }
+}
+
+ccl_device_forceinline VolumeStack
+integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_CONST_ARGS, int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty(
+ INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+ccl_device_forceinline void integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = entry.shader;
+}
+
+#if defined(__KERNEL_GPU__)
+ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ int index;
+
+ /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+# define KERNEL_STRUCT_BEGIN(name) \
+ index = 0; \
+ do {
+
+# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct.name != nullptr) { \
+ kernel_integrator_state.parent_struct.name[to_state] = \
+ kernel_integrator_state.parent_struct.name[state]; \
+ }
+
+# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+ kernel_integrator_state.parent_struct[index].name[to_state] = \
+ kernel_integrator_state.parent_struct[index].name[state]; \
+ }
+
+# define KERNEL_STRUCT_END(name) \
+ } \
+ while (false) \
+ ;
+
+# define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ ++index; \
+ } \
+ while (index < array_size) \
+ ;
+
+# include "kernel/integrator/integrator_state_template.h"
+
+# undef KERNEL_STRUCT_BEGIN
+# undef KERNEL_STRUCT_MEMBER
+# undef KERNEL_STRUCT_ARRAY_MEMBER
+# undef KERNEL_STRUCT_END
+# undef KERNEL_STRUCT_END_ARRAY
+}
+
+ccl_device_inline void integrator_state_move(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ integrator_state_copy_only(to_state, state);
+
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
+
+#endif
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. */
+ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_ARGS)
+{
+#if defined(__KERNEL_GPU__)
+ const IntegratorState to_state = atomic_fetch_and_add_uint32(
+ &kernel_integrator_state.next_shadow_catcher_path_index[0], 1);
+
+ integrator_state_copy_only(to_state, state);
+
+ kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS;
+
+ /* Sanity check: expect to split in the intersect-closest kernel, where there is no shadow ray
+ * and no sorting yet. */
+ kernel_assert(INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ kernel_assert(kernel_integrator_state.sort_key_counter[INTEGRATOR_STATE(path, queued_kernel)] ==
+ nullptr);
+#else
+
+ IntegratorStateCPU *ccl_restrict split_state = state + 1;
+
+ *split_state = *state;
+
+ split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS;
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
new file mode 100644
index 00000000000..9490738404e
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -0,0 +1,623 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/bvh/bvh.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
+#include "kernel/closure/volume.h"
+
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+
+ccl_device int subsurface_bounce(INTEGRATOR_STATE_ARGS, ShaderData *sd, const ShaderClosure *sc)
+{
+ /* We should never have two consecutive BSSRDF bounces, the second one should
+ * be converted to a diffuse BSDF to avoid this. */
+ kernel_assert(!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DIFFUSE_ANCESTOR));
+
+ /* Setup path state for intersect_subsurface kernel. */
+ const Bssrdf *bssrdf = (const Bssrdf *)sc;
+
+ /* Setup ray into surface. */
+ INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = sd->N;
+ INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_zero_compact();
+
+ /* Pass along object info, reusing isect to save memory. */
+ INTEGRATOR_STATE_WRITE(isect, Ng) = sd->Ng;
+ INTEGRATOR_STATE_WRITE(isect, object) = sd->object;
+
+ /* Pass BSSRDF parameters. */
+ const uint32_t path_flag = INTEGRATOR_STATE_WRITE(path, flag);
+ INTEGRATOR_STATE_WRITE(path, flag) = (path_flag & ~PATH_RAY_CAMERA) | PATH_RAY_SUBSURFACE;
+ INTEGRATOR_STATE_WRITE(path, throughput) *= shader_bssrdf_sample_weight(sd, sc);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(subsurface, albedo) = bssrdf->albedo;
+ INTEGRATOR_STATE_WRITE(subsurface, radius) = bssrdf->radius;
+ INTEGRATOR_STATE_WRITE(subsurface, roughness) = bssrdf->roughness;
+ INTEGRATOR_STATE_WRITE(subsurface, anisotropy) = bssrdf->anisotropy;
+
+ return LABEL_SUBSURFACE_SCATTER;
+}
+
+ccl_device void subsurface_shader_data_setup(INTEGRATOR_STATE_ARGS, ShaderData *sd)
+{
+ /* Get bump mapped normal from shader evaluation at exit point. */
+ float3 N = sd->N;
+ if (sd->flag & SD_HAS_BSSRDF_BUMP) {
+ N = shader_bssrdf_normal(sd);
+ }
+
+ /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. */
+ sd->flag &= ~SD_CLOSURE_FLAGS;
+ sd->num_closure = 0;
+ sd->num_closure_left = kernel_data.max_closures;
+
+ const float3 weight = one_float3();
+ const float roughness = INTEGRATOR_STATE(subsurface, roughness);
+
+# ifdef __PRINCIPLED__
+ if (roughness != FLT_MAX) {
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
+ sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular Disney principled diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ }
+ else
+# endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ sd->flag |= bsdf_diffuse_setup(bsdf);
+
+ /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular diffuse closure */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
+ }
+}
+
+/* Random walk subsurface scattering.
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+
+/* Support for anisotropy from:
+ * "Path Traced Subsurface Scattering using Anisotropic Phase Functions
+ * and Non-Exponential Free Flights".
+ * Magnus Wrenninge, Ryusuke Villemin, Christophe Hery.
+ * https://graphics.pixar.com/library/PathTracedSubsurface/ */
+
+ccl_device void subsurface_random_walk_remap(
+ const float albedo, const float d, float g, float *sigma_t, float *alpha)
+{
+ /* Compute attenuation and scattering coefficients from albedo. */
+ const float g2 = g * g;
+ const float g3 = g2 * g;
+ const float g4 = g3 * g;
+ const float g5 = g4 * g;
+ const float g6 = g5 * g;
+ const float g7 = g6 * g;
+
+ const float A = 1.8260523782f + -1.28451056436f * g + -1.79904629312f * g2 +
+ 9.19393289202f * g3 + -22.8215585862f * g4 + 32.0234874259f * g5 +
+ -23.6264803333f * g6 + 7.21067002658f * g7;
+ const float B = 4.98511194385f +
+ 0.127355959438f *
+ expf(31.1491581433f * g + -201.847017512f * g2 + 841.576016723f * g3 +
+ -2018.09288505f * g4 + 2731.71560286f * g5 + -1935.41424244f * g6 +
+ 559.009054474f * g7);
+ const float C = 1.09686102424f + -0.394704063468f * g + 1.05258115941f * g2 +
+ -8.83963712726f * g3 + 28.8643230661f * g4 + -46.8802913581f * g5 +
+ 38.5402837518f * g6 + -12.7181042538f * g7;
+ const float D = 0.496310210422f + 0.360146581622f * g + -2.15139309747f * g2 +
+ 17.8896899217f * g3 + -55.2984010333f * g4 + 82.065982243f * g5 +
+ -58.5106008578f * g6 + 15.8478295021f * g7;
+ const float E = 4.23190299701f +
+ 0.00310603949088f *
+ expf(76.7316253952f * g + -594.356773233f * g2 + 2448.8834203f * g3 +
+ -5576.68528998f * g4 + 7116.60171912f * g5 + -4763.54467887f * g6 +
+ 1303.5318055f * g7);
+ const float F = 2.40602999408f + -2.51814844609f * g + 9.18494908356f * g2 +
+ -79.2191708682f * g3 + 259.082868209f * g4 + -403.613804597f * g5 +
+ 302.85712436f * g6 + -87.4370473567f * g7;
+
+ const float blend = powf(albedo, 0.25f);
+
+ *alpha = (1.0f - blend) * A * powf(atanf(B * albedo), C) +
+ blend * D * powf(atanf(E * albedo), F);
+ *alpha = clamp(*alpha, 0.0f, 0.999999f); // because of numerical precision
+
+ float sigma_t_prime = 1.0f / fmaxf(d, 1e-16f);
+ *sigma_t = sigma_t_prime / (1.0f - g);
+}
+
+ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
+ const float3 radius,
+ const float anisotropy,
+ float3 *sigma_t,
+ float3 *alpha,
+ float3 *throughput)
+{
+ float sigma_t_x, sigma_t_y, sigma_t_z;
+ float alpha_x, alpha_y, alpha_z;
+
+ subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x);
+ subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y);
+ subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z);
+
+ /* Throughput already contains closure weight at this point, which includes the
+ * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo
+ * which will be added through scattering. */
+ *throughput = safe_divide_color(*throughput, albedo);
+
+ /* With low albedo values (like 0.025) we get diffusion_length 1.0 and
+ * infinite phase functions. To avoid a sharp discontinuity as we go from
+ * such values to 0.0, increase alpha and reduce the throughput to compensate. */
+ const float min_alpha = 0.2f;
+ if (alpha_x < min_alpha) {
+ (*throughput).x *= alpha_x / min_alpha;
+ alpha_x = min_alpha;
+ }
+ if (alpha_y < min_alpha) {
+ (*throughput).y *= alpha_y / min_alpha;
+ alpha_y = min_alpha;
+ }
+ if (alpha_z < min_alpha) {
+ (*throughput).z *= alpha_z / min_alpha;
+ alpha_z = min_alpha;
+ }
+
+ *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
+ *alpha = make_float3(alpha_x, alpha_y, alpha_z);
+}
+
+/* References for Dwivedi sampling:
+ *
+ * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
+ * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
+ * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
+ *
+ * [2] "Improving the Dwivedi Sampling Scheme"
+ * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
+ * https://cg.ivd.kit.edu/1951.php
+ *
+ * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
+ * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
+ * https://iliyan.com/publications/RenderingCourse2020
+ */
+
+ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
+{
+ /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
+ return 1.0f / ((v - cos_theta) * phase_log);
+}
+
+ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
+{
+ /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
+ * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
+ * we can rewrite the power as
+ * `pow((v - 1) / (v + 1), rand) = exp(rand * log((v - 1) / (v + 1))) = exp(-rand * phase_log)`
+ * and implement it with a single exp() call. */
+ return v - (v + 1.0f) * expf(-rand * phase_log);
+}
+
+ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
+{
+ /* Eq. 67 from [3] */
+ return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
+}
+
+ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
+{
+ float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
+ float phi = M_2PI_F * randv;
+ float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
+
+ float3 T, B;
+ make_orthonormals(D, &T, &B);
+ return dir.x * T + dir.y * B + dir.z * D;
+}
+
+ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
+ float t,
+ bool hit,
+ float3 *transmittance)
+{
+ float3 T = volume_color_transmittance(sigma_t, t);
+ if (transmittance) {
+ *transmittance = T;
+ }
+ return hit ? T : sigma_t * T;
+}
+
+/* Define the variable below to activate the similarity code;
+ * its value is the cutoff bounce level. */
+# define SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL 9
+
+ccl_device_inline bool subsurface_random_walk(INTEGRATOR_STATE_ARGS,
+ RNGState rng_state,
+ Ray &ray,
+ LocalIntersection &ss_isect)
+{
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+
+ const float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 N = INTEGRATOR_STATE(ray, D);
+ const float ray_dP = INTEGRATOR_STATE(ray, dP);
+ const float time = INTEGRATOR_STATE(ray, time);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const int object = INTEGRATOR_STATE(isect, object);
+
+ /* Sample diffuse surface scatter into the object. */
+ float3 D;
+ float pdf;
+ sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf);
+ if (dot(-Ng, D) <= 0.0f) {
+ return false;
+ }
+
+ /* Setup ray. */
+ ray.P = ray_offset(P, -Ng);
+ ray.D = D;
+ ray.t = FLT_MAX;
+ ray.time = time;
+ ray.dP = ray_dP;
+ ray.dD = differential_zero_compact();
+
+# ifndef __KERNEL_OPTIX__
+ /* Compute or fetch object transforms. */
+ Transform ob_itfm ccl_optional_struct_init;
+ Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm);
+# endif
+
+ /* Convert subsurface to volume coefficients.
+ * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
+ const float3 albedo = INTEGRATOR_STATE(subsurface, albedo);
+ const float3 radius = INTEGRATOR_STATE(subsurface, radius);
+ const float anisotropy = INTEGRATOR_STATE(subsurface, anisotropy);
+
+ float3 sigma_t, alpha;
+ float3 throughput = INTEGRATOR_STATE_WRITE(path, throughput);
+ subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput);
+ float3 sigma_s = sigma_t * alpha;
+
+ /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
+ * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
+ * for making the code significantly more complex and slower (if direction sampling depends on
+ * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
+ *
+ * Since the strength of the guided sampling increases as alpha gets lower, using a value that
+ * is too low results in fireflies while one that's too high just gives a bit more noise.
+ * Therefore, the code here uses the highest of the three albedos to be safe. */
+ const float diffusion_length = diffusion_length_dwivedi(max3(alpha));
+
+ if (diffusion_length == 1.0f) {
+ /* For certain values of alpha the diffusion length becomes 1, which asymptotically makes the
+ * phase function infinite. After the first bounce this would make the throughput 0. Return
+ * early to avoid numerical issues and extra unneeded work. */
+ return false;
+ }
+
+ /* Precompute term for phase sampling. */
+ const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f));
+
+ /* Modify state for RNGs, decorrelated from other paths. */
+ rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
+
+ /* Random walk until we hit the surface again. */
+ bool hit = false;
+ bool have_opposite_interface = false;
+ float opposite_distance = 0.0f;
+
+ /* TODO: Disable for alpha > 0.999 or so? */
+ /* Our heuristic, a compromise between guiding and classic. */
+ const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f));
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ float3 sigma_s_star = sigma_s * (1.0f - anisotropy);
+ float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star;
+ float3 sigma_t_org = sigma_t;
+ float3 sigma_s_org = sigma_s;
+ const float anisotropy_org = anisotropy;
+ const float guided_fraction_org = guided_fraction;
+# endif
+
+ for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
+ /* Advance random number offset. */
+ rng_state.rng_offset += PRNG_BOUNCE_NUM;
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ // shadow with local variables according to depth
+ float anisotropy, guided_fraction;
+ float3 sigma_s, sigma_t;
+ if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) {
+ anisotropy = anisotropy_org;
+ guided_fraction = guided_fraction_org;
+ sigma_t = sigma_t_org;
+ sigma_s = sigma_s_org;
+ }
+ else {
+ anisotropy = 0.0f;
+ guided_fraction = 0.75f; // back to isotropic heuristic from Blender
+ sigma_t = sigma_t_star;
+ sigma_s = sigma_s_star;
+ }
+# endif
+
+ /* Sample color channel, use MIS with balance heuristic. */
+ float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL);
+ float3 channel_pdf;
+ int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
+ float sample_sigma_t = volume_channel_get(sigma_t, channel);
+ float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE);
+
+ /* We need the result of the raycast to compute the full guided PDF, so just remember the
+ * relevant terms to avoid recomputing them later. */
+ float backward_fraction = 0.0f;
+ float forward_pdf_factor = 0.0f;
+ float forward_stretching = 1.0f;
+ float backward_pdf_factor = 0.0f;
+ float backward_stretching = 1.0f;
+
+ /* For the initial ray, we already know the direction, so just do classic distance sampling. */
+ if (bounce > 0) {
+ /* Decide whether we should use guided or classic sampling. */
+ bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction);
+
+ /* Determine if we want to sample away from the incoming interface.
+ * This only happens if we found a nearby opposite interface, and the probability for it
+ * depends on how close we are to it already.
+ * This probability term comes from the recorded presentation of [3]. */
+ bool guide_backward = false;
+ if (have_opposite_interface) {
+ /* Compute distance of the random walk between the tangent plane at the starting point
+ * and the assumed opposite interface (the parallel plane that contains the point we
+ * found in our ray query for the opposite side). */
+ float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance);
+ backward_fraction = 1.0f /
+ (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length));
+ guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction;
+ }
+
+ /* Sample scattering direction. */
+ float scatter_u, scatter_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v);
+ float cos_theta;
+ float hg_pdf;
+ if (guided) {
+ cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
+ /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
+ * sign here is enough to sample from that instead. */
+ if (guide_backward) {
+ cos_theta = -cos_theta;
+ }
+ float3 newD = direction_from_cosine(N, cos_theta, scatter_v);
+ hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy);
+ ray.D = newD;
+ }
+ else {
+ float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf);
+ cos_theta = dot(newD, N);
+ ray.D = newD;
+ }
+
+ /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
+ * Since phase sampling is channel-independent, we can get away with applying a factor
+ * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
+ * it cancel with an equivalent term in the numerator of the full estimator.
+ * For the backward PDF, we again reuse the same probability distribution with a sign swap.
+ */
+ forward_pdf_factor = M_1_2PI_F * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta) /
+ hg_pdf;
+ backward_pdf_factor = M_1_2PI_F *
+ eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta) / hg_pdf;
+
+ /* Prepare distance sampling.
+ * For the backwards case, this also needs the sign swapped since now directions against
+ * sd->N (and therefore with negative cos_theta) are preferred. */
+ forward_stretching = (1.0f - cos_theta / diffusion_length);
+ backward_stretching = (1.0f + cos_theta / diffusion_length);
+ if (guided) {
+ sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
+ }
+ }
+
+ /* Sample direction along ray. */
+ float t = -logf(1.0f - randt) / sample_sigma_t;
+
+ /* On the first bounce, we use the raycast to check if the opposite side is nearby.
+ * If yes, we will later use backwards guided sampling in order to have a decent
+ * chance of connecting to it.
+ * TODO: Maybe use less than 10 times the mean free path? */
+ ray.t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
+ scene_intersect_local(kg, &ray, &ss_isect, object, NULL, 1);
+ hit = (ss_isect.num_hits > 0);
+
+ if (hit) {
+# ifdef __KERNEL_OPTIX__
+ /* t is always in world space with OptiX. */
+ ray.t = ss_isect.hits[0].t;
+# else
+ /* Compute world space distance to surface hit. */
+ float3 D = transform_direction(&ob_itfm, ray.D);
+ D = normalize(D) * ss_isect.hits[0].t;
+ ray.t = len(transform_direction(&ob_tfm, D));
+# endif
+ }
+
+ if (bounce == 0) {
+ /* Check if we hit the opposite side. */
+ if (hit) {
+ have_opposite_interface = true;
+ opposite_distance = dot(ray.P + ray.t * ray.D - P, -N);
+ }
+ /* Apart from the opposite side check, we were supposed to only trace up to distance t,
+ * so check if there would have been a hit in that case. */
+ hit = ray.t < t;
+ }
+
+ /* Use the distance to the exit point for the throughput update if we found one. */
+ if (hit) {
+ t = ray.t;
+ }
+ else if (bounce == 0) {
+ /* Restore original position if nothing was hit after the first bounce,
+ * without the ray_offset() that was added to avoid self-intersection.
+ * Otherwise if that offset is relatively large compared to the scattering
+ * radius, we never go back up high enough to exit the surface. */
+ ray.P = P;
+ }
+
+ /* Advance to new scatter location. */
+ ray.P += t * ray.D;
+
+ float3 transmittance;
+ float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
+ if (bounce > 0) {
+ /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
+ float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
+
+ if (have_opposite_interface) {
+ /* First step of MIS: Depending on geometry we might have two methods for guided
+ * sampling, so perform MIS between them. */
+ float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
+ guided_pdf = mix(
+ guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
+ }
+ else {
+ /* Just include phase sampling factor otherwise. */
+ guided_pdf *= forward_pdf_factor;
+ }
+
+ /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
+ pdf = mix(pdf, guided_pdf, guided_fraction);
+ }
+
+ /* Finally, we're applying MIS again to combine the three color channels.
+ * Altogether, the MIS computation combines up to nine different estimators:
+ * {classic, guided, backward_guided} x {r, g, b} */
+ throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
+
+ if (hit) {
+ /* If we hit the surface, we are done. */
+ break;
+ }
+ else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.y < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.z < VOLUME_THROUGHPUT_EPSILON) {
+ /* Avoid unnecessary work and precision issue when throughput gets really small. */
+ break;
+ }
+ }
+
+ if (hit) {
+ kernel_assert(isfinite3_safe(throughput));
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
+ }
+
+ return hit;
+}
+
+ccl_device_inline bool subsurface_scatter(INTEGRATOR_STATE_ARGS)
+{
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ Ray ray ccl_optional_struct_init;
+ LocalIntersection ss_isect ccl_optional_struct_init;
+
+ if (!subsurface_random_walk(INTEGRATOR_STATE_PASS, rng_state, ray, ss_isect)) {
+ return false;
+ }
+
+# ifdef __VOLUME__
+ /* Update volume stack if needed. */
+ if (kernel_data.integrator.use_volumes) {
+ const int object = intersection_get_object(kg, &ss_isect.hits[0]);
+ const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) {
+ float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const float3 offset_P = ray_offset(P, -Ng);
+
+ integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_PASS, offset_P, ray.P);
+ }
+ }
+# endif /* __VOLUME__ */
+
+ /* Pretend ray is coming from the outside towards the exit point. This ensures
+ * correct front/back facing normals.
+ * TODO: find a more elegant solution? */
+ ray.P += ray.D * ray.t * 2.0f;
+ ray.D = -ray.D;
+
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &ss_isect.hits[0]);
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Advance random number offset for bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
+
+ const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ shader);
+ }
+
+ return true;
+}
+
+#endif /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h
new file mode 100644
index 00000000000..d53070095f0
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Volume Stack
+ *
+ * This is an array of object/shader IDs that the current segment of the path
+ * is inside of. */
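+
+/* For example, a path that has entered a smoke domain and then a glass object inside it
+ * conceptually carries a stack of the form
+ *   { {smoke_object, smoke_shader}, {glass_object, glass_shader}, {OBJECT_NONE, SHADER_NONE} },
+ * where the entry with SHADER_NONE terminates the stack (the names are illustrative). */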
+
+template<typename StackReadOp, typename StackWriteOp>
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ StackReadOp stack_read,
+ StackWriteOp stack_write)
+{
+ /* TODO: We should have some way for objects to indicate if they want the
+ * world shader to work inside them. Excluding it by default is problematic
+ * because non-volume objects can't be assumed to be closed manifolds. */
+ if (!(sd->flag & SD_HAS_VOLUME)) {
+ return;
+ }
+
+ if (sd->flag & SD_BACKFACING) {
+ /* Exit volume object: remove from stack. */
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ if (entry.object == sd->object) {
+ /* Shift back next stack entries. */
+ do {
+ entry = stack_read(i + 1);
+ stack_write(i, entry);
+ i++;
+ } while (entry.shader != SHADER_NONE);
+
+ return;
+ }
+ }
+ }
+ else {
+ /* Enter volume object: add to stack. */
+ int i;
+ for (i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ /* Already in the stack? Then we have nothing to do. */
+ if (entry.object == sd->object) {
+ return;
+ }
+ }
+
+ /* If we exceed the stack limit, ignore. */
+ if (i >= VOLUME_STACK_SIZE - 1) {
+ return;
+ }
+
+ /* Add to the end of the stack. */
+ const VolumeStack new_entry = {sd->object, sd->shader};
+ const VolumeStack empty_entry = {OBJECT_NONE, SHADER_NONE};
+ stack_write(i, new_entry);
+ stack_write(i + 1, empty_entry);
+ }
+}
+
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) { return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+ccl_device void shadow_volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+/* Clean stack after the last bounce.
+ *
+ * It is expected that all volumes are closed manifolds, so at the point where the
+ * ray hits nothing (for example, on the last bounce which goes to the environment)
+ * the only volume expected in the stack is the world's. All other volume entries
+ * should have been exited already.
+ *
+ * This isn't always true because of ray intersection precision issues, which can
+ * leave a non-world volume in the stack indefinitely, causing render artifacts.
+ *
+ * Use this function after the last bounce to get rid of all volumes apart from
+ * the world's one. */
+ccl_device_inline void volume_stack_clean(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ /* Keep the world's volume in stack. */
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
+ }
+ else {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = SHADER_NONE;
+ }
+}
+
+template<typename StackReadOp>
+ccl_device float volume_stack_step_size(INTEGRATOR_STATE_ARGS, StackReadOp stack_read)
+{
+ float step_size = FLT_MAX;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ bool heterogeneous = false;
+
+ if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
+ heterogeneous = true;
+ }
+ else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
+ /* We want to render the world or objects without any volume grids
+ * as homogeneous, but can only verify this at run-time since other
+ * heterogeneous volume objects may be using the same shader. */
+ int object = entry.object;
+ if (object != OBJECT_NONE) {
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
+ heterogeneous = true;
+ }
+ }
+ }
+
+ if (heterogeneous) {
+ float object_step_size = object_volume_step_size(kg, entry.object);
+ object_step_size *= kernel_data.integrator.volume_step_rate;
+ step_size = fminf(object_step_size, step_size);
+ }
+ }
+
+ return step_size;
+}
+
+typedef enum VolumeSampleMethod {
+ VOLUME_SAMPLE_NONE = 0,
+ VOLUME_SAMPLE_DISTANCE = (1 << 0),
+ VOLUME_SAMPLE_EQUIANGULAR = (1 << 1),
+ VOLUME_SAMPLE_MIS = (VOLUME_SAMPLE_DISTANCE | VOLUME_SAMPLE_EQUIANGULAR),
+} VolumeSampleMethod;
+
+ccl_device VolumeSampleMethod volume_stack_sample_method(INTEGRATOR_STATE_ARGS)
+{
+ VolumeSampleMethod method = VOLUME_SAMPLE_NONE;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ if (shader_flag & SD_VOLUME_MIS) {
+ /* Multiple importance sampling. */
+ return VOLUME_SAMPLE_MIS;
+ }
+ else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_DISTANCE) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Only equiangular sampling. */
+ method = VOLUME_SAMPLE_EQUIANGULAR;
+ }
+ else {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_EQUIANGULAR) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Distance sampling only. */
+ method = VOLUME_SAMPLE_DISTANCE;
+ }
+ }
+
+ return method;
+}
+
+CCL_NAMESPACE_END
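
The sample-method selection above effectively ORs the per-volume preferences together: a stack containing both a distance-sampled and an equiangular-sampled volume, or any volume flagged for MIS, resolves to multiple importance sampling. A small self-contained sketch of that flag logic (illustration only, not the actual Cycles shader-flag path):

#include <cassert>

enum VolumeSampleMethod {
  VOLUME_SAMPLE_NONE = 0,
  VOLUME_SAMPLE_DISTANCE = (1 << 0),
  VOLUME_SAMPLE_EQUIANGULAR = (1 << 1),
  VOLUME_SAMPLE_MIS = (VOLUME_SAMPLE_DISTANCE | VOLUME_SAMPLE_EQUIANGULAR),
};

/* Fold a single volume's preferred method into the running result. */
static VolumeSampleMethod combine(VolumeSampleMethod acc, VolumeSampleMethod entry) {
  return static_cast<VolumeSampleMethod>(acc | entry);
}

int main() {
  VolumeSampleMethod m = VOLUME_SAMPLE_NONE;
  m = combine(m, VOLUME_SAMPLE_DISTANCE);    /* First volume: distance sampling. */
  assert(m == VOLUME_SAMPLE_DISTANCE);
  m = combine(m, VOLUME_SAMPLE_EQUIANGULAR); /* Second volume: equiangular. */
  assert(m == VOLUME_SAMPLE_MIS);            /* Mixed stack resolves to MIS. */
  return 0;
}
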
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 61653d328f1..9e12d24dcf4 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -14,751 +14,501 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_adaptive_sampling.h"
+#include "kernel_random.h"
+#include "kernel_shadow_catcher.h"
+#include "kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-/* BSDF Eval
+/* --------------------------------------------------------------------
+ * BSDF Evaluation
*
- * BSDF evaluation result, split per BSDF type. This is used to accumulate
- * render passes separately. */
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd);
+ * BSDF evaluation result, split between diffuse and glossy. This is used to
+ * accumulate render passes separately. Note that reflection, transmission
+ * and volume scattering are written to different render passes, but we assume
+ * that only one of those can happen at a bounce, and so do not need to accumulate
+ * them separately. */
-ccl_device_inline void bsdf_eval_init(BsdfEval *eval,
- ClosureType type,
- float3 value,
- int use_light_pass)
+ccl_device_inline void bsdf_eval_init(BsdfEval *eval, const bool is_diffuse, float3 value)
{
-#ifdef __PASSES__
- eval->use_light_pass = use_light_pass;
-
- if (eval->use_light_pass) {
- eval->diffuse = zero_float3();
- eval->glossy = zero_float3();
- eval->transmission = zero_float3();
- eval->transparent = zero_float3();
- eval->volume = zero_float3();
-
- if (type == CLOSURE_BSDF_TRANSPARENT_ID)
- eval->transparent = value;
- else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse = value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy = value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission = value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume = value;
- }
- else
-#endif
- {
+ eval->diffuse = zero_float3();
+ eval->glossy = zero_float3();
+
+ if (is_diffuse) {
eval->diffuse = value;
}
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis = zero_float3();
-#endif
+ else {
+ eval->glossy = value;
+ }
}
ccl_device_inline void bsdf_eval_accum(BsdfEval *eval,
- ClosureType type,
+ const bool is_diffuse,
float3 value,
float mis_weight)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis += value;
-#endif
value *= mis_weight;
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse += value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy += value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission += value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume += value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse += value;
- }
-}
-ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
-{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) &&
- is_zero(eval->transparent) && is_zero(eval->volume);
+ if (is_diffuse) {
+ eval->diffuse += value;
}
- else
-#endif
- {
- return is_zero(eval->diffuse);
+ else {
+ eval->glossy += value;
}
}
-ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
+ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse *= value;
- }
+ return is_zero(eval->diffuse) && is_zero(eval->glossy);
}
ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
- bsdf_eval_mis(eval, value);
+ eval->diffuse *= value;
+ eval->glossy *= value;
}
ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
- eval->diffuse *= value;
-#else
eval->diffuse *= value;
-#endif
+ eval->glossy *= value;
}
ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return eval->diffuse + eval->glossy + eval->transmission + eval->volume;
- }
- else
-#endif
- return eval->diffuse;
+ return eval->diffuse + eval->glossy;
}
-/* Path Radiance
- *
- * We accumulate different render passes separately. After summing at the end
- * to get the combined result, it should be identical. We definite directly
- * visible as the first non-transparent hit, while indirectly visible are the
- * bounces after that. */
-
-ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L)
+ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(const BsdfEval *eval)
{
- /* clear all */
-#ifdef __PASSES__
- L->use_light_pass = kernel_data.film.use_light_pass;
-
- if (kernel_data.film.use_light_pass) {
- L->indirect = zero_float3();
- L->direct_emission = zero_float3();
-
- L->color_diffuse = zero_float3();
- L->color_glossy = zero_float3();
- L->color_transmission = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->transparent = 0.0f;
- L->emission = zero_float3();
- L->background = zero_float3();
- L->ao = zero_float3();
- L->shadow = zero_float3();
- L->mist = 0.0f;
-
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
- L->state.direct = zero_float3();
- }
- else
-#endif
- {
- L->transparent = 0.0f;
- L->emission = zero_float3();
- }
-
-#ifdef __SHADOW_TRICKS__
- L->path_total = zero_float3();
- L->path_total_shaded = zero_float3();
- L->shadow_background_color = zero_float3();
- L->shadow_throughput = 0.0f;
- L->shadow_transparency = 1.0f;
- L->has_shadow_catcher = 0;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- L->denoising_normal = zero_float3();
- L->denoising_albedo = zero_float3();
- L->denoising_depth = 0.0f;
-#endif
+ /* Ratio of diffuse to (diffuse + glossy), used to recover the per-pass proportions
+ * when writing to render passes. We assume reflection, transmission and volume
+ * scattering to be mutually exclusive. */
+ return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
}
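
Only a single throughput is carried along the path, so the diffuse and glossy pass writes later recover their share of a contribution from this ratio: diffuse gets ratio * contribution and glossy gets the remainder. A standalone numeric sketch with a minimal Float3 stand-in and made-up values:

#include <cstdio>

struct Float3 { float x, y, z; };

int main() {
  /* Pretend the BSDF sample produced these diffuse and glossy parts (made up). */
  const Float3 diffuse = {0.2f, 0.3f, 0.1f};
  const Float3 glossy = {0.6f, 0.1f, 0.3f};

  /* Per-channel ratio stored in the path state at the bounce. */
  const Float3 ratio = {diffuse.x / (diffuse.x + glossy.x),
                        diffuse.y / (diffuse.y + glossy.y),
                        diffuse.z / (diffuse.z + glossy.z)};

  /* Later a light contribution arrives carrying only the combined throughput. */
  const Float3 contribution = {1.6f, 0.8f, 0.8f};

  /* Diffuse pass gets ratio * contribution, glossy pass gets the remainder. */
  const Float3 diffuse_pass = {contribution.x * ratio.x,
                               contribution.y * ratio.y,
                               contribution.z * ratio.z};
  const Float3 glossy_pass = {contribution.x * (1.0f - ratio.x),
                              contribution.y * (1.0f - ratio.y),
                              contribution.z * (1.0f - ratio.z)};

  printf("diffuse pass: %g %g %g\n", diffuse_pass.x, diffuse_pass.y, diffuse_pass.z);
  printf("glossy pass:  %g %g %g\n", glossy_pass.x, glossy_pass.y, glossy_pass.z);
  return 0;
}
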
-ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg,
- PathRadianceState *L_state,
- ccl_addr_space float3 *throughput,
- BsdfEval *bsdf_eval,
- float bsdf_pdf,
- int bounce,
- int bsdf_label)
-{
- float inverse_pdf = 1.0f / bsdf_pdf;
-
-#ifdef __PASSES__
- if (kernel_data.film.use_light_pass) {
- if (bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
- /* first on directly visible surface */
- float3 value = *throughput * inverse_pdf;
-
- L_state->diffuse = bsdf_eval->diffuse * value;
- L_state->glossy = bsdf_eval->glossy * value;
- L_state->transmission = bsdf_eval->transmission * value;
- L_state->volume = bsdf_eval->volume * value;
-
- *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume;
+/* --------------------------------------------------------------------
+ * Clamping
+ *
+ * Clamping is done on a per-contribution basis so that we can write directly
+ * to render buffers instead of using per-thread memory, and to avoid the
+ * impact of clamping on other contributions. */
- L_state->direct = *throughput;
- }
- else {
- /* transparent bounce before first hit, or indirectly visible through BSDF */
- float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf;
- *throughput *= sum;
- }
+ccl_device_forceinline void kernel_accum_clamp(const KernelGlobals *kg, float3 *L, int bounce)
+{
+#ifdef __KERNEL_DEBUG_NAN__
+ if (!isfinite3_safe(*L)) {
+ kernel_assert(!"Cycles sample with non-finite value detected");
}
- else
#endif
- {
- *throughput *= bsdf_eval->diffuse * inverse_pdf;
- }
-}
+ /* Make sure all components are finite, so the contribution can be used by the adaptive
+ * sampling convergence check, and so the render result never causes issues in
+ * post-processing. */
+ *L = ensure_finite3(*L);
#ifdef __CLAMP_SAMPLE__
-ccl_device_forceinline void path_radiance_clamp(KernelGlobals *kg, float3 *L, int bounce)
-{
float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
kernel_data.integrator.sample_clamp_direct;
float sum = reduce_add(fabs(*L));
if (sum > limit) {
*L *= limit / sum;
}
+#endif
}
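
The clamp itself is a uniform rescale applied whenever the sum of absolute channel values exceeds the direct or indirect limit. A standalone sketch of the arithmetic, with assumed clamp limits rather than actual scene settings:

#include <cmath>
#include <cstdio>

/* Scale a color down uniformly so that |r|+|g|+|b| does not exceed `limit`. */
static void clamp_contribution(float rgb[3], float limit) {
  const float sum = fabsf(rgb[0]) + fabsf(rgb[1]) + fabsf(rgb[2]);
  if (sum > limit) {
    const float scale = limit / sum;
    rgb[0] *= scale;
    rgb[1] *= scale;
    rgb[2] *= scale;
  }
}

int main() {
  float direct[3] = {40.0f, 10.0f, 5.0f};
  float indirect[3] = {2.0f, 1.0f, 0.5f};
  clamp_contribution(direct, /* sample_clamp_direct, assumed value */ 20.0f);
  clamp_contribution(indirect, /* sample_clamp_indirect, assumed value */ 10.0f);
  printf("direct after clamp: %g %g %g\n", direct[0], direct[1], direct[2]);
  printf("indirect after clamp: %g %g %g\n", indirect[0], indirect[1], indirect[2]);
  return 0;
}
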
-ccl_device_forceinline void path_radiance_clamp_throughput(KernelGlobals *kg,
- float3 *L,
- float3 *throughput,
- int bounce)
-{
- float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
- kernel_data.integrator.sample_clamp_direct;
+/* --------------------------------------------------------------------
+ * Pass accumulation utilities.
+ */
- float sum = reduce_add(fabs(*L));
- if (sum > limit) {
- float clamp_factor = limit / sum;
- *L *= clamp_factor;
- *throughput *= clamp_factor;
- }
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
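
The render buffer addressed here is a flat array of pass_stride floats per pixel, with each pass at a fixed offset inside that per-pixel slice. A small sketch of the addressing, with made-up stride and pass offsets:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int width = 4, height = 2;
  const int pass_stride = 8;    /* Floats per pixel, assumed value. */
  const int pass_combined = 0;  /* Pass offsets within the pixel slice, assumed. */
  const int pass_sample_count = 4;

  std::vector<float> render_buffer(width * height * pass_stride, 0.0f);

  const uint32_t render_pixel_index = 1 * width + 2; /* Pixel at (x=2, y=1). */
  const uint64_t offset = (uint64_t)render_pixel_index * pass_stride;
  float *buffer = render_buffer.data() + offset;

  buffer[pass_combined + 0] += 0.5f; /* Accumulate red channel of the combined pass. */
  buffer[pass_sample_count] += 1.0f; /* Bump the per-pixel sample counter. */

  printf("pixel slice starts at float index %llu\n", (unsigned long long)offset);
  return 0;
}
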
-#endif
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
-ccl_device_inline void path_radiance_accum_emission(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device_inline int kernel_accum_sample(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer,
+ int sample)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
+ if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
+ return sample;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0)
- L->emission += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
- }
+ return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
}
-ccl_device_inline void path_radiance_accum_ao(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 alpha,
- float3 bsdf,
- float3 ao)
+ccl_device void kernel_accum_adaptive_buffer(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __PASSES__
- /* Store AO pass. */
- if (L->use_light_pass && state->bounce == 0) {
- L->ao += alpha * throughput * ao;
- }
-#endif
-
-#ifdef __SHADOW_TRICKS__
- /* For shadow catcher, accumulate ratio. */
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+ /* Adaptive sampling. Fill the additional buffer with the odd samples and calculate the stopping
+ * criterion. This is the heuristic from "A hierarchical automatic stopping condition for Monte
+ * Carlo global illumination", except that here it is applied per pixel and not in hierarchical
+ * tiles. */
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return;
}
-#endif
-
- float3 contribution = throughput * bsdf * ao;
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0) {
- /* Directly visible lighting. */
- L->direct_diffuse += contribution;
- }
- else {
- /* Indirectly visible lighting after BSDF bounce. */
- L->indirect += contribution;
- }
- }
- else
-#endif
- {
- L->emission += contribution;
+ const int sample = INTEGRATOR_STATE(path, sample);
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f));
}
}
-ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 bsdf)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf;
-#endif
-}
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+#ifdef __SHADOW_CATCHER__
-ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- BsdfEval *bsdf_eval,
- float3 shadow,
- float shadow_fac,
- bool is_lamp)
+/* Accumulate contribution to the Shadow Catcher pass.
+ *
+ * Returns true if the contribution is fully handled here and is not to be added to the other
+ * passes (like combined, adaptive sampling). */
+
+ccl_device bool kernel_accum_shadow_catcher(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
}
-#endif
- float3 shaded_throughput = throughput * shadow;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- /* Compute the clamping based on the total contribution.
- * The resulting scale is then be applied to all individual components. */
- float3 full_contribution = shaded_throughput * bsdf_eval_sum(bsdf_eval);
-# ifdef __CLAMP_SAMPLE__
- path_radiance_clamp_throughput(kg, &full_contribution, &shaded_throughput, state->bounce);
-# endif
-
- if (state->bounce == 0) {
- /* directly visible lighting */
- L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse;
- L->direct_glossy += shaded_throughput * bsdf_eval->glossy;
- L->direct_transmission += shaded_throughput * bsdf_eval->transmission;
- L->direct_volume += shaded_throughput * bsdf_eval->volume;
-
- if (is_lamp) {
- L->shadow += shadow * shadow_fac;
- }
- }
- else {
- /* indirectly visible lighting after BSDF bounce */
- L->indirect += full_contribution;
- }
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution);
+ /* NOTE: Also accumulate into the combined pass and the sample count pass, so that adaptive
+ * sampling is based on how noisy the combined pass is, as if there were no catchers in the
+ * scene. */
}
- else
-#endif
- {
- float3 contribution = shaded_throughput * bsdf_eval->diffuse;
- path_radiance_clamp(kg, &contribution, state->bounce);
- L->emission += contribution;
+
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-}
-ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- const BsdfEval *bsdf_eval)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf_eval->sum_no_mis;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf_eval;
-#endif
+ return false;
}
-ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device bool kernel_accum_shadow_catcher_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * value;
- L->path_total_shaded += throughput * value * L->shadow_transparency;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ return true;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_shadow_catcher_matte,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ /* NOTE: Also accumulate into the combined pass and the sample count pass, so that adaptive
+ * sampling is based on how noisy the combined pass is, as if there were no catchers in the
+ * scene. */
+ }
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)
- L->background += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+ /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the
+ * calculation, and the alpha channel of the pass contains the number of samples contributed
+ * to a pixel of the pass. */
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-#ifdef __DENOISING_FEATURES__
- L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput *
- value;
-#endif /* __DENOISING_FEATURES__ */
+ return false;
}
-ccl_device_inline void path_radiance_accum_transparent(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device void kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
- L->transparent += average(throughput);
-}
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_accum_shadowcatcher(PathRadiance *L,
- float3 throughput,
- float3 background)
-{
- L->shadow_throughput += average(throughput);
- L->shadow_background_color += throughput * background;
- L->has_shadow_catcher = 1;
-}
-#endif
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- /* this division is a bit ugly, but means we only have to keep track of
- * only a single throughput further along the path, here we recover just
- * the indirect path that is not influenced by any particular BSDF type */
- if (L->use_light_pass) {
- L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
- L->direct_diffuse += L->state.diffuse * L->direct_emission;
- L->direct_glossy += L->state.glossy * L->direct_emission;
- L->direct_transmission += L->state.transmission * L->direct_emission;
- L->direct_volume += L->state.volume * L->direct_emission;
-
- L->indirect = safe_divide_color(L->indirect, L->state.direct);
- L->indirect_diffuse += L->state.diffuse * L->indirect;
- L->indirect_glossy += L->state.glossy * L->indirect;
- L->indirect_transmission += L->state.transmission * L->indirect;
- L->indirect_volume += L->state.volume * L->indirect;
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent);
}
-#endif
}
-ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
+#endif /* __SHADOW_CATCHER__ */
+
+/* --------------------------------------------------------------------
+ * Render passes.
+ */
- L->direct_emission = zero_float3();
- L->indirect = zero_float3();
+/* Write combined pass. */
+ccl_device_inline void kernel_accum_combined_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
+{
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher(INTEGRATOR_STATE_PASS, contribution, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution);
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, const PathRadiance *L_src)
+/* Write combined pass with transparency. */
+ccl_device_inline void kernel_accum_combined_transparent_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict
+ buffer)
{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state = L_src->state;
-
- L->direct_emission = L_src->direct_emission;
- L->indirect = L_src->indirect;
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher_transparent(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_combined,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
- PathRadiance *L,
- float3 *L_sum,
- float *alpha)
+/* Write background or emission to appropriate pass. */
+ccl_device_inline void kernel_accum_emission_or_background_pass(INTEGRATOR_STATE_CONST_ARGS,
+ float3 contribution,
+ ccl_global float *ccl_restrict
+ buffer,
+ const int pass)
{
- /* Calculate current shadow of the path. */
- float path_total = average(L->path_total);
- float shadow;
+ if (!(kernel_data.film.light_pass_flag & PASS_ANY)) {
+ return;
+ }
- if (UNLIKELY(!isfinite_safe(path_total))) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite total radiance along the path");
-# endif
- shadow = 0.0f;
+#ifdef __PASSES__
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ /* Denoising albedo. */
+# ifdef __DENOISING_FEATURES__
+ if (path_flag & PATH_RAY_DENOISING_FEATURES) {
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = denoising_feature_throughput * contribution;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
}
- else if (path_total == 0.0f) {
- shadow = L->shadow_transparency;
+# endif /* __DENOISING_FEATURES__ */
+
+ if (!(path_flag & PATH_RAY_ANY_PASS)) {
+ /* Directly visible, write to emission or background pass. */
+ pass_offset = pass;
+ }
+ else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+ /* Indirectly visible through reflection. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput; reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ }
}
- else {
- float path_total_shaded = average(L->path_total_shaded);
- shadow = path_total_shaded / path_total;
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- /* Calculate final light sum and transparency for shadow catcher object. */
- if (kernel_data.background.transparent) {
- *alpha -= L->shadow_throughput * shadow;
- }
- else {
- L->shadow_background_color *= shadow;
- *L_sum += L->shadow_background_color;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
}
+#endif /* __PASSES__ */
}
-#endif
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
- PathRadiance *L,
- float *alpha)
+/* Write light contribution to render buffer. */
+ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
{
- float3 L_sum;
- /* Light Passes are used */
+ /* The throughput for shadow paths already contains the light shader evaluation. */
+ float3 contribution = INTEGRATOR_STATE(shadow_path, throughput);
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1);
+
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
+
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+
#ifdef __PASSES__
- float3 L_direct, L_indirect;
- if (L->use_light_pass) {
- path_radiance_sum_indirect(L);
-
- L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume +
- L->emission;
- L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission +
- L->indirect_volume;
-
- if (!kernel_data.background.transparent)
- L_direct += L->background;
-
- L_sum = L_direct + L_indirect;
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-
- /* Reject invalid value */
- if (!isfinite_safe(sum)) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
-# endif
- L_sum = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->emission = zero_float3();
+ if (kernel_data.film.light_pass_flag & PASS_ANY) {
+ const int path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+ /* Indirectly visible through reflection. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput; reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ }
+ }
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- }
- /* No Light Passes */
- else
-#endif
- {
- L_sum = L->emission;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
+ }
- /* Reject invalid value */
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
- if (!isfinite_safe(sum)) {
-#ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
-#endif
- L_sum = zero_float3();
+ /* Write shadow pass. */
+ if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
+ (path_flag & PATH_RAY_CAMERA)) {
+ const float3 unshadowed_throughput = INTEGRATOR_STATE(shadow_path, unshadowed_throughput);
+ const float3 shadowed_throughput = INTEGRATOR_STATE(shadow_path, throughput);
+ const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) *
+ kernel_data.film.pass_shadow_scale;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow);
}
}
+#endif
+}
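
The shadow pass written at the end of this function is simply the per-channel ratio of shadowed to unshadowed throughput, multiplied by the pass scale. A numeric sketch with made-up throughputs (the zero-divisor guard mirrors what a safe divide is assumed to do):

#include <cstdio>

int main() {
  /* Made-up per-channel throughputs for one light sample. */
  const float unshadowed[3] = {0.8f, 0.6f, 0.4f};
  const float shadowed[3] = {0.4f, 0.3f, 0.1f};
  const float pass_shadow_scale = 1.0f; /* Assumed scale. */

  float shadow[3];
  for (int c = 0; c < 3; c++) {
    /* Guard against division by zero (assumed behaviour of the safe divide). */
    shadow[c] = (unshadowed[c] != 0.0f) ? (shadowed[c] / unshadowed[c]) * pass_shadow_scale : 0.0f;
  }
  printf("shadow pass: %g %g %g\n", shadow[0], shadow[1], shadow[2]);
  return 0;
}
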
- /* Compute alpha. */
- *alpha = 1.0f - L->transparent;
+/* Write transparency to render buffer.
+ *
+ * Note that we accumulate transparency = 1 - alpha in the render buffer.
+ * Otherwise we'd have to write alpha on path termination, which happens
+ * in many places. */
+ccl_device_inline void kernel_accum_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- /* Add shadow catcher contributions. */
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent);
}
-#endif /* __SHADOW_TRICKS__ */
- return L_sum;
+ kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_PASS, transparent, buffer);
}
-ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
- PathRadiance *L,
- float3 *noisy,
- float3 *clean)
+/* Write background contribution to render buffer.
+ *
+ * Includes transparency, matching kernel_accum_transparent. */
+ccl_device_inline void kernel_accum_background(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 L,
+ const float transparent,
+ const bool is_transparent_background_ray,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __PASSES__
- kernel_assert(L->use_light_pass);
-
- *clean = L->emission + L->background;
- *noisy = L->direct_volume + L->indirect_volume;
-
-# define ADD_COMPONENT(flag, component) \
- if (kernel_data.film.denoising_flags & flag) \
- *clean += component; \
- else \
- *noisy += component;
-
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
-# undef ADD_COMPONENT
-#else
- *noisy = L->emission;
- *clean = zero_float3();
-#endif
+ float3 contribution = INTEGRATOR_STATE(path, throughput) * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- *noisy += L->shadow_background_color;
- }
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- *noisy = ensure_finite3(*noisy);
- *clean = ensure_finite3(*clean);
+ if (is_transparent_background_ray) {
+ kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
+ }
+ else {
+ kernel_accum_combined_transparent_pass(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer);
+ }
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_background);
}
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
+/* Write emission to render buffer. */
+ccl_device_inline void kernel_accum_emission(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 throughput,
+ const float3 L,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __SPLIT_KERNEL__
-# define safe_float3_add(f, v) \
- do { \
- ccl_global float *p = (ccl_global float *)(&(f)); \
- atomic_add_and_fetch_float(p + 0, (v).x); \
- atomic_add_and_fetch_float(p + 1, (v).y); \
- atomic_add_and_fetch_float(p + 2, (v).z); \
- } while (0)
-# define safe_float_add(f, v) atomic_add_and_fetch_float(&(f), (v))
-#else
-# define safe_float3_add(f, v) (f) += (v)
-# define safe_float_add(f, v) (f) += (v)
-#endif /* __SPLIT_KERNEL__ */
+ float3 contribution = throughput * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __PASSES__
- safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
- safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
- safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
- safe_float3_add(L->direct_volume, L_sample->direct_volume);
-
- safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
- safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
- safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
- safe_float3_add(L->indirect_volume, L_sample->indirect_volume);
-
- safe_float3_add(L->background, L_sample->background);
- safe_float3_add(L->ao, L_sample->ao);
- safe_float3_add(L->shadow, L_sample->shadow);
- safe_float_add(L->mist, L_sample->mist);
-#endif /* __PASSES__ */
- safe_float3_add(L->emission, L_sample->emission);
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#undef safe_float_add
-#undef safe_float3_add
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_emission);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
index 98b7bf7e7dc..2bee12f0473 100644
--- a/intern/cycles/kernel/kernel_adaptive_sampling.h
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -14,226 +14,146 @@
* limitations under the License.
*/
-#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
-#define __KERNEL_ADAPTIVE_SAMPLING_H__
+#pragma once
+
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+/* Check whether the pixel has converged and should not be sampled anymore. */
-ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample)
+ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *render_buffer)
{
- /* TODO Stefan: Is this better in linear, sRGB or something else? */
- float4 I = *((ccl_global float4 *)buffer);
- float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- /* The per pixel error as seen in section 2.1 of
- * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
- * A small epsilon is added to the divisor to prevent division by zero. */
- float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
- (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
- if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
- /* Set the fourth component to non-zero value to indicate that this pixel has converged. */
- buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return true;
}
-}
-
-/* Adjust the values of an adaptively sampled pixel. */
-
-ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_multiplier)
-{
- *(ccl_global float4 *)(buffer) *= sample_multiplier;
- /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */
- kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset;
-#ifdef __PASSES__
- int flag = kernel_data.film.pass_flag;
-
- if (flag & PASSMASK(NORMAL))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ return buffer[aux_w_offset] == 0.0f;
+}
- if (flag & PASSMASK(UV))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
- if (flag & PASSMASK(MOTION)) {
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
- *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+ const int render_pixel_index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer +
+ (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+ /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+ const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ if (!reset && A.w != 0.0f) {
+ /* If the pixel was considered converged, its state will not change in this kernel. Early
+ * out before doing any math.
+ *
+ * TODO(sergey): On a GPU it might be better to keep the thread alive for better coherency? */
+ return true;
}
- if (kernel_data.film.use_light_pass) {
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (light_flag & PASSMASK(MIST))
- *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
-
- /* Shadow pass omitted on purpose. It has its own scale parameter. */
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
-
- if (light_flag & PASSMASK(EMISSION))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
- if (light_flag & PASSMASK(BACKGROUND))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
- if (light_flag & PASSMASK(AO))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_color) *= sample_multiplier;
- }
-#endif
-
-#ifdef __DENOISING_FEATURES__
-
-# define scale_float3_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale; \
- *(buffer + offset + 3) *= scale * scale; \
- *(buffer + offset + 4) *= scale * scale; \
- *(buffer + offset + 5) *= scale * scale;
-
-# define scale_shadow_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale * scale;
-
- if (kernel_data.film.pass_denoising_data) {
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
- if (kernel_data.film.pass_denoising_clean) {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
- }
- else {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- }
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
- 1) *= sample_multiplier * sample_multiplier;
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Cryptomatte. */
- if (kernel_data.film.cryptomatte_passes) {
- int num_slots = 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
- num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
- ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
- kernel_data.film.pass_cryptomatte);
- for (int slot = 0; slot < num_slots; slot++) {
- id_buffer[slot].y *= sample_multiplier;
- }
- }
+ const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);
- /* AOVs. */
- for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) {
- *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier;
- }
- for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) {
- *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier;
- }
+ const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
+ const float inv_sample = 1.0f / sample;
+
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
+ const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
+ inv_sample;
+ const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+ /* A small epsilon is added to the divisor to prevent division by zero. */
+ const float error = error_difference / (0.0001f + error_normalize);
+ const bool did_converge = (error < threshold);
+
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ buffer[aux_w_offset] = did_converge;
+
+ return did_converge;
}
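
Since the auxiliary buffer accumulates twice the odd samples (see kernel_accum_adaptive_buffer above), A approaches the combined pass I as the pixel converges, and the normalized difference drops below the threshold. A standalone numeric sketch of the error term, with made-up pixel values:

#include <cmath>
#include <cstdio>

/* Per-pixel error from section 2.1 of "A hierarchical automatic stopping
 * condition for Monte Carlo global illumination", mirroring the check above. */
static float adaptive_error(const float I[3], const float A[3], int sample) {
  const float inv_sample = 1.0f / sample;
  const float difference = (fabsf(I[0] - A[0]) + fabsf(I[1] - A[1]) + fabsf(I[2] - A[2])) *
                           inv_sample;
  const float normalize = sqrtf((I[0] + I[1] + I[2]) * inv_sample);
  return difference / (0.0001f + normalize);
}

int main() {
  /* Accumulated combined pass and aux buffer (2x odd samples), made-up values. */
  const float I[3] = {64.0f, 32.0f, 16.0f};
  const float A[3] = {63.0f, 32.5f, 15.8f};
  const int sample = 128;
  const float threshold = 0.01f;

  const float error = adaptive_error(I, A, sample);
  printf("error = %f -> %s\n", error, error < threshold ? "converged" : "keep sampling");
  return 0;
}
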
/* This is a simple box filter in two passes.
* When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
-ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
{
- bool any = false;
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- for (int x = tile->x; x < tile->x + tile->w; ++x) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (x > tile->x && !prev) {
+ for (int x = start_x; x < start_x + width; ++x) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (x > start_x && !prev) {
index = index - 1;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
-ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- bool any = false;
- for (int y = tile->y; y < tile->y + tile->h; ++y) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (y > tile->y && !prev) {
- index = index - tile->stride;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ for (int y = start_y; y < start_y + height; ++y) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (y > start_y && !prev) {
+ index = index - stride;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
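
Both filter passes are a 1-D dilation of the "needs more samples" mask: whenever a pixel is still unconverged, its immediate neighbours along the row (and then along the column) are reopened as well. A sketch of the row pass on a plain bool mask, keeping the same prev/next logic (illustration only):

#include <cstdio>

/* Mark neighbours of not-converged pixels as not converged (one row). */
static void filter_row(bool converged[], int width) {
  bool prev = false;
  for (int x = 0; x < width; ++x) {
    if (!converged[x]) {
      /* Pixel still needs samples: also reopen the pixel to the left. */
      if (x > 0 && !prev) {
        converged[x - 1] = false;
      }
      prev = true;
    }
    else {
      /* Previous pixel needed samples: reopen this one too. */
      if (prev) {
        converged[x] = false;
      }
      prev = false;
    }
  }
}

int main() {
  bool converged[8] = {true, true, false, true, true, true, false, true};
  filter_row(converged, 8);
  for (int x = 0; x < 8; ++x) {
    printf("%d", converged[x] ? 1 : 0);
  }
  /* Expected: 10001000 -> pixels 1, 3, 5 and 7 reopened around unconverged 2 and 6. */
  printf("\n");
  return 0;
}
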
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 7da890b908d..e025bcd6674 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -14,502 +14,62 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BAKING__
-
-ccl_device_noinline void compute_light_pass(
- KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample)
-{
- kernel_assert(kernel_data.film.use_light_pass);
-
- float3 throughput = one_float3();
-
- /* Emission and indirect shader data memory used by various functions. */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- /* Init radiance. */
- path_radiance_init(kg, L);
-
- /* Init path state. */
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL);
-
- /* Evaluate surface shader. */
- shader_eval_surface(kg, sd, &state, NULL, state.flag);
-
- /* TODO: disable more closures we don't need besides transparent. */
- shader_bsdf_disable_transparency(kg, sd);
-
- /* Init ray. */
- Ray ray;
- ray.P = sd->P + sd->Ng;
- ray.D = -sd->Ng;
- ray.t = FLT_MAX;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched) {
- /* regular path tracer */
-# endif
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd));
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
- bool is_sss_sample = false;
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) {
- while (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput);
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
- }
- is_sss_sample = true;
- }
- }
-# endif
-
- /* sample light and BSDF */
- if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
- kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L);
-
- if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) {
-# ifdef __LAMP_MIS__
- state.ray_t = 0.0f;
-# endif
- /* compute indirect light */
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
-
- /* sum and reset indirect light pass variables for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- /* branched path tracer */
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput);
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- kernel_branched_path_subsurface_scatter(
- kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif
-
- /* sample light and BSDF */
- if (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT)) {
-# if defined(__EMISSION__)
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = kernel_data.integrator.sample_all_lights_direct;
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, &state, throughput, 1.0f, L, all);
- }
-# endif
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L);
- }
- }
-# endif
-}
-
-/* this helps with AA but it's not the real solution as it does not AA the geometry
- * but it's better than nothing, thus committed */
-ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
-{
- /* use mirror repeat (like opengl texture) so that if the barycentric
- * coordinate goes past the end of the triangle it is not always clamped
- * to the same value, gives ugly patterns */
- u /= max;
- float fu = floorf(u);
- u = u - fu;
-
- return ((((int)fu) & 1) ? 1.0f - u : u) * max;
-}
-
-ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderEvalType type)
-{
- switch (type) {
- case SHADER_EVAL_DIFFUSE:
- return shader_bsdf_diffuse(kg, sd);
- case SHADER_EVAL_GLOSSY:
- return shader_bsdf_glossy(kg, sd);
- case SHADER_EVAL_TRANSMISSION:
- return shader_bsdf_transmission(kg, sd);
- default:
- kernel_assert(!"Unknown bake type passed to BSDF evaluate");
- return zero_float3();
- }
-}
-
-ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- float3 direct,
- float3 indirect,
- const ShaderEvalType type,
- const int pass_filter)
-{
- float3 color;
- const bool is_color = (pass_filter & BAKE_FILTER_COLOR) != 0;
- const bool is_direct = (pass_filter & BAKE_FILTER_DIRECT) != 0;
- const bool is_indirect = (pass_filter & BAKE_FILTER_INDIRECT) != 0;
- float3 out = zero_float3();
-
- if (is_color) {
- if (is_direct || is_indirect) {
- /* Leave direct and diffuse channel colored. */
- color = one_float3();
- }
- else {
- /* surface color of the pass only */
- shader_eval_surface(kg, sd, state, NULL, 0);
- return kernel_bake_shader_bsdf(kg, sd, type);
- }
- }
- else {
- shader_eval_surface(kg, sd, state, NULL, 0);
- color = kernel_bake_shader_bsdf(kg, sd, type);
- }
-
- if (is_direct) {
- out += safe_divide_even_color(direct, color);
- }
-
- if (is_indirect) {
- out += safe_divide_even_color(indirect, color);
- }
-
- return out;
-}
-
-ccl_device void kernel_bake_evaluate(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* Setup render buffers. */
- const int index = offset + x + y * stride;
- const int pass_stride = kernel_data.film.pass_stride;
- buffer += index * pass_stride;
-
- ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
- ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
- ccl_global float *output = buffer + kernel_data.film.pass_combined;
-
- int seed = __float_as_uint(primitive[0]);
- int prim = __float_as_uint(primitive[1]);
- if (prim == -1)
- return;
-
- prim += kernel_data.bake.tri_offset;
-
- /* Random number generator. */
- uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
- int num_samples = kernel_data.integrator.aa_samples;
-
- float filter_x, filter_y;
- if (sample == 0) {
- filter_x = filter_y = 0.5f;
- }
- else {
- path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
- }
-
- /* Barycentric UV with sub-pixel offset. */
- float u = primitive[2];
- float v = primitive[3];
-
- float dudx = differential[0];
- float dudy = differential[1];
- float dvdx = differential[2];
- float dvdy = differential[3];
-
- if (sample > 0) {
- u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
- v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
- 1.0f - u);
- }
-
- /* Shader data setup. */
- int object = kernel_data.bake.object_index;
- int shader;
- float3 P, Ng;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- ShaderData sd;
- shader_setup_from_sample(
- kg,
- &sd,
- P,
- Ng,
- Ng,
- shader,
- object,
- prim,
- u,
- v,
- 1.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
- sd.I = sd.N;
-
- /* Setup differentials. */
- sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx;
- sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy;
- sd.du.dx = dudx;
- sd.du.dy = dudy;
- sd.dv.dx = dvdx;
- sd.dv.dy = dvdy;
-
- /* Set RNG state for shaders that use sampling. */
- PathState state = {0};
- state.rng_hash = rng_hash;
- state.rng_offset = 0;
- state.sample = sample;
- state.num_samples = num_samples;
- state.min_ray_pdf = FLT_MAX;
-
- /* Light passes if we need more than color. */
- PathRadiance L;
- int pass_filter = kernel_data.bake.pass_filter;
-
- if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR)
- compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
-
- float3 out = zero_float3();
-
- ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type;
- switch (type) {
- /* data passes */
- case SHADER_EVAL_NORMAL:
- case SHADER_EVAL_ROUGHNESS:
- case SHADER_EVAL_EMISSION: {
- if (type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) {
- int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0;
- shader_eval_surface(kg, &sd, &state, NULL, path_flag);
- }
-
- if (type == SHADER_EVAL_NORMAL) {
- float3 N = sd.N;
- if (sd.flag & SD_HAS_BUMP) {
- N = shader_bsdf_average_normal(kg, &sd);
- }
+#pragma once
- /* encoding: normal = (2 * color) - 1 */
- out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
- }
- else if (type == SHADER_EVAL_ROUGHNESS) {
- float roughness = shader_bsdf_average_roughness(&sd);
- out = make_float3(roughness, roughness, roughness);
- }
- else {
- out = shader_emissive_eval(&sd);
- }
- break;
- }
- case SHADER_EVAL_UV: {
- out = primitive_uv(kg, &sd);
- break;
- }
-# ifdef __PASSES__
- /* light passes */
- case SHADER_EVAL_AO: {
- out = L.ao;
- break;
- }
- case SHADER_EVAL_COMBINED: {
- if ((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
- float alpha;
- out = path_radiance_clamp_and_sum(kg, &L, &alpha);
- break;
- }
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
- if ((pass_filter & BAKE_FILTER_DIFFUSE_DIRECT) == BAKE_FILTER_DIFFUSE_DIRECT)
- out += L.direct_diffuse;
- if ((pass_filter & BAKE_FILTER_DIFFUSE_INDIRECT) == BAKE_FILTER_DIFFUSE_INDIRECT)
- out += L.indirect_diffuse;
+#include "kernel/geom/geom.h"
- if ((pass_filter & BAKE_FILTER_GLOSSY_DIRECT) == BAKE_FILTER_GLOSSY_DIRECT)
- out += L.direct_glossy;
- if ((pass_filter & BAKE_FILTER_GLOSSY_INDIRECT) == BAKE_FILTER_GLOSSY_INDIRECT)
- out += L.indirect_glossy;
-
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_DIRECT) == BAKE_FILTER_TRANSMISSION_DIRECT)
- out += L.direct_transmission;
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT)
- out += L.indirect_transmission;
-
- if ((pass_filter & BAKE_FILTER_EMISSION) != 0)
- out += L.emission;
-
- break;
- }
- case SHADER_EVAL_SHADOW: {
- out = L.shadow;
- break;
- }
- case SHADER_EVAL_DIFFUSE: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_diffuse, L.indirect_diffuse, type, pass_filter);
- break;
- }
- case SHADER_EVAL_GLOSSY: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_glossy, L.indirect_glossy, type, pass_filter);
- break;
- }
- case SHADER_EVAL_TRANSMISSION: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter);
- break;
- }
-# endif
-
- /* extra */
- case SHADER_EVAL_ENVIRONMENT: {
- /* setup ray */
- Ray ray;
-
- ray.P = zero_float3();
- ray.D = normalize(P);
- ray.t = 0.0f;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-# endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
-
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- out = shader_background_eval(&sd);
- break;
- }
- default: {
- /* no real shader, returning the position of the verts for debugging */
- out = normalize(P);
- break;
- }
- }
-
- /* write output */
- const float4 result = make_float4(out.x, out.y, out.z, 1.0f);
- kernel_write_pass_float4(output, result);
-}
-
-#endif /* __BAKING__ */
+CCL_NAMESPACE_BEGIN
-ccl_device void kernel_displace_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_displace_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
+ /* Setup shader data. */
+ const KernelShaderEvalInput in = input[offset];
- /* setup shader data */
- int object = in.x;
- int prim = in.y;
- float u = __uint_as_float(in.z);
- float v = __uint_as_float(in.w);
-
- shader_setup_from_displace(kg, &sd, object, prim, u, v);
+ ShaderData sd;
+ shader_setup_from_displace(kg, &sd, in.object, in.prim, in.u, in.v);
- /* evaluate */
- float3 P = sd.P;
- shader_eval_displacement(kg, &sd, &state);
+ /* Evaluate displacement shader. */
+ const float3 P = sd.P;
+ shader_eval_displacement(INTEGRATOR_STATE_PASS_NULL, &sd);
float3 D = sd.P - P;
object_inverse_dir_transform(kg, &sd, &D);
- /* write output */
- output[i] += make_float4(D.x, D.y, D.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
}
-ccl_device void kernel_background_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_background_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
-
- /* setup ray */
- Ray ray;
- float u = __uint_as_float(in.x);
- float v = __uint_as_float(in.y);
-
- ray.P = zero_float3();
- ray.D = equirectangular_to_direction(u, v);
- ray.t = 0.0f;
-#ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-#endif
+ /* Setup ray */
+ const KernelShaderEvalInput in = input[offset];
+ const float3 ray_P = zero_float3();
+ const float3 ray_D = equirectangular_to_direction(in.u, in.v);
+ const float ray_time = 0.5f;
-#ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-#endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
+ /* Setup shader data. */
+ ShaderData sd;
+ shader_setup_from_background(kg, &sd, ray_P, ray_D, ray_time);
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- float3 color = shader_background_eval(&sd);
+ /* Evaluate shader.
+ * This is being evaluated for all BSDFs, so path flag does not contain a specific type. */
+ const int path_flag = PATH_RAY_EMISSION;
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag);
+ const float3 color = shader_background_eval(&sd);
- /* write output */
- output[i] += make_float4(color.x, color.y, color.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
}
CCL_NAMESPACE_END
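
As a rough aside on the background evaluation above: equirectangular_to_direction() turns the (u, v) input into a world-space direction on the sphere. The sketch below is a generic latitude/longitude mapping written as standalone C++; the helper name, axis convention, and angle ranges are assumptions for illustration only, not the exact convention used by the Cycles kernel.

#include <cmath>
#include <cstdio>

/* Generic equirectangular (lat-long) mapping: u covers the full azimuth,
 * v covers the polar angle. Illustrative only. */
static void equirect_to_direction(float u, float v, float dir[3])
{
  const float phi = 2.0f * 3.14159265f * (u - 0.5f); /* azimuth */
  const float theta = 3.14159265f * v;               /* polar angle */
  dir[0] = std::sin(theta) * std::cos(phi);
  dir[1] = std::sin(theta) * std::sin(phi);
  dir[2] = std::cos(theta);
}

int main()
{
  float d[3];
  equirect_to_direction(0.5f, 0.5f, d); /* center of the map */
  std::printf("direction: %f %f %f\n", d[0], d[1], d[2]);
  return 0;
}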
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 1bfac37158d..7be5da8fe6d 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -14,6 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_differential.h"
+#include "kernel_lookup_table.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Perspective Camera */
@@ -39,7 +46,7 @@ ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u
return bokeh;
}
-ccl_device void camera_sample_perspective(KernelGlobals *kg,
+ccl_device void camera_sample_perspective(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -113,10 +120,14 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
float3 Dcenter = transform_direction(&cameratoworld, Pcamera);
-
- ray->dP = differential3_zero();
- ray->dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - normalize(Dcenter);
- ray->dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - normalize(Dcenter);
+ float3 Dcenter_normalized = normalize(Dcenter);
+
+ /* TODO: can this be optimized to give compact differentials directly? */
+ ray->dP = differential_zero_compact();
+ differential3 dD;
+ dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - Dcenter_normalized;
+ dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - Dcenter_normalized;
+ ray->dD = differential_make_compact(dD);
#endif
}
else {
@@ -143,8 +154,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dx = normalize(transform_direction(&cameratoworld, Dx));
spherical_stereo_transform(&kernel_data.cam, &Px, &Dx);
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = Pnostereo;
float3 Dy = transform_perspective(&rastertocamera,
@@ -152,8 +165,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dy = normalize(transform_direction(&cameratoworld, Dy));
spherical_stereo_transform(&kernel_data.cam, &Py, &Dy);
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
}
@@ -162,8 +177,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
float z_inv = 1.0f / normalize(Pcamera).z;
float nearclip = kernel_data.cam.nearclip * z_inv;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = kernel_data.cam.cliplength * z_inv;
#else
ray->t = FLT_MAX;
@@ -171,7 +185,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
}
/* Orthographic Camera */
-ccl_device void camera_sample_orthographic(KernelGlobals *kg,
+ccl_device void camera_sample_orthographic(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -220,10 +234,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
/* ray differential */
- ray->dP.dx = float4_to_float3(kernel_data.cam.dx);
- ray->dP.dy = float4_to_float3(kernel_data.cam.dy);
+ differential3 dP;
+ dP.dx = float4_to_float3(kernel_data.cam.dx);
+  dP.dy = float4_to_float3(kernel_data.cam.dy);
- ray->dD = differential3_zero();
+ ray->dP = differential_make_compact(dP);
+ ray->dD = differential_zero_compact();
#endif
#ifdef __CAMERA_CLIPPING__
@@ -323,8 +339,9 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Px, &Dx);
}
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
float3 Dy = panorama_to_direction(cam, Py.x, Py.y);
@@ -334,16 +351,17 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Py, &Dy);
}
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
#ifdef __CAMERA_CLIPPING__
/* clipping */
float nearclip = cam->nearclip;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = cam->cliplength;
#else
ray->t = FLT_MAX;
@@ -352,7 +370,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
/* Common */
-ccl_device_inline void camera_sample(KernelGlobals *kg,
+ccl_device_inline void camera_sample(const KernelGlobals *ccl_restrict kg,
int x,
int y,
float filter_u,
@@ -426,13 +444,13 @@ ccl_device_inline void camera_sample(KernelGlobals *kg,
/* Utilities */
-ccl_device_inline float3 camera_position(KernelGlobals *kg)
+ccl_device_inline float3 camera_position(const KernelGlobals *kg)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
return make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
}
-ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_distance(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
@@ -446,7 +464,7 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_z_depth(const KernelGlobals *kg, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
Transform worldtocamera = kernel_data.cam.worldtocamera;
@@ -459,7 +477,7 @@ ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P)
+ccl_device_inline float3 camera_direction_from_point(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
@@ -473,7 +491,7 @@ ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P
}
}
-ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
+ccl_device_inline float3 camera_world_to_ndc(const KernelGlobals *kg, ShaderData *sd, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
/* perspective / ortho */
diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h
index 5eb1bdad02e..960774e0741 100644
--- a/intern/cycles/kernel/kernel_color.h
+++ b/intern/cycles/kernel/kernel_color.h
@@ -14,25 +14,22 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COLOR_H__
-#define __KERNEL_COLOR_H__
+#pragma once
#include "util/util_color.h"
CCL_NAMESPACE_BEGIN
-ccl_device float3 xyz_to_rgb(KernelGlobals *kg, float3 xyz)
+ccl_device float3 xyz_to_rgb(const KernelGlobals *kg, float3 xyz)
{
return make_float3(dot(float4_to_float3(kernel_data.film.xyz_to_r), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_g), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_b), xyz));
}
-ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c)
+ccl_device float linear_rgb_to_gray(const KernelGlobals *kg, float3 c)
{
return dot(c, float4_to_float3(kernel_data.film.rgb_to_y));
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COLOR_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
deleted file mode 100644
index 4a9304a134c..00000000000
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_COMPAT_OPENCL_H__
-#define __KERNEL_COMPAT_OPENCL_H__
-
-#define __KERNEL_GPU__
-#define __KERNEL_OPENCL__
-
-/* no namespaces in opencl */
-#define CCL_NAMESPACE_BEGIN
-#define CCL_NAMESPACE_END
-
-#ifdef __CL_NOINLINE__
-# define ccl_noinline __attribute__((noinline))
-#else
-# define ccl_noinline
-#endif
-
-/* in opencl all functions are device functions, so leave this empty */
-#define ccl_device
-#define ccl_device_inline ccl_device
-#define ccl_device_forceinline ccl_device
-#define ccl_device_noinline ccl_device ccl_noinline
-#define ccl_device_noinline_cpu ccl_device
-#define ccl_may_alias
-#define ccl_static_constant static __constant
-#define ccl_constant __constant
-#define ccl_global __global
-#define ccl_local __local
-#define ccl_local_param __local
-#define ccl_private __private
-#define ccl_restrict restrict
-#define ccl_ref
-#define ccl_align(n) __attribute__((aligned(n)))
-#define ccl_optional_struct_init
-
-#if __OPENCL_VERSION__ >= 200 && !defined(__NV_CL_C_VERSION)
-# define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1)))
-#else
-# define ccl_loop_no_unroll
-#endif
-
-#ifdef __SPLIT_KERNEL__
-# define ccl_addr_space __global
-#else
-# define ccl_addr_space
-#endif
-
-#define ATTR_FALLTHROUGH
-
-#define ccl_local_id(d) get_local_id(d)
-#define ccl_global_id(d) get_global_id(d)
-
-#define ccl_local_size(d) get_local_size(d)
-#define ccl_global_size(d) get_global_size(d)
-
-#define ccl_group_id(d) get_group_id(d)
-#define ccl_num_groups(d) get_num_groups(d)
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
-/* no assert in opencl */
-#define kernel_assert(cond)
-
-/* make_type definitions with opencl style element initializers */
-#ifdef make_float2
-# undef make_float2
-#endif
-#ifdef make_float3
-# undef make_float3
-#endif
-#ifdef make_float4
-# undef make_float4
-#endif
-#ifdef make_int2
-# undef make_int2
-#endif
-#ifdef make_int3
-# undef make_int3
-#endif
-#ifdef make_int4
-# undef make_int4
-#endif
-#ifdef make_uchar4
-# undef make_uchar4
-#endif
-
-#define make_float2(x, y) ((float2)(x, y))
-#define make_float3(x, y, z) ((float3)(x, y, z))
-#define make_float4(x, y, z, w) ((float4)(x, y, z, w))
-#define make_int2(x, y) ((int2)(x, y))
-#define make_int3(x, y, z) ((int3)(x, y, z))
-#define make_int4(x, y, z, w) ((int4)(x, y, z, w))
-#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w))
-
-/* math functions */
-#define __uint_as_float(x) as_float(x)
-#define __float_as_uint(x) as_uint(x)
-#define __int_as_float(x) as_float(x)
-#define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)(x)), ((float)(y)))
-#define fabsf(x) fabs(((float)(x)))
-#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
-#define asinf(x) asin(((float)(x)))
-#define acosf(x) acos(((float)(x)))
-#define atanf(x) atan(((float)(x)))
-#define floorf(x) floor(((float)(x)))
-#define ceilf(x) ceil(((float)(x)))
-#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
-#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
-#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
-#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
-#define fmodf(x, y) fmod((float)(x), (float)(y))
-#define sinhf(x) sinh(((float)(x)))
-#define coshf(x) cosh(((float)(x)))
-#define tanhf(x) tanh(((float)(x)))
-
-/* Use native functions with possibly lower precision for performance,
- * no issues found so far. */
-#if 1
-# define sinf(x) native_sin(((float)(x)))
-# define cosf(x) native_cos(((float)(x)))
-# define tanf(x) native_tan(((float)(x)))
-# define expf(x) native_exp(((float)(x)))
-# define sqrtf(x) native_sqrt(((float)(x)))
-# define logf(x) native_log(((float)(x)))
-# define rcp(x) native_recip(x)
-#else
-# define sinf(x) sin(((float)(x)))
-# define cosf(x) cos(((float)(x)))
-# define tanf(x) tan(((float)(x)))
-# define expf(x) exp(((float)(x)))
-# define sqrtf(x) sqrt(((float)(x)))
-# define logf(x) log(((float)(x)))
-# define rcp(x) recip(x)
-#endif
-
-/* data lookup defines */
-#define kernel_data (*kg->data)
-#define kernel_tex_array(tex) \
- ((const ccl_global tex##_t *)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))
-#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)]
-
-/* define NULL */
-#ifndef NULL
-# define NULL ((void *)0)
-#endif
-
-/* enable extensions */
-#ifdef __KERNEL_CL_KHR_FP16__
-# pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif
-
-#include "util/util_half.h"
-#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPENCL_H__ */
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index 3ec0cdbaccc..db4e110bd10 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -14,26 +14,28 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* See "Tracing Ray Differentials", Homan Igehy, 1999. */
-ccl_device void differential_transfer(ccl_addr_space differential3 *dP_,
- const differential3 dP,
- float3 D,
- const differential3 dD,
- float3 Ng,
- float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *surface_dP,
+ const differential3 ray_dP,
+ float3 ray_D,
+ const differential3 ray_dD,
+ float3 surface_Ng,
+ float ray_t)
{
/* ray differential transfer through homogeneous medium, to
* compute dPdx/dy at a shading point from the incoming ray */
- float3 tmp = D / dot(D, Ng);
- float3 tmpx = dP.dx + t * dD.dx;
- float3 tmpy = dP.dy + t * dD.dy;
+ float3 tmp = ray_D / dot(ray_D, surface_Ng);
+ float3 tmpx = ray_dP.dx + ray_t * ray_dD.dx;
+ float3 tmpy = ray_dP.dy + ray_t * ray_dD.dy;
- dP_->dx = tmpx - dot(tmpx, Ng) * tmp;
- dP_->dy = tmpy - dot(tmpy, Ng) * tmp;
+ surface_dP->dx = tmpx - dot(tmpx, surface_Ng) * tmp;
+ surface_dP->dy = tmpy - dot(tmpy, surface_Ng) * tmp;
}
ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
@@ -112,4 +114,53 @@ ccl_device differential3 differential3_zero()
return d;
}
+/* Compact ray differentials that are just a scale, to reduce memory usage and
+ * access cost on the GPU.
+ *
+ * See above for the more accurate reference implementations.
+ *
+ * TODO: also store the more compact version in ShaderData and recompute where
+ * needed? */
+
+ccl_device_forceinline float differential_zero_compact()
+{
+ return 0.0f;
+}
+
+ccl_device_forceinline float differential_make_compact(const differential3 D)
+{
+ return 0.5f * (len(D.dx) + len(D.dy));
+}
+
+ccl_device_forceinline void differential_transfer_compact(ccl_addr_space differential3 *surface_dP,
+ const float ray_dP,
+ const float3 /* ray_D */,
+ const float ray_dD,
+ const float3 surface_Ng,
+ const float ray_t)
+{
+ /* ray differential transfer through homogeneous medium, to
+ * compute dPdx/dy at a shading point from the incoming ray */
+ float scale = ray_dP + ray_t * ray_dD;
+
+ float3 dx, dy;
+ make_orthonormals(surface_Ng, &dx, &dy);
+ surface_dP->dx = dx * scale;
+ surface_dP->dy = dy * scale;
+}
+
+ccl_device_forceinline void differential_incoming_compact(ccl_addr_space differential3 *dI,
+ const float3 D,
+ const float dD)
+{
+ /* compute dIdx/dy at a shading point, we just need to negate the
+ * differential of the ray direction */
+
+ float3 dx, dy;
+ make_orthonormals(D, &dx, &dy);
+
+ dI->dx = dD * dx;
+ dI->dy = dD * dy;
+}
+
CCL_NAMESPACE_END
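
To make the compact ray differentials above concrete, here is a minimal standalone sketch (plain C++ with a toy float3 type; not the kernel code itself) showing that the compact form keeps only the average footprint radius and deliberately drops the anisotropy of the full differential:

#include <cmath>
#include <cstdio>

struct float3 {
  float x, y, z;
};

static float len(const float3 v)
{
  return std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
}

struct differential3 {
  float3 dx, dy;
};

/* Mirrors differential_make_compact(): average the two footprint radii. */
static float make_compact(const differential3 d)
{
  return 0.5f * (len(d.dx) + len(d.dy));
}

int main()
{
  const differential3 d = {{0.01f, 0.0f, 0.0f}, {0.0f, 0.02f, 0.0f}};
  /* The anisotropy (0.01 vs 0.02) is lost; only the average 0.015 survives. */
  std::printf("compact scale = %f\n", make_compact(d));
  return 0;
}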
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index aebf2ec8e28..d62285d173d 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -14,40 +14,36 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
CCL_NAMESPACE_BEGIN
-/* Direction Emission */
-ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- float3 I,
- differential3 dI,
- float t,
- float time)
+/* Evaluate shader on light. */
+ccl_device_noinline_cpu float3 light_sample_shader_eval(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict emission_sd,
+ LightSample *ccl_restrict ls,
+ float time)
{
/* setup shading at emitter */
float3 eval = zero_float3();
if (shader_constant_emission_eval(kg, ls->shader, &eval)) {
- if ((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
+ if ((ls->prim != PRIM_NONE) && dot(ls->Ng, ls->D) > 0.0f) {
ls->Ng = -ls->Ng;
}
}
else {
/* Setup shader data and call shader_eval_surface once, better
* for GPU coherence and compile times. */
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
#ifdef __BACKGROUND_MIS__
if (ls->type == LIGHT_BACKGROUND) {
- Ray ray;
- ray.D = ls->D;
- ray.P = ls->P;
- ray.t = 1.0f;
- ray.time = time;
- ray.dP = differential3_zero();
- ray.dD = dI;
-
- shader_setup_from_background(kg, emission_sd, &ray);
+ shader_setup_from_background(kg, emission_sd, ls->P, ls->D, time);
}
else
#endif
@@ -56,13 +52,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
emission_sd,
ls->P,
ls->Ng,
- I,
+ -ls->D,
ls->shader,
ls->object,
ls->prim,
ls->u,
ls->v,
- t,
+ ls->t,
time,
false,
ls->lamp);
@@ -70,11 +66,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
ls->Ng = emission_sd->Ng;
}
+ PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+ PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+
/* No proper path flag, we're evaluating this for all closures. that's
* weak but we'd have to do multiple evaluations otherwise. */
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, NULL, PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS, emission_sd, NULL, PATH_RAY_EMISSION);
/* Evaluate closures. */
#ifdef __BACKGROUND_MIS__
@@ -98,85 +96,129 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
return eval;
}
-ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- Ray *ray,
- BsdfEval *eval,
- bool *is_lamp,
- float rand_terminate)
+/* Test if light sample is from a light or emission from geometry. */
+ccl_device_inline bool light_sample_is_light(const LightSample *ccl_restrict ls)
{
- if (ls->pdf == 0.0f)
- return false;
-
- /* todo: implement */
- differential3 dD = differential3_zero();
+  /* True if this is a lamp; used for the shadow pass. */
+ return (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
+}
- /* evaluate closure */
+/* Early path termination of shadow rays. */
+ccl_device_inline bool light_sample_terminate(const KernelGlobals *ccl_restrict kg,
+ const LightSample *ccl_restrict ls,
+ BsdfEval *ccl_restrict eval,
+ const float rand_terminate)
+{
+ if (bsdf_eval_is_zero(eval)) {
+ return true;
+ }
- float3 light_eval = direct_emissive_eval(
- kg, emission_sd, ls, state, -ls->D, dD, ls->t, sd->time);
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ float probability = max3(fabs(bsdf_eval_sum(eval))) *
+ kernel_data.integrator.light_inv_rr_threshold;
+ if (probability < 1.0f) {
+ if (rand_terminate >= probability) {
+ return true;
+ }
+ bsdf_eval_mul(eval, 1.0f / probability);
+ }
+ }
- if (is_zero(light_eval))
- return false;
+ return false;
+}
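
The light_inv_rr_threshold logic above is standard Russian roulette: a dim contribution is dropped with probability 1 - p and survivors are scaled by 1 / p, which keeps the estimator unbiased. A small standalone check of that property follows; the threshold and contribution values here are made up for illustration.

#include <cstdio>
#include <random>

int main()
{
  std::mt19937 rng(42);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);

  const float contribution = 0.05f;     /* dim light contribution */
  const float inv_rr_threshold = 10.0f; /* hypothetical threshold */
  const float probability = contribution * inv_rr_threshold; /* 0.5 */

  double plain = 0.0, roulette = 0.0;
  const int n = 1000000;
  for (int i = 0; i < n; i++) {
    plain += contribution;
    /* Terminate with probability 1 - p, otherwise compensate by 1 / p. */
    if (uniform(rng) < probability) {
      roulette += contribution / probability;
    }
  }
  std::printf("mean without RR: %f, with RR: %f\n", plain / n, roulette / n);
  return 0;
}

Both means come out near 0.05, which is why the kernel can terminate most of these cheap shadow rays without changing the rendered result on average.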
- /* evaluate BSDF at shading point */
+/* This function should be used to compute a modified ray start position for
+ * rays leaving from a surface. The algorithm slightly distorts the flat
+ * surface of a triangle: the surface is lifted by an amount h along the
+ * normal n at the incident point. */
-#ifdef __VOLUME__
- if (sd->prim != PRIM_NONE)
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
+ccl_device_inline float3 shadow_ray_smooth_surface_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 Ng)
+{
+ float3 V[3], N[3];
+ triangle_vertices_and_normals(kg, sd->prim, V, N);
+
+ const float u = sd->u, v = sd->v;
+ const float w = 1 - u - v;
+ float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
+ float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
+
+ object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
+
+ /* Parabolic approximation */
+ float a = dot(N[2] - N[0], V[0] - V[2]);
+ float b = dot(N[2] - N[1], V[1] - V[2]);
+ float c = dot(N[1] - N[0], V[1] - V[0]);
+ float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
+
+ /* Check flipped normals */
+ if (dot(n, Ng) > 0) {
+ /* Local linear envelope */
+ float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
+ float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
+ float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
+ h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
+ h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
+ h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
+ h = max(min(min(h0, h1), h2), h * 0.5f);
+ }
else {
- float bsdf_pdf;
- shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
- if (ls->shader & SHADER_USE_MIS) {
- /* Multiple importance sampling. */
- float mis_weight = power_heuristic(ls->pdf, bsdf_pdf);
- light_eval *= mis_weight;
- }
+ float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
+ float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
+ float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
+ h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
+ h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
+ h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
+ h = min(-min(min(h0, h1), h2), h * 0.5f);
}
-#else
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
-#endif
- bsdf_eval_mul3(eval, light_eval / ls->pdf);
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls->shader & SHADER_EXCLUDE_ANY) {
- if (ls->shader & SHADER_EXCLUDE_DIFFUSE)
- eval->diffuse = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_GLOSSY)
- eval->glossy = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_TRANSMIT)
- eval->transmission = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_SCATTER)
- eval->volume = zero_float3();
- }
-#endif
+ return n * h;
+}
- if (bsdf_eval_is_zero(eval))
- return false;
+/* Ray offset to avoid shadow terminator artifact. */
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f
-#ifdef __SHADOW_TRICKS__
- && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
-#endif
- ) {
- float probability = max3(fabs(bsdf_eval_sum(eval))) *
- kernel_data.integrator.light_inv_rr_threshold;
- if (probability < 1.0f) {
- if (rand_terminate >= probability) {
- return false;
+ccl_device_inline float3 shadow_ray_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 L)
+{
+ float NL = dot(sd->N, L);
+ bool transmit = (NL < 0.0f);
+ float3 Ng = (transmit ? -sd->Ng : sd->Ng);
+ float3 P = ray_offset(sd->P, Ng);
+
+ if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
+ const float offset_cutoff =
+ kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
+    /* Do the ray offset (heavy stuff) only for triangles that are close to
+     * being terminated: offset_cutoff = 0.1f means that 10-20% of rays will
+     * be affected. Also make a smooth transition near the threshold. */
+ if (offset_cutoff > 0.0f) {
+ float NgL = dot(Ng, L);
+ float offset_amount = 0.0f;
+ if (NL < offset_cutoff) {
+ offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
+ }
+ else {
+ offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
+ }
+ if (offset_amount > 0.0f) {
+ P += shadow_ray_smooth_surface_offset(kg, sd, Ng) * offset_amount;
}
- bsdf_eval_mul(eval, 1.0f / probability);
}
}
+ return P;
+}
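
To illustrate the offset_amount ramp used above: with the normals aligned and a hypothetical offset_cutoff of 0.1, the smooth-surface offset is applied in full for grazing rays and fades out as N.L approaches the cutoff. A standalone sketch of just that blend (not the kernel code; it assumes the smooth and geometric normals agree):

#include <algorithm>
#include <cstdio>

/* Mirrors the offset_amount selection in shadow_ray_offset() for the
 * reflective case, with a hypothetical cutoff of 0.1. */
static float offset_amount(float NL, float NgL, float cutoff)
{
  if (NL < cutoff) {
    return std::clamp(2.0f - (NgL + NL) / cutoff, 0.0f, 1.0f);
  }
  return std::clamp(1.0f - NgL / cutoff, 0.0f, 1.0f);
}

int main()
{
  const float cutoff = 0.1f;
  const float values[] = {0.01f, 0.06f, 0.08f, 0.12f};
  for (float NL : values) {
    std::printf("N.L = %.2f -> offset_amount = %.2f\n", NL, offset_amount(NL, NL, cutoff));
  }
  return 0;
}

The output goes 1.00, 0.80, 0.40, 0.00, which is the smooth transition the comment describes.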
+
+ccl_device_inline void shadow_ray_setup(const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
+{
if (ls->shader & SHADER_CAST_SHADOW) {
/* setup ray */
- ray->P = ray_offset_shadow(kg, sd, ls->D);
+ ray->P = P;
if (ls->t == FLT_MAX) {
/* distant light */
@@ -185,160 +227,40 @@ ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
}
else {
/* other lights, avoid self-intersection */
- ray->D = ray_offset(ls->P, ls->Ng) - ray->P;
+ ray->D = ray_offset(ls->P, ls->Ng) - P;
ray->D = normalize_len(ray->D, &ray->t);
}
-
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
}
else {
/* signal to not cast shadow ray */
+ ray->P = zero_float3();
+ ray->D = zero_float3();
ray->t = 0.0f;
}
- /* return if it's a lamp for shadow pass */
- *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
-
- return true;
+ ray->dP = differential_make_compact(sd->dP);
+ ray->dD = differential_zero_compact();
+ ray->time = sd->time;
}
-/* Indirect Primitive Emission */
-
-ccl_device_noinline_cpu float3 indirect_primitive_emission(
- KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_surface_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ Ray *ray)
{
- /* evaluate emissive closure */
- float3 L = shader_emissive_eval(sd);
-
-#ifdef __HAIR__
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
- (sd->type & PRIMITIVE_ALL_TRIANGLE))
-#else
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
-#endif
- {
- /* multiple importance sampling, get triangle light pdf,
- * and compute weight with respect to BSDF pdf */
- float pdf = triangle_light_pdf(kg, sd, t);
- float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
- return L * mis_weight;
- }
-
- return L;
+ const float3 P = shadow_ray_offset(kg, sd, ls->D);
+ shadow_ray_setup(sd, ls, P, ray);
}
-/* Indirect Lamp Emission */
-
-ccl_device_noinline_cpu void indirect_lamp_emission(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- Ray *ray,
- float3 throughput)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_volume_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
{
- for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
- LightSample ls ccl_optional_struct_init;
-
- if (!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls))
- continue;
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls.shader & SHADER_EXCLUDE_ANY) {
- if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- continue;
- }
-#endif
-
- float3 lamp_L = direct_emissive_eval(
- kg, emission_sd, &ls, state, -ray->D, ray->dD, ls.t, ray->time);
-
-#ifdef __VOLUME__
- if (state->volume_stack[0].shader != SHADER_NONE) {
- /* shadow attenuation */
- Ray volume_ray = *ray;
- volume_ray.t = ls.t;
- float3 volume_tp = one_float3();
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp);
- lamp_L *= volume_tp;
- }
-#endif
-
- if (!(state->flag & PATH_RAY_MIS_SKIP)) {
- /* multiple importance sampling, get regular light pdf,
- * and compute weight with respect to BSDF pdf */
- float mis_weight = power_heuristic(state->ray_pdf, ls.pdf);
- lamp_L *= mis_weight;
- }
-
- path_radiance_accum_emission(kg, L, state, throughput, lamp_L);
- }
-}
-
-/* Indirect Background */
-
-ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ccl_addr_space Ray *ray)
-{
-#ifdef __BACKGROUND__
- int shader = kernel_data.background.surface_shader;
-
- /* Use visibility flag to skip lights. */
- if (shader & SHADER_EXCLUDE_ANY) {
- if (((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
- ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- return zero_float3();
- }
-
- /* Evaluate background shader. */
- float3 L = zero_float3();
- if (!shader_constant_emission_eval(kg, shader, &L)) {
-# ifdef __SPLIT_KERNEL__
- Ray priv_ray = *ray;
- shader_setup_from_background(kg, emission_sd, &priv_ray);
-# else
- shader_setup_from_background(kg, emission_sd, ray);
-# endif
-
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, buffer, state->flag | PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
-
- L = shader_background_eval(emission_sd);
- }
-
- /* Background MIS weights. */
-# ifdef __BACKGROUND_MIS__
- /* Check if background light exists or if we should skip pdf. */
- if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
- /* multiple importance sampling, get background light pdf for ray
- * direction, and compute weight with respect to BSDF pdf */
- float pdf = background_light_pdf(kg, ray->P, ray->D);
- float mis_weight = power_heuristic(state->ray_pdf, pdf);
-
- return L * mis_weight;
- }
-# endif
-
- return L;
-#else
- return make_float3(0.8f, 0.8f, 0.8f);
-#endif
+ shadow_ray_setup(sd, ls, P, ray);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index a6fd4f1dc7e..fa93f4830d1 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -14,119 +14,516 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
-ccl_device float4 film_get_pass_result(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_scale,
- int index,
- bool use_display_sample_scale)
-{
- float4 pass_result;
-
- int display_pass_stride = kernel_data.film.display_pass_stride;
- int display_pass_components = kernel_data.film.display_pass_components;
-
- if (display_pass_components == 4) {
- float4 in = *(ccl_global float4 *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- float alpha = use_display_sample_scale ?
- (kernel_data.film.use_display_pass_alpha ? in.w : 1.0f / sample_scale) :
- 1.0f;
-
- pass_result = make_float4(in.x, in.y, in.z, alpha);
-
- int display_divide_pass_stride = kernel_data.film.display_divide_pass_stride;
- if (display_divide_pass_stride != -1) {
- ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride +
- index * kernel_data.film.pass_stride);
- float3 divided = safe_divide_even_color(float4_to_float3(pass_result),
- float4_to_float3(*divide_in));
- pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w);
- }
+/* --------------------------------------------------------------------
+ * Common utilities.
+ */
- if (kernel_data.film.use_display_exposure) {
- float exposure = kernel_data.film.exposure;
- pass_result *= make_float4(exposure, exposure, exposure, 1.0f);
- }
+/* The input buffer contains transparency = 1 - alpha, this converts it to
+ * alpha. Also clamp since alpha might end up outside of 0..1 due to Russian
+ * roulette. */
+ccl_device_forceinline float film_transparency_to_alpha(float transparency)
+{
+ return saturate(1.0f - transparency);
+}
+
+ccl_device_inline float film_get_scale(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ return 1.0f / sample_count;
+ }
+
+ return 1.0f;
+}
+
+ccl_device_inline float film_get_scale_exposure(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale_exposure;
+ }
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ if (kfilm_convert->pass_use_exposure) {
+ return scale * kfilm_convert->exposure;
+ }
+
+ return scale;
+}
+
+ccl_device_inline bool film_get_scale_and_scale_exposure(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict scale,
+ float *ccl_restrict scale_exposure)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ *scale = kfilm_convert->scale;
+ *scale_exposure = kfilm_convert->scale_exposure;
+ return true;
+ }
+
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ if (!sample_count) {
+ *scale = 0.0f;
+ *scale_exposure = 0.0f;
+ return false;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ *scale = 1.0f / sample_count;
}
- else if (display_pass_components == 1) {
- ccl_global float *in = (ccl_global float *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- pass_result = make_float4(*in, *in, *in, 1.0f / sample_scale);
+ else {
+ *scale = 1.0f;
+ }
+
+ if (kfilm_convert->pass_use_exposure) {
+ *scale_exposure = *scale * kfilm_convert->exposure;
+ }
+ else {
+ *scale_exposure = *scale;
+ }
+
+ return true;
+}
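
A small standalone sketch of the scale selection implemented by film_get_scale_and_scale_exposure() above (the struct and field names here are stand-ins, not the Cycles API): when a per-pixel sample-count pass is present, filtered passes are divided by that pixel's actual sample count, otherwise the precomputed global scale is used, and exposure is only folded in for passes that request it.

#include <cstdio>

/* Toy stand-in for KernelFilmConvert; field names are illustrative only. */
struct FilmConvert {
  bool has_sample_count_pass;
  bool pass_use_filter;
  bool pass_use_exposure;
  float scale; /* precomputed 1 / num_samples */
  float exposure;
};

static void get_scales(const FilmConvert &fc,
                       unsigned pixel_samples, /* assumed > 0; the kernel handles 0 separately */
                       float *scale,
                       float *scale_exposure)
{
  if (!fc.has_sample_count_pass) {
    *scale = fc.scale;
  }
  else {
    *scale = fc.pass_use_filter ? 1.0f / pixel_samples : 1.0f;
  }
  *scale_exposure = fc.pass_use_exposure ? *scale * fc.exposure : *scale;
}

int main()
{
  const FilmConvert fc = {true, true, true, 1.0f / 128.0f, 2.0f};
  float scale, scale_exposure;
  /* Adaptive sampling stopped this pixel after 16 samples. */
  get_scales(fc, 16, &scale, &scale_exposure);
  std::printf("scale = %f, scale * exposure = %f\n", scale, scale_exposure);
  return 0;
}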
+
+/* --------------------------------------------------------------------
+ * Float (scalar) passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_depth(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
+}
+
+ccl_device_inline void film_get_pass_pixel_mist(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ /* Note that we accumulate 1 - mist in the kernel to avoid having to
+ * track the mist values in the integrator state. */
+ pixel[0] = saturate(1.0f - f * scale_exposure);
+}
+
+ccl_device_inline void film_get_pass_pixel_sample_count(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+  /* TODO(sergey): Consider normalizing into the [0..1] range, so that a meaningful value is
+   * visible when the adaptive sampler stopped rendering the image well before the maximum
+   * number of samples was reached (for example, when the number of samples is set to 0 in
+   * the viewport). */
+
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = __float_as_uint(f) * kfilm_convert->scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = f * scale_exposure;
+}
+
+/* --------------------------------------------------------------------
+ * Float 3 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_light_path(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ /* Read light pass. */
+ const float *in = buffer + kfilm_convert->pass_offset;
+ float3 f = make_float3(in[0], in[1], in[2]);
+
+ /* Optionally add indirect light pass. */
+ if (kfilm_convert->pass_indirect != PASS_UNUSED) {
+ const float *in_indirect = buffer + kfilm_convert->pass_indirect;
+ const float3 f_indirect = make_float3(in_indirect[0], in_indirect[1], in_indirect[2]);
+ f += f_indirect;
+ }
+
+ /* Optionally divide out color. */
+ if (kfilm_convert->pass_divide != PASS_UNUSED) {
+ const float *in_divide = buffer + kfilm_convert->pass_divide;
+ const float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
+ f = safe_divide_even_color(f, f_divide);
+
+ /* Exposure only, sample scale cancels out. */
+ f *= kfilm_convert->exposure;
+ }
+ else {
+ /* Sample scale and exposure. */
+ f *= film_get_scale_exposure(kfilm_convert, buffer);
+ }
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_float3(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 f = make_float3(in[0], in[1], in[2]) * scale_exposure;
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+/* --------------------------------------------------------------------
+ * Float4 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_motion(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_motion_weight != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float *in_weight = buffer + kfilm_convert->pass_motion_weight;
+
+ const float weight = in_weight[0];
+ const float weight_inv = (weight > 0.0f) ? 1.0f / weight : 0.0f;
+
+ const float4 motion = make_float4(in[0], in[1], in[2], in[3]) * weight_inv;
+
+ pixel[0] = motion.x;
+ pixel[1] = motion.y;
+ pixel[2] = motion.z;
+ pixel[3] = motion.w;
+}
+
+ccl_device_inline void film_get_pass_pixel_cryptomatte(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float4 f = make_float4(in[0], in[1], in[2], in[3]);
+
+ /* x and z contain integer IDs, don't rescale them.
+ * y and w contain matte weights, they get scaled. */
+ pixel[0] = f.x;
+ pixel[1] = f.y * scale;
+ pixel[2] = f.z;
+ pixel[3] = f.w * scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float4(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = alpha;
+}
+
+ccl_device_inline void film_get_pass_pixel_combined(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+  kernel_assert(kfilm_convert->num_components == 4);
+
+  /* The last channel (index 3) contains transparency = 1 - alpha for the combined pass. */
+
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ pixel[0] = 0.0f;
+ pixel[1] = 0.0f;
+ pixel[2] = 0.0f;
+ pixel[3] = 0.0f;
+ return;
}
- return pass_result;
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = film_transparency_to_alpha(alpha);
}
-ccl_device float4 film_map(KernelGlobals *kg, float4 rgba_in, float scale)
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_device_inline float3
+film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- float4 result;
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
- /* Conversion to SRGB. */
- result.x = color_linear_to_srgb(rgba_in.x * scale);
- result.y = color_linear_to_srgb(rgba_in.y * scale);
- result.z = color_linear_to_srgb(rgba_in.z * scale);
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
- /* Clamp since alpha might be > 1.0 due to Russian roulette. */
- result.w = saturate(rgba_in.w * scale);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
- return result;
+ const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure;
+
+ return pixel;
}
-ccl_device uchar4 film_float_to_byte(float4 color)
+ccl_device_inline float3 safe_divide_shadow_catcher(float3 a, float3 b)
{
- uchar4 result;
+ float x, y, z;
- /* simple float to byte conversion */
- result.x = (uchar)(saturate(color.x) * 255.0f);
- result.y = (uchar)(saturate(color.y) * 255.0f);
- result.z = (uchar)(saturate(color.z) * 255.0f);
- result.w = (uchar)(saturate(color.w) * 255.0f);
+ x = (b.x != 0.0f) ? a.x / b.x : 1.0f;
+ y = (b.y != 0.0f) ? a.y / b.y : 1.0f;
+ z = (b.z != 0.0f) ? a.z / b.z : 1.0f;
- return result;
+ return make_float3(x, y, z);
}
-ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float3
+film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+  /* For the shadow catcher pass we divide the combined pass by the shadow catcher.
+   * Note that the denoised shadow catcher pass contains a value which only needs to be scaled
+   * (not computed as a division). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ if (kfilm_convert->is_denoised) {
+ return film_calculate_shadow_catcher_denoised(kfilm_convert, buffer);
+ }
- /* map colors */
- float4 float_result = film_map(kg, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
- uchar4 uchar_result = film_float_to_byte(float_result);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_sample_count != PASS_UNUSED);
- rgba += index;
- *rgba = uchar_result;
+ /* If there is no shadow catcher object in this pixel, there is no modification of the light
+ * needed, so return one. */
+ ccl_global const float *in_catcher_sample_count =
+ buffer + kfilm_convert->pass_shadow_catcher_sample_count;
+ const float num_samples = in_catcher_sample_count[0];
+ if (num_samples == 0.0f) {
+ return one_float3();
+ }
+
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
+
+  /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
+   * shadow catcher objects in the scene. In this case there will be no auxiliary passes required
+   * for the division (to save memory). So delay the asserts to this point so that the sample
+   * count check handles such a configuration. */
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined;
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ /* No scaling needed. The integration works in such a way that the number of samples in the
+ * combined and shadow catcher passes is the same, and exposure is cancelled during the
+ * division. */
+ const float3 color_catcher = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]);
+ const float3 color_combined = make_float3(in_combined[0], in_combined[1], in_combined[2]);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]);
+
+ /* Need to ignore the contribution of the matte object when doing the division (otherwise there
+ * will be artifacts caused by anti-aliasing). Since the combined pass is used for adaptive
+ * sampling and needs to contain matte objects, we subtract the matte objects' contribution here.
+ * This is the same as if the matte objects were not accumulated into the combined pass. */
+ const float3 combined_no_matte = color_combined - color_matte;
+
+ const float3 shadow_catcher = safe_divide_shadow_catcher(combined_no_matte, color_catcher);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+ const float transparency = in_combined[3] * scale;
+ const float alpha = film_transparency_to_alpha(transparency);
+
+ /* Alpha-over on white using the transparency of the combined pass. This eliminates artifacts
+ * which happen on the edge of a shadow catcher when using transparent film. Note that we treat
+ * the shadow catcher as straight alpha here because the alpha got cancelled out during the
+ * division. */
+ const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher;
+
+ return pixel;
}
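To make the math above easier to follow, here is a minimal host-side sketch of the same per-pixel computation (plain C++ with a local float3 type; not kernel code): the matte contribution is subtracted from the combined pass, the remainder is divided per channel by the catcher pass, and the result is alpha-overed on white.

struct float3 {
  float x, y, z;
};

/* Per-channel division that falls back to 1.0 where the catcher is black,
 * mirroring safe_divide_shadow_catcher() above. */
static float3 safe_divide(const float3 a, const float3 b)
{
  return {(b.x != 0.0f) ? a.x / b.x : 1.0f,
          (b.y != 0.0f) ? a.y / b.y : 1.0f,
          (b.z != 0.0f) ? a.z / b.z : 1.0f};
}

/* Shadow catcher value for one pixel: combined and matte are the accumulated
 * passes, catcher is the shadow catcher pass, alpha comes from the transparency
 * stored in the combined pass. */
static float3 shadow_catcher_pixel(const float3 combined,
                                   const float3 matte,
                                   const float3 catcher,
                                   const float alpha)
{
  const float3 no_matte = {combined.x - matte.x, combined.y - matte.y, combined.z - matte.z};
  const float3 ratio = safe_divide(no_matte, catcher);
  /* Alpha-over on white, as in film_calculate_shadow_catcher(). */
  return {(1.0f - alpha) + alpha * ratio.x,
          (1.0f - alpha) + alpha * ratio.y,
          (1.0f - alpha) + alpha * ratio.z};
}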
-ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+ /* The approximation of the shadow is 1 - average(shadow_catcher_pass). A better approximation
+ * is possible.
+ *
+ * The matte is alpha-overed onto the shadow (which amounts to alpha-overing the shadow onto the
+ * footage, and then alpha-overing the synthetic objects on top). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure;
+
+ const float transparency = in_matte[3] * scale;
+ const float alpha = saturate(1.0f - transparency);
+
+ const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha;
+
+ if (kfilm_convert->use_approximate_shadow_catcher_background) {
+ kernel_assert(kfilm_convert->pass_background != PASS_UNUSED);
+
+ ccl_global const float *in_background = buffer + kfilm_convert->pass_background;
+ const float3 color_background = make_float3(
+ in_background[0], in_background[1], in_background[2]) *
+ scale_exposure;
+ const float3 alpha_over = color_matte + color_background * (1.0f - alpha_matte);
+ return make_float4(alpha_over.x, alpha_over.y, alpha_over.z, 1.0f);
+ }
- ccl_global half *out = (ccl_global half *)rgba + index * 4;
- float4_store_half(out, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
+ return make_float4(color_matte.x, color_matte.y, color_matte.z, alpha_matte);
+}
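As a quick numeric check of the matte-with-shadow composition above (standalone C++; the helper name and the input values are made up for illustration), the matte alpha combines the matte's own alpha with the approximated shadow 1 - average(shadow_catcher):

#include <cstdio>

/* alpha_matte = (1 - alpha) * (1 - average(shadow_catcher)) + alpha, as in
 * film_calculate_shadow_catcher_matte_with_shadow(). */
static float matte_alpha(const float alpha, const float catcher_r, const float catcher_g, const float catcher_b)
{
  const float average = (catcher_r + catcher_g + catcher_b) / 3.0f;
  return (1.0f - alpha) * (1.0f - average) + alpha;
}

int main()
{
  /* A fully transparent matte over a half-shadowed catcher still receives alpha 0.5
   * from the approximated shadow. */
  std::printf("%.2f\n", matte_alpha(0.0f, 0.5f, 0.5f, 0.5f)); /* prints 0.50 */
  /* A fully opaque matte keeps alpha 1 regardless of the catcher value. */
  std::printf("%.2f\n", matte_alpha(1.0f, 0.5f, 0.5f, 0.5f)); /* prints 1.00 */
  return 0;
}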
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+
+ const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 3 || kfilm_convert->num_components == 4);
+
+ const float4 pixel_value = film_calculate_shadow_catcher_matte_with_shadow(kfilm_convert,
+ buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+ if (kfilm_convert->num_components == 4) {
+ pixel[3] = pixel_value.w;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Compositing and overlays.
+ */
+
+ccl_device_inline void film_apply_pass_pixel_overlays_rgba(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ if (kfilm_convert->show_active_pixels &&
+ kfilm_convert->pass_adaptive_aux_buffer != PASS_UNUSED) {
+ if (buffer[kfilm_convert->pass_adaptive_aux_buffer + 3] == 0.0f) {
+ const float3 active_rgb = make_float3(1.0f, 0.0f, 0.0f);
+ const float3 mix_rgb = interp(make_float3(pixel[0], pixel[1], pixel[2]), active_rgb, 0.5f);
+ pixel[0] = mix_rgb.x;
+ pixel[1] = mix_rgb.y;
+ pixel[2] = mix_rgb.z;
+ }
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
deleted file mode 100644
index 70aed6d54ed..00000000000
--- a/intern/cycles/kernel/kernel_globals.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Constant Globals */
-
-#ifndef __KERNEL_GLOBALS_H__
-#define __KERNEL_GLOBALS_H__
-
-#include "kernel/kernel_profiling.h"
-
-#ifdef __KERNEL_CPU__
-# include "util/util_map.h"
-# include "util/util_vector.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "util/util_atomic.h"
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
- * the kernel, to access constant data. These are all stored as "textures", but
- * these are really just standard arrays. We can't use actually globals because
- * multiple renders may be running inside the same process. */
-
-#ifdef __KERNEL_CPU__
-
-# ifdef __OSL__
-struct OSLGlobals;
-struct OSLThreadData;
-struct OSLShadingSystem;
-# endif
-
-typedef unordered_map<float, float> CoverageMap;
-
-struct Intersection;
-struct VolumeStep;
-
-typedef struct KernelGlobals {
-# define KERNEL_TEX(type, name) texture<type> name;
-# include "kernel/kernel_textures.h"
-
- KernelData __data;
-
-# ifdef __OSL__
- /* On the CPU, we also have the OSL globals here. Most data structures are shared
- * with SVM, the difference is in the shaders and object/mesh attributes. */
- OSLGlobals *osl;
- OSLShadingSystem *osl_ss;
- OSLThreadData *osl_tdata;
-# endif
-
- /* **** Run-time data **** */
-
- /* Heap-allocated storage for transparent shadows intersections. */
- Intersection *transparent_shadow_intersections;
-
- /* Storage for decoupled volume steps. */
- VolumeStep *decoupled_volume_steps[2];
- int decoupled_volume_steps_index;
-
- /* A buffer for storing per-pixel coverage for Cryptomatte. */
- CoverageMap *coverage_object;
- CoverageMap *coverage_material;
- CoverageMap *coverage_asset;
-
- /* split kernel */
- SplitData split_data;
- SplitParams split_param_data;
-
- int2 global_size;
- int2 global_id;
-
- ProfilingState profiler;
-} KernelGlobals;
-
-#endif /* __KERNEL_CPU__ */
-
-#ifdef __KERNEL_OPTIX__
-
-typedef struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-} ShaderParams;
-
-typedef struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-} KernelParams;
-
-typedef struct KernelGlobals {
-# ifdef __VOLUME__
- VolumeState volume_state;
-# endif
- Intersection hits_stack[64];
-} KernelGlobals;
-
-extern "C" __constant__ KernelParams __params;
-
-#else /* __KERNEL_OPTIX__ */
-
-/* For CUDA, constant memory textures must be globals, so we can't put them
- * into a struct. As a result we don't actually use this struct and use actual
- * globals and simply pass along a NULL pointer everywhere, which we hope gets
- * optimized out. */
-
-# ifdef __KERNEL_CUDA__
-
-__constant__ KernelData __data;
-typedef struct KernelGlobals {
- /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
- Intersection hits_stack[64];
-} KernelGlobals;
-
-# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
-# include "kernel/kernel_textures.h"
-
-# endif /* __KERNEL_CUDA__ */
-
-#endif /* __KERNEL_OPTIX__ */
-
-/* OpenCL */
-
-#ifdef __KERNEL_OPENCL__
-
-# define KERNEL_TEX(type, name) typedef type name##_t;
-# include "kernel/kernel_textures.h"
-
-typedef ccl_addr_space struct KernelGlobals {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-
-# ifdef __SPLIT_KERNEL__
- SplitData split_data;
- SplitParams split_param_data;
-# endif
-} KernelGlobals;
-
-# define KERNEL_BUFFER_PARAMS \
- ccl_global char *buffer0, ccl_global char *buffer1, ccl_global char *buffer2, \
- ccl_global char *buffer3, ccl_global char *buffer4, ccl_global char *buffer5, \
- ccl_global char *buffer6, ccl_global char *buffer7
-
-# define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
-
-ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- kg->buffers[0] = buffer0;
- kg->buffers[1] = buffer1;
- kg->buffers[2] = buffer2;
- kg->buffers[3] = buffer3;
- kg->buffers[4] = buffer4;
- kg->buffers[5] = buffer5;
- kg->buffers[6] = buffer6;
- kg->buffers[7] = buffer7;
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- ccl_global TextureInfo *info = (ccl_global TextureInfo *)kg->buffers[0];
-
-# define KERNEL_TEX(type, name) kg->name = *(info++);
-# include "kernel/kernel_textures.h"
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-#endif /* __KERNEL_OPENCL__ */
-
-/* Interpolated lookup table access */
-
-ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
-{
- x = saturate(x) * (size - 1);
-
- int index = min(float_to_int(x), size - 1);
- int nindex = min(index + 1, size - 1);
- float t = x - index;
-
- float data0 = kernel_tex_fetch(__lookup_table, index + offset);
- if (t == 0.0f)
- return data0;
-
- float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
- return (1.0f - t) * data0 + t * data1;
-}
-
-ccl_device float lookup_table_read_2D(
- KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
-{
- y = saturate(y) * (ysize - 1);
-
- int index = min(float_to_int(y), ysize - 1);
- int nindex = min(index + 1, ysize - 1);
- float t = y - index;
-
- float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
- if (t == 0.0f)
- return data0;
-
- float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
- return (1.0f - t) * data0 + t * data1;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
index 1ca42e933d1..ed01f494f98 100644
--- a/intern/cycles/kernel/kernel_id_passes.h
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -14,8 +14,18 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
+/* Element of ID pass stored in the render buffers.
+ * It is `float2` semantically, but it must be unaligned since the offset of ID passes in the
+ * render buffers might not meet the alignment expected by the compiler. */
+typedef struct IDPassBufferElement {
+ float x;
+ float y;
+} IDPassBufferElement;
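A small standalone check of the reasoning in the comment above (plain C++; the pass offset of 3 floats is a made-up value, and the alignment claim holds on common ABIs): two plain floats never require stricter alignment than a single float, so a pointer into the render buffer at any float offset is valid, which an 8-byte aligned vector float2 type would not guarantee.

struct IDPassBufferElement {
  float x, y;
};

/* Holds on common ABIs: the struct is addressable at any float offset. */
static_assert(alignof(IDPassBufferElement) == alignof(float),
              "ID pass element must not require stricter alignment than float");

/* Hypothetical layout: the ID pass starts at float index 3, a 12-byte offset
 * that is not 8-byte aligned. */
static IDPassBufferElement *id_pass_start(float *render_buffer)
{
  return reinterpret_cast<IDPassBufferElement *>(render_buffer + 3);
}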
+
ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
int num_slots,
float id,
@@ -27,7 +37,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
}
for (int slot = 0; slot < num_slots; slot++) {
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
#ifdef __ATOMIC_PASS_WRITE__
/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
if (id_buffer[slot].x == ID_NONE) {
@@ -65,7 +75,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
{
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
for (int slot = 1; slot < num_slots; ++slot) {
if (id_buffer[slot].x == ID_NONE) {
return;
@@ -73,7 +83,7 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
/* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
int i = slot;
while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
- float2 swap = id_buffer[i];
+ const IDPassBufferElement swap = id_buffer[i];
id_buffer[i] = id_buffer[i - 1];
id_buffer[i - 1] = swap;
--i;
@@ -81,19 +91,16 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
}
}
-#ifdef __KERNEL_GPU__
/* post-sorting for Cryptomatte */
-ccl_device void kernel_cryptomatte_post(
- KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride)
+ccl_device_inline void kernel_cryptomatte_post(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
{
- if (sample - 1 == kernel_data.integrator.aa_samples) {
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
- ccl_global float *cryptomatte_buffer = buffer + index * pass_stride +
- kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
+ const int pass_stride = kernel_data.film.pass_stride;
+ const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride;
+ ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset +
+ kernel_data.film.pass_cryptomatte;
+ kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
}
-#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index f4e60a807f7..354e8115538 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -14,93 +14,27 @@
* limitations under the License.
*/
-/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
-
+#pragma once
CCL_NAMESPACE_BEGIN
-/* "Correlated Multi-Jittered Sampling"
- * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */
-
-/* TODO: find good value, suggested 64 gives pattern on cornell box ceiling. */
-#define CMJ_RANDOM_OFFSET_LIMIT 4096
-
-ccl_device_inline bool cmj_is_pow2(int i)
+ccl_device_inline uint32_t laine_karras_permutation(uint32_t x, uint32_t seed)
{
- return (i > 1) && ((i & (i - 1)) == 0);
-}
+ x += seed;
+ x ^= (x * 0x6c50b47cu);
+ x ^= x * 0xb82f1e52u;
+ x ^= x * 0xc7afe638u;
+ x ^= x * 0x8d22f6e6u;
-ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
-{
- return (a & (b - 1));
+ return x;
}
-/* b must be > 1 */
-ccl_device_inline int cmj_fast_div_pow2(int a, int b)
+ccl_device_inline uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed)
{
- kernel_assert(b > 1);
- return a >> count_trailing_zeros(b);
-}
+ x = reverse_integer_bits(x);
+ x = laine_karras_permutation(x, seed);
+ x = reverse_integer_bits(x);
-ccl_device_inline uint cmj_w_mask(uint w)
-{
- kernel_assert(w > 1);
- return ((1 << (32 - count_leading_zeros(w))) - 1);
-}
-
-ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
-{
- uint w = l - 1;
-
- if ((l & w) == 0) {
- /* l is a power of two (fast) */
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
-
- return (i + p) & w;
- }
- else {
- /* l is not a power of two (slow) */
- w = cmj_w_mask(w);
-
- do {
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
- } while (i >= l);
-
- return (i + p) % l;
- }
+ return x;
}
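For reference, the same Owen-style scramble as a standalone sketch (plain C++; reverse_bits is a local stand-in for the kernel's reverse_integer_bits): the sample index is bit-reversed, permuted with the Laine-Karras hash, and reversed back, giving a hierarchical (Owen-style) shuffle of the sample indices.

#include <cstdint>

/* Bit reversal, standing in for reverse_integer_bits(). */
static uint32_t reverse_bits(uint32_t x)
{
  x = ((x & 0x55555555u) << 1) | ((x >> 1) & 0x55555555u);
  x = ((x & 0x33333333u) << 2) | ((x >> 2) & 0x33333333u);
  x = ((x & 0x0f0f0f0fu) << 4) | ((x >> 4) & 0x0f0f0f0fu);
  x = ((x & 0x00ff00ffu) << 8) | ((x >> 8) & 0x00ff00ffu);
  return (x << 16) | (x >> 16);
}

static uint32_t laine_karras_permutation(uint32_t x, uint32_t seed)
{
  x += seed;
  x ^= x * 0x6c50b47cu;
  x ^= x * 0xb82f1e52u;
  x ^= x * 0xc7afe638u;
  x ^= x * 0x8d22f6e6u;
  return x;
}

/* Owen-style scramble: permute in reversed-bit order, then undo the reversal. */
static uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed)
{
  return reverse_bits(laine_karras_permutation(reverse_bits(x), seed));
}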
ccl_device_inline uint cmj_hash(uint i, uint p)
@@ -133,99 +67,101 @@ ccl_device_inline float cmj_randfloat(uint i, uint p)
return cmj_hash(i, p) * (1.0f / 4294967808.0f);
}
-#ifdef __CMJ__
-ccl_device float cmj_sample_1D(int s, int N, int p)
+ccl_device_inline float cmj_randfloat_simple(uint i, uint p)
{
- kernel_assert(s < N);
-
- uint x = cmj_permute(s, N, p * 0x68bc21eb);
- float jx = cmj_randfloat(s, p * 0x967a889b);
-
- float invN = 1.0f / N;
- return (x + jx) * invN;
+ return cmj_hash_simple(i, p) * (1.0f / (float)0xFFFFFFFF);
}
-/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
-ccl_device_inline int cmj_isqrt(int value)
+ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension)
{
-# if defined(__KERNEL_CUDA__)
- return float_to_int(__fsqrt_ru(value));
-# elif defined(__KERNEL_GPU__)
- return float_to_int(sqrtf(value));
-# else
- /* This is a work around for fast-math on CPU which might replace sqrtf()
- * with am approximated version.
- */
- return float_to_int(sqrtf(value) + 1e-6f);
-# endif
-}
+ /* Each PMJ sample set contains NUM_PMJ_SAMPLES (x, y) pairs, so for 1D only the x part is used
+ * as the sample (TODO(@leesonw): Add using both x and y parts independently). */
+
+ /* Perform Owen shuffle of the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
-{
- kernel_assert(s < N);
+ /* Based on the sample number a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
+
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
- int m = cmj_isqrt(N);
- int n = (N - 1) / m + 1;
- float invN = 1.0f / N;
- float invm = 1.0f / m;
- float invn = 1.0f / n;
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+# else
+ /* Only jitter within the grid interval. */
+ float dx = cmj_randfloat(d, rng_hash);
+# endif
+ fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES);
+ fx = fx - floorf(fx);
- s = cmj_permute(s, N, p * 0x51633e2d);
+#else
+# warning "Not using Cranley-Patterson Rotation."
+#endif
- int sdivm, smodm;
+ return fx;
+}
- if (cmj_is_pow2(m)) {
- sdivm = cmj_fast_div_pow2(s, m);
- smodm = cmj_fast_mod_pow2(s, m);
- }
- else {
- /* Doing `s * inmv` gives precision issues here. */
- sdivm = s / m;
- smodm = s - sdivm * m;
- }
+ccl_device void pmj_sample_2D(
+ const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension, float *x, float *y)
+{
+ /* Perform a shuffle on the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
- uint sx = cmj_permute(smodm, m, p * 0x68bc21eb);
- uint sy = cmj_permute(sdivm, n, p * 0x02e5be93);
+ /* Based on the sample number a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
- float jx = cmj_randfloat(s, p * 0x967a889b);
- float jy = cmj_randfloat(s, p * 0x368cc8b7);
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
+ float fy = kernel_tex_fetch(__sample_pattern_lut, index + 1);
- *fx = (sx + (sy + jx) * invn) * invm;
- *fy = (s + jy) * invN;
-}
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+ float dy = cmj_randfloat_simple(d + 1, rng_hash);
+# else
+ float dx = cmj_randfloat(d, rng_hash);
+ float dy = cmj_randfloat(d + 1, rng_hash);
+# endif
+ /* Only jitter within the grid cells. */
+ fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS);
+ fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS);
+ fx = fx - floorf(fx);
+ fy = fy - floorf(fy);
+#else
+# warning "Not using Cranley Patterson Rotation."
#endif
-ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- /* Fallback to random */
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- return cmj_randfloat(sample, p);
- }
- else {
- const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f;
- }
-}
-
-ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- const float fx = cmj_randfloat(sample, p);
- const float fy = cmj_randfloat(sample, p + 1);
- return make_float2(fx, fy);
- }
- else {
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff;
- const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f;
- const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) -
- 1.0f;
- return make_float2(fx, fy);
- }
+ (*x) = fx;
+ (*y) = fy;
}
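The Cranley-Patterson rotation used in both samplers above can be sketched in isolation (plain C++; the numbers in the comment are made up for illustration): the per-dimension offset is added and the coordinate is wrapped back into [0, 1) with floor(), exactly as the fx = fx - floorf(fx) lines do.

#include <cmath>

/* Add a rotation offset and wrap back into [0, 1). */
static float cranley_patterson_rotate(const float coord, const float offset)
{
  const float rotated = coord + offset;
  return rotated - std::floor(rotated);
}

/* Example: a sample coordinate of 0.9 rotated by 0.25 wraps around to 0.15
 * (up to float rounding). */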
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 42a834d2ce3..52f641634b9 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -14,7 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
+#include "geom/geom.h"
+
#include "kernel_light_background.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+#include "kernel_types.h"
CCL_NAMESPACE_BEGIN
@@ -37,10 +44,22 @@ typedef struct LightSample {
/* Regular Light */
-ccl_device_inline bool lamp_light_sample(
- KernelGlobals *kg, int lamp, float randu, float randv, float3 P, LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_inline bool light_sample(const KernelGlobals *kg,
+ const int lamp,
+ const float randu,
+ const float randv,
+ const float3 P,
+ const int path_flag,
+ LightSample *ls)
{
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ return false;
+ }
+ }
+
LightType type = (LightType)klight->type;
ls->type = type;
ls->shader = klight->shader_id;
@@ -50,6 +69,18 @@ ccl_device_inline bool lamp_light_sample(
ls->u = randu;
ls->v = randv;
+ if (in_volume_segment && (type == LIGHT_DISTANT || type == LIGHT_BACKGROUND)) {
+ /* Distant lights in a volume get a dummy sample, position will not actually
+ * be used in that case. Only when sampling from a specific scatter position
+ * do we actually need to evaluate these. */
+ ls->P = zero_float3();
+ ls->Ng = zero_float3();
+ ls->D = zero_float3();
+ ls->pdf = 1.0f;
+ ls->t = FLT_MAX;
+ return true;
+ }
+
if (type == LIGHT_DISTANT) {
/* distant light */
float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
@@ -123,13 +154,15 @@ ccl_device_inline bool lamp_light_sample(
float invarea = fabsf(klight->area.invarea);
bool is_round = (klight->area.invarea < 0.0f);
- if (dot(ls->P - P, Ng) > 0.0f) {
- return false;
+ if (!in_volume_segment) {
+ if (dot(ls->P - P, Ng) > 0.0f) {
+ return false;
+ }
}
float3 inplane;
- if (is_round) {
+ if (is_round || in_volume_segment) {
inplane = ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
ls->P += inplane;
ls->pdf = invarea;
@@ -176,79 +209,180 @@ ccl_device_inline bool lamp_light_sample(
return (ls->pdf > 0.0f);
}
-ccl_device bool lamp_light_eval(
- KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
+ccl_device bool lights_intersect(const KernelGlobals *ccl_restrict kg,
+ const Ray *ccl_restrict ray,
+ Intersection *ccl_restrict isect,
+ const int last_prim,
+ const int last_object,
+ const int last_type,
+ const int path_flag)
{
- const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- LightType type = (LightType)klight->type;
- ls->type = type;
- ls->shader = klight->shader_id;
- ls->object = PRIM_NONE;
- ls->prim = PRIM_NONE;
- ls->lamp = lamp;
- /* todo: missing texture coordinates */
- ls->u = 0.0f;
- ls->v = 0.0f;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- if (!(ls->shader & SHADER_USE_MIS))
- return false;
+ if (path_flag & PATH_RAY_CAMERA) {
+ if (klight->shader_id & SHADER_EXCLUDE_CAMERA) {
+ continue;
+ }
+ }
+ else {
+ if (!(klight->shader_id & SHADER_USE_MIS)) {
+ continue;
+ }
+ }
- if (type == LIGHT_DISTANT) {
- /* distant light */
- float radius = klight->distant.radius;
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ continue;
+ }
+ }
- if (radius == 0.0f)
- return false;
- if (t != FLT_MAX)
- return false;
+ LightType type = (LightType)klight->type;
+ float t = 0.0f, u = 0.0f, v = 0.0f;
- /* a distant light is infinitely far away, but equivalent to a disk
- * shaped light exactly 1 unit away from the current shading point.
- *
- * radius t^2/cos(theta)
- * <----------> t = sqrt(1^2 + tan(theta)^2)
- * tan(th) area = radius*radius*pi
- * <----->
- * \ | (1 + tan(theta)^2)/cos(theta)
- * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
- * t \th| 1 simplifies to
- * \-| 1/(cos(theta)^3)
- * \| magic!
- * P
- */
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ /* Sphere light. */
+ const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ const float radius = klight->spot.radius;
+ if (radius == 0.0f) {
+ continue;
+ }
- float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float costheta = dot(-lightD, D);
- float cosangle = klight->distant.cosangle;
+ float3 P;
+ if (!ray_aligned_disk_intersect(ray->P, ray->D, ray->t, lightP, radius, &P, &t)) {
+ continue;
+ }
+ }
+ else if (type == LIGHT_AREA) {
+ /* Area light. */
+ const float invarea = fabsf(klight->area.invarea);
+ const bool is_round = (klight->area.invarea < 0.0f);
+ if (invarea == 0.0f) {
+ continue;
+ }
- if (costheta < cosangle)
- return false;
+ const float3 axisu = make_float3(
+ klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+ const float3 axisv = make_float3(
+ klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+ const float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
- ls->P = -D;
- ls->Ng = -D;
- ls->D = D;
- ls->t = FLT_MAX;
+ /* One sided. */
+ if (dot(ray->D, Ng) >= 0.0f) {
+ continue;
+ }
- /* compute pdf */
- float invarea = klight->distant.invarea;
- ls->pdf = invarea / (costheta * costheta * costheta);
- ls->eval_fac = ls->pdf;
+ const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+
+ float3 P;
+ if (!ray_quad_intersect(
+ ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) {
+ continue;
+ }
+ }
+ else {
+ continue;
+ }
+
+ if (t < isect->t &&
+ !(last_prim == lamp && last_object == OBJECT_NONE && last_type == PRIMITIVE_LAMP)) {
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
+ isect->type = PRIMITIVE_LAMP;
+ isect->prim = lamp;
+ isect->object = OBJECT_NONE;
+ }
+ }
+
+ return isect->prim != PRIM_NONE;
+}
+
+ccl_device bool light_sample_from_distant_ray(const KernelGlobals *ccl_restrict kg,
+ const float3 ray_D,
+ const int lamp,
+ LightSample *ccl_restrict ls)
+{
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ const int shader = klight->shader_id;
+ const float radius = klight->distant.radius;
+ const LightType type = (LightType)klight->type;
+
+ if (type != LIGHT_DISTANT) {
+ return false;
+ }
+ if (!(shader & SHADER_USE_MIS)) {
+ return false;
+ }
+ if (radius == 0.0f) {
+ return false;
}
- else if (type == LIGHT_POINT || type == LIGHT_SPOT) {
- float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float radius = klight->spot.radius;
+ /* a distant light is infinitely far away, but equivalent to a disk
+ * shaped light exactly 1 unit away from the current shading point.
+ *
+ * radius t^2/cos(theta)
+ * <----------> t = sqrt(1^2 + tan(theta)^2)
+ * tan(th) area = radius*radius*pi
+ * <----->
+ * \ | (1 + tan(theta)^2)/cos(theta)
+ * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
+ * t \th| 1 simplifies to
+ * \-| 1/(cos(theta)^3)
+ * \| magic!
+ * P
+ */
+
+ float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ float costheta = dot(-lightD, ray_D);
+ float cosangle = klight->distant.cosangle;
+
+ if (costheta < cosangle)
+ return false;
- /* sphere light */
- if (radius == 0.0f)
- return false;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* todo: missing texture coordinates */
+ ls->u = 0.0f;
+ ls->v = 0.0f;
+ ls->t = FLT_MAX;
+ ls->P = -ray_D;
+ ls->Ng = -ray_D;
+ ls->D = ray_D;
+
+ /* compute pdf */
+ float invarea = klight->distant.invarea;
+ ls->pdf = invarea / (costheta * costheta * costheta);
+ ls->pdf *= kernel_data.integrator.pdf_lights;
+ ls->eval_fac = ls->pdf;
- if (!ray_aligned_disk_intersect(P, D, t, lightP, radius, &ls->P, &ls->t)) {
- return false;
- }
+ return true;
+}
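The pdf computed above follows from the diagram in the comment; as a hedged standalone restatement (plain C++, not kernel code), the distant light is treated as a disk one unit away along the light direction, and converting its area pdf (invarea) to a solid-angle pdf at viewing angle theta introduces the 1/cos^3(theta) factor.

/* Solid-angle pdf of sampling a disk with area pdf `invarea`, placed one unit
 * away and seen at angle theta: pdf_area * t^2 / cos(theta) with t = 1 / cos(theta),
 * which simplifies to invarea / cos(theta)^3. */
static float distant_light_pdf(const float invarea, const float cos_theta)
{
  const float t = 1.0f / cos_theta; /* distance to the tilted disk */
  return invarea * t * t / cos_theta;
}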
- ls->Ng = -D;
- ls->D = D;
+ccl_device bool light_sample_from_intersection(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect,
+ const float3 ray_P,
+ const float3 ray_D,
+ LightSample *ccl_restrict ls)
+{
+ const int lamp = isect->prim;
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ LightType type = (LightType)klight->type;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* todo: missing texture coordinates */
+ ls->t = isect->t;
+ ls->P = ray_P + ray_D * ls->t;
+ ls->D = ray_D;
+
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ ls->Ng = -ray_D;
float invarea = klight->spot.invarea;
ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
@@ -260,8 +394,9 @@ ccl_device bool lamp_light_eval(
ls->eval_fac *= spot_light_attenuation(
dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
- if (ls->eval_fac == 0.0f)
+ if (ls->eval_fac == 0.0f) {
return false;
+ }
}
float2 uv = map_to_sphere(ls->Ng);
ls->u = uv.x;
@@ -274,31 +409,22 @@ ccl_device bool lamp_light_eval(
else if (type == LIGHT_AREA) {
/* area light */
float invarea = fabsf(klight->area.invarea);
- bool is_round = (klight->area.invarea < 0.0f);
- if (invarea == 0.0f)
- return false;
float3 axisu = make_float3(
klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
float3 axisv = make_float3(
klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-
- /* one sided */
- if (dot(D, Ng) >= 0.0f)
- return false;
-
float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- if (!ray_quad_intersect(
- P, D, 0.0f, t, light_P, axisu, axisv, Ng, &ls->P, &ls->t, &ls->u, &ls->v, is_round)) {
- return false;
- }
-
- ls->D = D;
+ ls->u = isect->u;
+ ls->v = isect->v;
+ ls->D = ray_D;
ls->Ng = Ng;
+
+ const bool is_round = (klight->area.invarea < 0.0f);
if (is_round) {
- ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t);
+ ls->pdf = invarea * lamp_light_pdf(kg, Ng, -ray_D, ls->t);
}
else {
float3 sample_axisu = axisu;
@@ -306,12 +432,12 @@ ccl_device bool lamp_light_eval(
if (klight->area.tan_spread > 0.0f) {
if (!light_spread_clamp_area_light(
- P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
+ ray_P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
return false;
}
}
- ls->pdf = rect_light_sample(P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
+ ls->pdf = rect_light_sample(ray_P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
}
ls->eval_fac = 0.25f * invarea;
@@ -325,6 +451,7 @@ ccl_device bool lamp_light_eval(
}
}
else {
+ kernel_assert(!"Invalid lamp type in light_sample_from_intersection");
return false;
}
@@ -337,7 +464,7 @@ ccl_device bool lamp_light_eval(
/* returns true if the triangle is has motion blur or an instancing transform applied */
ccl_device_inline bool triangle_world_space_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 V[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 V[3])
{
bool has_motion = false;
const int object_flag = kernel_tex_fetch(__object_flag, object);
@@ -365,7 +492,7 @@ ccl_device_inline bool triangle_world_space_vertices(
return has_motion;
}
-ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
+ccl_device_inline float triangle_light_pdf_area(const KernelGlobals *kg,
const float3 Ng,
const float3 I,
float t)
@@ -379,7 +506,9 @@ ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
return t * t * pdf / cos_pi;
}
-ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+ccl_device_forceinline float triangle_light_pdf(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float t)
{
/* A naive heuristic to decide between costly solid angle sampling
* and simple area sampling, comparing the distance to the triangle plane
@@ -448,7 +577,8 @@ ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *s
}
}
-ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
+template<bool in_volume_segment>
+ccl_device_forceinline void triangle_light_sample(const KernelGlobals *kg,
int prim,
int object,
float randu,
@@ -488,7 +618,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0));
- if (longest_edge_squared > distance_to_plane * distance_to_plane) {
+ if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) {
/* see James Arvo, "Stratified Sampling of Spherical Triangles"
* http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
@@ -617,7 +747,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
/* Light Distribution */
-ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
+ccl_device int light_distribution_sample(const KernelGlobals *kg, float *randu)
{
/* This is basically std::upper_bound as used by PBRT, to find a point light or
* triangle to emit from, proportional to area. a good improvement would be to
@@ -655,51 +785,93 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
/* Generic Light */
-ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+ccl_device_inline bool light_select_reached_max_bounces(const KernelGlobals *kg,
+ int index,
+ int bounce)
{
return (bounce > kernel_tex_fetch(__lights, index).max_bounces);
}
-ccl_device_noinline bool light_sample(KernelGlobals *kg,
- int lamp,
- float randu,
- float randv,
- float time,
- float3 P,
- int bounce,
- LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_noinline bool light_distribution_sample(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
{
- if (lamp < 0) {
- /* sample index */
- int index = light_distribution_sample(kg, &randu);
-
- /* fetch light data */
- const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(
- __light_distribution, index);
- int prim = kdistribution->prim;
-
- if (prim >= 0) {
- int object = kdistribution->mesh_light.object_id;
- int shader_flag = kdistribution->mesh_light.shader_flag;
-
- triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
- ls->shader |= shader_flag;
- return (ls->pdf > 0.0f);
+ /* Sample light index from distribution. */
+ const int index = light_distribution_sample(kg, &randu);
+ const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution,
+ index);
+ const int prim = kdistribution->prim;
+
+ if (prim >= 0) {
+ /* Mesh light. */
+ const int object = kdistribution->mesh_light.object_id;
+
+ /* Exclude synthetic meshes from shadow catcher pass. */
+ if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) {
+ return false;
}
- lamp = -prim - 1;
+ const int shader_flag = kdistribution->mesh_light.shader_flag;
+ triangle_light_sample<in_volume_segment>(kg, prim, object, randu, randv, time, ls, P);
+ ls->shader |= shader_flag;
+ return (ls->pdf > 0.0f);
}
+ const int lamp = -prim - 1;
+
if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
return false;
}
- return lamp_light_sample(kg, lamp, randu, randv, P, ls);
+ return light_sample<in_volume_segment>(kg, lamp, randu, randv, P, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_volume_segment(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<true>(kg, randu, randv, time, P, bounce, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_position(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<false>(kg, randu, randv, time, P, bounce, path_flag, ls);
}
-ccl_device_inline int light_select_num_samples(KernelGlobals *kg, int index)
+ccl_device_inline bool light_distribution_sample_new_position(const KernelGlobals *kg,
+ const float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ LightSample *ls)
{
- return kernel_tex_fetch(__lights, index).samples;
+ /* Sample a new position on the same light, for volume sampling. */
+ if (ls->type == LIGHT_TRIANGLE) {
+ triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P);
+ return (ls->pdf > 0.0f);
+ }
+ else {
+ return light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls);
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h
index f0f64ce8704..493ed560bc6 100644
--- a/intern/cycles/kernel/kernel_light_background.h
+++ b/intern/cycles/kernel/kernel_light_background.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
#include "kernel_light_common.h"
CCL_NAMESPACE_BEGIN
@@ -22,7 +24,10 @@ CCL_NAMESPACE_BEGIN
#ifdef __BACKGROUND_MIS__
-ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+ccl_device float3 background_map_sample(const KernelGlobals *kg,
+ float randu,
+ float randv,
+ float *pdf)
{
/* for the following, the CDF values are actually a pair of floats, with the
* function value as X and the actual CDF as Y. The last entry's function
@@ -104,7 +109,7 @@ ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float ra
/* TODO(sergey): Same as above, after the release we should consider using
* 'noinline' for all devices.
*/
-ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
+ccl_device float background_map_pdf(const KernelGlobals *kg, float3 direction)
{
float2 uv = direction_to_equirectangular(direction);
int res_x = kernel_data.background.map_res_x;
@@ -138,7 +143,7 @@ ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
}
ccl_device_inline bool background_portal_data_fetch_and_check_side(
- KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
+ const KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
{
int portal = kernel_data.background.portal_offset + index;
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
@@ -154,7 +159,7 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(
}
ccl_device_inline float background_portal_pdf(
- KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
+ const KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
{
float portal_pdf = 0.0f;
@@ -214,7 +219,7 @@ ccl_device_inline float background_portal_pdf(
return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
}
-ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+ccl_device int background_num_possible_portals(const KernelGlobals *kg, float3 P)
{
int num_possible_portals = 0;
for (int p = 0; p < kernel_data.background.num_portals; p++) {
@@ -225,7 +230,7 @@ ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
return num_possible_portals;
}
-ccl_device float3 background_portal_sample(KernelGlobals *kg,
+ccl_device float3 background_portal_sample(const KernelGlobals *kg,
float3 P,
float randu,
float randv,
@@ -280,7 +285,7 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
return zero_float3();
}
-ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
+ccl_device_inline float3 background_sun_sample(const KernelGlobals *kg,
float randu,
float randv,
float *pdf)
@@ -292,7 +297,7 @@ ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
return D;
}
-ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
+ccl_device_inline float background_sun_pdf(const KernelGlobals *kg, float3 D)
{
const float3 N = float4_to_float3(kernel_data.background.sun);
const float angle = kernel_data.background.sun.w;
@@ -300,7 +305,7 @@ ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
}
ccl_device_inline float3
-background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+background_light_sample(const KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
@@ -400,7 +405,7 @@ background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, f
return D;
}
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+ccl_device float background_light_pdf(const KernelGlobals *kg, float3 P, float3 direction)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h
index 4a683d36226..765d8f5338e 100644
--- a/intern/cycles/kernel/kernel_light_common.h
+++ b/intern/cycles/kernel/kernel_light_common.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Area light sampling */
@@ -210,7 +214,7 @@ ccl_device bool light_spread_clamp_area_light(const float3 P,
return true;
}
-ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+ccl_device float lamp_light_pdf(const KernelGlobals *kg, const float3 Ng, const float3 I, float t)
{
float cos_pi = dot(Ng, I);
diff --git a/intern/cycles/kernel/kernel_lookup_table.h b/intern/cycles/kernel/kernel_lookup_table.h
new file mode 100644
index 00000000000..33d9d5ae1f0
--- /dev/null
+++ b/intern/cycles/kernel/kernel_lookup_table.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Interpolated lookup table access */
+
+ccl_device float lookup_table_read(const KernelGlobals *kg, float x, int offset, int size)
+{
+ x = saturate(x) * (size - 1);
+
+ int index = min(float_to_int(x), size - 1);
+ int nindex = min(index + 1, size - 1);
+ float t = x - index;
+
+ float data0 = kernel_tex_fetch(__lookup_table, index + offset);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+ccl_device float lookup_table_read_2D(
+ const KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
+{
+ y = saturate(y) * (ysize - 1);
+
+ int index = min(float_to_int(y), ysize - 1);
+ int nindex = min(index + 1, ysize - 1);
+ float t = y - index;
+
+ float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+CCL_NAMESPACE_END
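A host-side sketch of the same 1D interpolation (plain C++ with a local array instead of kernel_tex_fetch(); the example table is made up): x is scaled to the table range and the two neighbouring entries are blended linearly, which is exactly what lookup_table_read() does.

#include <algorithm>

static float lookup_table_read_host(const float *table, float x, const int size)
{
  x = std::clamp(x, 0.0f, 1.0f) * (size - 1);

  const int index = std::min(static_cast<int>(x), size - 1);
  const int nindex = std::min(index + 1, size - 1);
  const float t = x - index;

  return (1.0f - t) * table[index] + t * table[nindex];
}

/* Usage: with table = {0, 1, 4, 9} and x = 0.5, the scaled coordinate is 1.5,
 * so the result is halfway between entries 1 and 2, i.e. 2.5. */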
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 96391db7649..3c5ab95bbc8 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_MATH_H__
-#define __KERNEL_MATH_H__
+#pragma once
#include "util/util_color.h"
#include "util/util_math.h"
@@ -24,5 +23,3 @@
#include "util/util_projection.h"
#include "util/util_texture.h"
#include "util/util_transform.h"
-
-#endif /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index ce37bd0b15e..b158f4c4fd3 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_MONTECARLO_CL__
-#define __KERNEL_MONTECARLO_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -300,5 +299,3 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_MONTECARLO_CL__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 8f58b8c3079..67466b28170 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,61 +14,52 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/geom/geom.h"
+
#include "kernel/kernel_id_passes.h"
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-#ifdef __DENOISING_FEATURES__
-
-ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- float path_total,
- float path_total_shaded)
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
{
- if (kernel_data.film.pass_denoising_data == 0)
- return;
-
- buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
- DENOISING_PASS_SHADOW_B :
- DENOISING_PASS_SHADOW_A;
-
- path_total = ensure_finite(path_total);
- path_total_shaded = ensure_finite(path_total_shaded);
-
- kernel_write_pass_float(buffer, path_total);
- kernel_write_pass_float(buffer + 1, path_total_shaded);
-
- float value = path_total_shaded / max(path_total, 1e-7f);
- kernel_write_pass_float(buffer + 2, value * value);
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
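The offset arithmetic can be checked with a tiny sketch (plain C++; the stride value in the comment is hypothetical): each pixel owns pass_stride consecutive floats, so its slice of the render buffer starts at render_pixel_index * pass_stride, computed in 64 bits to avoid overflow on large images.

#include <cstdint>

static float *pixel_render_buffer(float *render_buffer,
                                  const uint32_t render_pixel_index,
                                  const uint32_t pass_stride)
{
  const uint64_t offset = static_cast<uint64_t>(render_pixel_index) * pass_stride;
  return render_buffer + offset;
}

/* Example: with pass_stride = 32, pixel 10 starts at float index 320. */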
-ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- PathRadiance *L)
+#ifdef __DENOISING_FEATURES__
+
+ccl_device_forceinline void kernel_write_denoising_features_surface(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (state->denoising_feature_weight == 0.0f) {
+ if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DENOISING_FEATURES)) {
return;
}
- L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
-
/* Skip implicitly transparent surfaces. */
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return;
}
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
float3 normal = zero_float3();
float3 diffuse_albedo = zero_float3();
float3 specular_albedo = zero_float3();
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
- if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
continue;
+ }
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
@@ -106,140 +97,208 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
normal /= sum_weight;
}
- /* Transform normal into camera space. */
- const Transform worldtocamera = kernel_data.cam.worldtocamera;
- normal = transform_direction(&worldtocamera, normal);
+ if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Transform normal into camera space. */
+ const Transform worldtocamera = kernel_data.cam.worldtocamera;
+ normal = transform_direction(&worldtocamera, normal);
+
+ const float3 denoising_normal = ensure_finite3(normal);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
- L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
- L->denoising_albedo += ensure_finite3(state->denoising_feature_weight *
- state->denoising_feature_throughput * diffuse_albedo);
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput *
+ diffuse_albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
- state->denoising_feature_weight = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
}
else {
- state->denoising_feature_throughput *= specular_albedo;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) *= specular_albedo;
+ }
+}
+
+ccl_device_forceinline void kernel_write_denoising_features_volume(INTEGRATOR_STATE_ARGS,
+ const float3 albedo,
+ const bool scatter,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, denoising_feature_throughput);
+
+ if (scatter && kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Assume scatter is sufficiently diffuse to stop writing denoising features. */
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+
+ /* Write view direction as normal. */
+ const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
+
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ /* Write albedo. */
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
}
}
#endif /* __DENOISING_FEATURES__ */
-#ifdef __KERNEL_CPU__
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name)
-ccl_device_inline size_t kernel_write_id_pass_cpu(
- float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map)
+#ifdef __SHADOW_CATCHER__
+
+/* Write shadow catcher passes on a bounce from the shadow catcher object. */
+ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (map) {
- (*map)[id] += matte_weight;
- return 0;
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, sd->object_flag)) {
+ return;
}
-#else /* __KERNEL_CPU__ */
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight)
-ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer,
- size_t depth,
- float id,
- float matte_weight)
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
+ /* Count sample for the shadow catcher object. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
+
+ /* Since the split is done, the sample does not contribute to the matte, so accumulate it as
+ * transparency to the matte. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
+ average(throughput));
+}
+
+#endif /* __SHADOW_CATCHER__ */
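To make the accumulation above concrete, with illustrative numbers only: the + 3 offset addresses the fourth component of the matte pass, which the comment treats as its transparency channel. If the path throughput at the split bounce averages 0.5, then 0.5 is added there, recording that half of this sample bypasses the matte, while the sample-count pass gets its usual increment of 1.0.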
+
+ccl_device_inline size_t kernel_write_id_pass(float *ccl_restrict buffer,
+ size_t depth,
+ float id,
+ float matte_weight)
{
-#endif /* __KERNEL_CPU__ */
- kernel_write_id_slots(buffer, depth, id, matte_weight);
- return depth * 2;
+ kernel_write_id_slots(buffer, depth * 2, id, matte_weight);
+ return depth * 4;
}
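A minimal sketch of the stride arithmetic above, assuming each cryptomatte slot stores an (id, weight) pair of floats; the helper name below is hypothetical and not part of the kernel:

/* Illustration only: depth levels -> depth * 2 (id, weight) slots -> depth * 4 floats. */
static size_t cryptomatte_pass_stride(size_t depth)
{
  const size_t slots = depth * 2; /* two slots available per depth level */
  return slots * 2;               /* each slot holds an id and a weight */
}

This matches kernel_write_id_pass advancing the cryptomatte buffer by depth * 4 floats per pass.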
-ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device_inline void kernel_write_data_passes(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
{
#ifdef __PASSES__
- int path_flag = state->flag;
+ const int path_flag = INTEGRATOR_STATE(path, flag);
- if (!(path_flag & PATH_RAY_CAMERA))
+ if (!(path_flag & PATH_RAY_CAMERA)) {
return;
+ }
- int flag = kernel_data.film.pass_flag;
- int light_flag = kernel_data.film.light_pass_flag;
+ const int flag = kernel_data.film.pass_flag;
- if (!((flag | light_flag) & PASS_ANY))
+ if (!(flag & PASS_ANY)) {
return;
+ }
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f ||
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
- if (state->sample == 0) {
+ if (INTEGRATOR_STATE(path, sample) == 0) {
if (flag & PASSMASK(DEPTH)) {
- float depth = camera_z_depth(kg, sd->P);
+ const float depth = camera_z_depth(kg, sd->P);
kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
}
if (flag & PASSMASK(OBJECT_ID)) {
- float id = object_pass_id(kg, sd->object);
+ const float id = object_pass_id(kg, sd->object);
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
}
if (flag & PASSMASK(MATERIAL_ID)) {
- float id = shader_pass_id(kg, sd);
+ const float id = shader_pass_id(kg, sd);
kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
}
}
+ if (flag & PASSMASK(POSITION)) {
+ const float3 position = sd->P;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position);
+ }
if (flag & PASSMASK(NORMAL)) {
- float3 normal = shader_bsdf_average_normal(kg, sd);
+ const float3 normal = shader_bsdf_average_normal(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
}
+ if (flag & PASSMASK(ROUGHNESS)) {
+ const float roughness = shader_bsdf_average_roughness(sd);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness);
+ }
if (flag & PASSMASK(UV)) {
- float3 uv = primitive_uv(kg, sd);
+ const float3 uv = primitive_uv(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
}
if (flag & PASSMASK(MOTION)) {
- float4 speed = primitive_motion_vector(kg, sd);
+ const float4 speed = primitive_motion_vector(kg, sd);
kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
}
- state->flag |= PATH_RAY_SINGLE_PASS_DONE;
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SINGLE_PASS_DONE;
}
}
if (kernel_data.film.cryptomatte_passes) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
const float matte_weight = average(throughput) *
(1.0f - average(shader_bsdf_transparency(kg, sd)));
if (matte_weight > 0.0f) {
ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- float id = object_cryptomatte_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object);
+ const float id = object_cryptomatte_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- float id = shader_cryptomatte_id(kg, sd->shader);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material);
+ const float id = shader_cryptomatte_id(kg, sd->shader);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- float id = object_cryptomatte_asset_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset);
+ const float id = object_cryptomatte_asset_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
}
}
- if (light_flag & PASSMASK_COMPONENT(DIFFUSE))
- L->color_diffuse += shader_bsdf_diffuse(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(GLOSSY))
- L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(TRANSMISSION))
- L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput;
-
- if (light_flag & PASSMASK(MIST)) {
- /* bring depth into 0..1 range */
- float mist_start = kernel_data.film.mist_start;
- float mist_inv_depth = kernel_data.film.mist_inv_depth;
+ if (flag & PASSMASK(DIFFUSE_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color,
+ shader_bsdf_diffuse(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(GLOSSY_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color,
+ shader_bsdf_glossy(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(TRANSMISSION_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
+ shader_bsdf_transmission(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(MIST)) {
+ /* Bring depth into 0..1 range. */
+ const float mist_start = kernel_data.film.mist_start;
+ const float mist_inv_depth = kernel_data.film.mist_inv_depth;
- float depth = camera_distance(kg, sd->P);
+ const float depth = camera_distance(kg, sd->P);
float mist = saturate((depth - mist_start) * mist_inv_depth);
- /* falloff */
- float mist_falloff = kernel_data.film.mist_falloff;
+ /* Falloff */
+ const float mist_falloff = kernel_data.film.mist_falloff;
if (mist_falloff == 1.0f)
;
@@ -250,158 +309,17 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
else
mist = powf(mist, mist_falloff);
- /* modulate by transparency */
- float3 alpha = shader_bsdf_alpha(kg, sd);
- L->mist += (1.0f - mist) * average(throughput * alpha);
- }
-#endif
-}
+ /* Modulate by transparency */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float3 alpha = shader_bsdf_alpha(kg, sd);
+ const float mist_output = (1.0f - mist) * average(throughput * alpha);
-ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L)
-{
-#ifdef __PASSES__
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (!kernel_data.film.use_light_pass)
- return;
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse);
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect,
- L->indirect_transmission);
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume);
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct,
- L->direct_transmission);
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume);
-
- if (light_flag & PASSMASK(EMISSION))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
- if (light_flag & PASSMASK(BACKGROUND))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background);
- if (light_flag & PASSMASK(AO))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao);
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse);
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
- L->color_transmission);
- if (light_flag & PASSMASK(SHADOW)) {
- float3 shadow = L->shadow;
- kernel_write_pass_float4(
- buffer + kernel_data.film.pass_shadow,
- make_float4(shadow.x, shadow.y, shadow.z, kernel_data.film.pass_shadow_scale));
+ /* Note that the final value we want in the render buffer is 1 - mist_output;
+ * to avoid having to track this in the integrator state, the negation is done
+ * after rendering. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output);
}
- if (light_flag & PASSMASK(MIST))
- kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist);
#endif
}
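A worked example of the mist math above, with made-up film settings: with mist_start = 5, a mist depth of 20 (so mist_inv_depth = 0.05) and mist_falloff = 2, a surface at camera distance 15 gives mist = saturate((15 - 5) * 0.05) = 0.5, then 0.25 after the squaring falloff; with opaque, unattenuated throughput the buffer receives (1 - 0.25) = 0.75, and the post-render negation turns that into a final mist value of 0.25.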
-ccl_device_inline void kernel_write_result(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_WRITE_RESULT);
- PROFILING_OBJECT(PRIM_NONE);
-
- float alpha;
- float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
-
- if (kernel_data.film.pass_flag & PASSMASK(COMBINED)) {
- kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
- }
-
- kernel_write_light_passes(kg, buffer, L);
-
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
-# ifdef __SHADOW_TRICKS__
- kernel_write_denoising_shadow(kg,
- buffer + kernel_data.film.pass_denoising_data,
- sample,
- average(L->path_total),
- average(L->path_total_shaded));
-# else
- kernel_write_denoising_shadow(
- kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
-# endif
- if (kernel_data.film.pass_denoising_clean) {
- float3 noisy, clean;
- path_radiance_split_denoising(kg, L, &noisy, &clean);
- kernel_write_pass_float3_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, noisy);
- kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, clean);
- }
- else {
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_COLOR,
- ensure_finite3(L_sum));
- }
-
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_NORMAL,
- L->denoising_normal);
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_ALBEDO,
- L->denoising_albedo);
- kernel_write_pass_float_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, L->denoising_depth);
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
- criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
- Carlo global illumination" except that here it is applied per pixel and not in hierarchical
- tiles. */
- if (kernel_data.film.pass_adaptive_aux_buffer &&
- kernel_data.integrator.adaptive_threshold > 0.0f) {
- if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
- make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
- }
-#ifdef __KERNEL_CPU__
- if ((sample > kernel_data.integrator.adaptive_min_samples) &&
- kernel_data.integrator.adaptive_stop_per_sample) {
- const int step = kernel_data.integrator.adaptive_step;
-
- if ((sample & (step - 1)) == (step - 1)) {
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
-#endif
- }
-
- /* Write the sample count as negative numbers initially to mark the samples as in progress.
- * Once the tile has finished rendering, the sign gets flipped and all the pixel values
- * are scaled as if they were taken at a uniform sample count. */
- if (kernel_data.film.pass_sample_count) {
- /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
- * passes. */
-#ifdef __ATOMIC_PASS_WRITE__
- atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
- 0x80000000);
-#else
- if (buffer[kernel_data.film.pass_sample_count] > 0) {
- buffer[kernel_data.film.pass_sample_count] *= -1.0f;
- }
-#endif
- kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
- }
-}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
deleted file mode 100644
index 92a097de9e1..00000000000
--- a/intern/cycles/kernel/kernel_path.h
+++ /dev/null
@@ -1,709 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-// clang-format off
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_passes.h"
-
-#if defined(__VOLUME__) || defined(__SUBSURFACE__)
-# include "kernel/kernel_volume.h"
-#endif
-
-#ifdef __SUBSURFACE__
-# include "kernel/kernel_subsurface.h"
-#endif
-
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- Intersection *isect,
- PathRadiance *L,
- const int last_object)
-{
- PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT);
-
- uint visibility = path_state_ray_visibility(kg, state);
-
- if (path_state_ao_bounce(kg, state)) {
- ray->t = kernel_data.background.ao_distance;
- if (last_object != OBJECT_NONE) {
- const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
- if (object_ao_distance != 0.0f) {
- ray->t = object_ao_distance;
- }
- }
- }
-
- bool hit = scene_intersect(kg, ray, visibility, isect);
-
- return hit;
-}
-
-ccl_device_forceinline void kernel_path_lamp_emission(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 throughput,
- ccl_addr_space Intersection *isect,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION);
-
-#ifdef __LAMP_MIS__
- if (kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray ccl_optional_struct_init;
-
- light_ray.P = ray->P - state->ray_t * ray->D;
- state->ray_t += isect->t;
- light_ray.D = ray->D;
- light_ray.t = state->ray_t;
- light_ray.time = ray->time;
- light_ray.dD = ray->dD;
- light_ray.dP = ray->dP;
-
- /* intersect with lamp */
- indirect_lamp_emission(kg, emission_sd, state, L, &light_ray, throughput);
- }
-#endif /* __LAMP_MIS__ */
-}
-
-ccl_device_forceinline void kernel_path_background(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *sd,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* eval background shader if nothing hit */
- if (kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- L->transparent += average(throughput);
-
-#ifdef __PASSES__
- if (!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND)))
-#endif /* __PASSES__ */
- return;
- }
-
- /* When using the ao bounces approximation, adjust background
- * shader intensity with ao factor. */
- if (path_state_ao_bounce(kg, state)) {
- throughput *= kernel_data.background.ao_bounces_factor;
- }
-
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, sd, state, buffer, ray);
- path_radiance_accum_background(kg, L, state, throughput, L_background);
-#endif /* __BACKGROUND__ */
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_VOLUME);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return VOLUME_PATH_ATTENUATED;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
-# ifdef __VOLUME_DECOUPLED__
- int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
- bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
- bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method);
-
- if (decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- volume_segment.sampling_method = sampling_method;
-
- /* emission */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
-
- /* scattering */
- VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
- if (volume_segment.closure_flag & SD_SCATTER) {
- int all = kernel_data.integrator.sample_all_lights_indirect;
-
- /* direct light sampling */
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect sample. if we use distance sampling and take just
- * one sample for direct and indirect light, we could share
- * this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- result = kernel_volume_decoupled_scatter(
- kg, state, &volume_ray, sd, throughput, rphase, rscatter, &volume_segment, NULL, true);
- }
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
-
- if (result == VOLUME_PATH_SCATTERED) {
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
- else {
- *throughput *= volume_segment.accum_transmittance;
- }
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
-# endif /* __VOLUME__ */
-
-#endif /* __SPLIT_KERNEL__ */
-
-ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_global float *buffer)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_APPLY);
-
-#ifdef __SHADOW_TRICKS__
- if (sd->object_flag & SD_OBJECT_SHADOW_CATCHER) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_STORE_SHADOW_INFO);
-
- float3 bg = zero_float3();
- if (!kernel_data.background.transparent) {
- bg = indirect_background(kg, emission_sd, state, NULL, ray);
- }
- path_radiance_accum_shadowcatcher(L, throughput, bg);
- }
- }
- else if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- /* Only update transparency after shadow catcher bounce. */
- L->shadow_transparency *= average(shader_bsdf_transparency(kg, sd));
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* holdout */
-#ifdef __HOLDOUT__
- if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
- (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- const float3 holdout_weight = shader_holdout_apply(kg, sd);
- if (kernel_data.background.transparent) {
- L->transparent += average(holdout_weight * throughput);
- }
- if (isequal_float3(holdout_weight, one_float3())) {
- return false;
- }
- }
-#endif /* __HOLDOUT__ */
-
- /* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, L, sd, state, throughput);
-
- /* blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy) */
- if (kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy * state->min_ray_pdf;
-
- if (blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if (sd->flag & SD_EMISSION) {
- float3 emission = indirect_primitive_emission(
- kg, sd, sd->ray_length, state->flag, state->ray_pdf);
- path_radiance_accum_emission(kg, L, state, throughput, emission);
- }
-#endif /* __EMISSION__ */
-
- return true;
-}
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- void
- kernel_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 ao_alpha)
-{
- PROFILING_INIT(kg, PROFILING_AO);
-
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
-
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(kg, L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
- }
- }
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# if defined(__BRANCHED_PATH__) || defined(__BAKING__)
-
-ccl_device void kernel_path_indirect(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- Ray *ray,
- float3 throughput,
- PathState *state,
- PathRadiance *L,
- const int last_object)
-{
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, last_object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, sd, NULL, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- shader_prepare_closures(sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, NULL)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, zero_float3());
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd->flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# if defined(__EMISSION__)
- int all = (kernel_data.integrator.sample_all_lights_indirect) ||
- (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# endif /* defined(__EMISSION__) */
-
-# ifdef __VOLUME__
- }
-# endif
-
- if (!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-# endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
-
-ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
- PathState *state,
- float3 throughput,
- Ray *ray,
- PathRadiance *L,
- ccl_global float *buffer,
- ShaderData *emission_sd)
-{
- PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE);
-
- /* Shader data memory used for both volumes and surfaces, saves stack space. */
- ShaderData sd;
-
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, sd.object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, &sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, &sd, buffer, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, &sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, &sd, state, buffer, state->flag);
- shader_prepare_closures(&sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, state, ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd.flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, &sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# ifdef __EMISSION__
- /* direct lighting */
- kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
-# endif /* __EMISSION__ */
-
-# ifdef __VOLUME__
- }
-# endif
-
- /* compute direct lighting and next bounce */
- if (!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-ccl_device void kernel_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- PROFILING_INIT(kg, PROFILING_RAY_SETUP);
-
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* Initialize random numbers and sample ray. */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- if (ray.t == 0.0f) {
- return;
- }
-
- /* Initialize state. */
- float3 throughput = one_float3();
-
- PathRadiance L;
- path_radiance_init(kg, &L);
-
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
-# ifdef __KERNEL_OPTIX__
- /* Force struct into local memory to avoid costly spilling on trace calls. */
- if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */
- for (int i = 0; i < sizeof(L); ++i)
- reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0;
-# endif
-
- /* Integrate. */
- kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd);
-
- kernel_write_result(kg, buffer, sample, &L);
-}
-
-#endif /* __SPLIT_KERNEL__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
deleted file mode 100644
index a1ee1bc107e..00000000000
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
-{
- int num_samples = kernel_data.integrator.ao_samples;
- float num_samples_inv = 1.0f / num_samples;
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
- for (int j = 0; j < num_samples; j++) {
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(
- kg, L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput * num_samples_inv, ao_bsdf);
- }
- }
- }
-}
-
-# ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- const int object = sd->object;
-
-# ifdef __VOLUME_DECOUPLED__
- /* decoupled ray marching only supported on CPU */
- if (kernel_data.integrator.volume_decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- /* direct light sampling */
- if (volume_segment.closure_flag & SD_SCATTER) {
- volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
-
- int all = kernel_data.integrator.sample_all_lights_direct;
-
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect light sampling */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = *throughput;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- /* scatter sample. if we use distance sampling and take just one
- * sample for direct and indirect light, we could share this
- * computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(
- kg, &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
- if (result == VOLUME_PATH_SCATTERED &&
- kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &pray, tp * num_samples_inv, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
- }
-
- /* emission and transmittance */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
- *throughput *= volume_segment.accum_transmittance;
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- /* todo: we should cache the shader evaluations from stepping
- * through the volume, for now we redo them multiple times */
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = (*throughput) * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, sd, &volume_ray, L, &tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: support equiangular, MIS and all light sampling.
- * alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
-
- if (kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(kg, indirect_sd, emission_sd, &pray, tp, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- /* todo: avoid this calculation using decoupled ray marching */
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
- }
-}
-# endif /* __VOLUME__ */
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- float3 throughput,
- float num_samples_adjust,
- PathState *state,
- PathRadiance *L)
-{
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (state->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- float3 tp = throughput;
- Ray bsdf_ray;
-# ifdef __SHADOW_TRICKS__
- float shadow_transparency = L->shadow_transparency;
-# endif
-
- ps.rng_hash = cmj_hash(state->rng_hash, i);
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, &tp, &ps, &L->state, &bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps.rng_hash = state->rng_hash;
-
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &bsdf_ray, tp * num_samples_inv, &ps, L, sd->object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
-# ifdef __SHADOW_TRICKS__
- L->shadow_transparency = shadow_transparency;
-# endif
- }
- }
-}
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- PathState *state,
- Ray *ray,
- float3 throughput)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* set up random number generator */
- uint lcg_state = lcg_state_init(state, 0x68bc21eb);
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for (int j = 0; j < num_samples; j++) {
- PathState hit_state = *state;
- path_state_branch(&hit_state, j, num_samples);
- hit_state.rng_hash = bssrdf_rng_hash;
-
- LocalIntersection ss_isect;
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, &hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- hit_state.rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = *ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- ShaderData bssrdf_sd = *sd;
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
- subsurface_scatter_multi_setup(
- kg, &ss_isect, hit, &bssrdf_sd, &hit_state, bssrdf_type, bssrdf_roughness);
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state.volume_stack[k] = state->volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state.volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &bssrdf_sd, emission_sd, &hit_state, throughput, num_samples_inv, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &bssrdf_sd, indirect_sd, emission_sd, throughput, num_samples_inv, &hit_state, L);
- }
- }
- }
-}
-# endif /* __SUBSURFACE__ */
-
-ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- Ray ray,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* initialize */
- float3 throughput = one_float3();
-
- path_radiance_init(kg, L);
-
- /* shader data memory used for both volumes and surfaces, saves stack space */
- ShaderData sd;
- /* shader data used by emission, shadows, volume stacks, indirect path */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
- /* Main Loop
- * Here we only handle transparency intersections from the camera ray.
- * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
- */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L, sd.object);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- kernel_branched_path_volume(
- kg, &sd, &state, &ray, &throughput, &isect, hit, &indirect_sd, emission_sd, L);
-# endif /* __VOLUME__ */
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, &state, &ray, throughput, &sd, buffer, L);
- break;
- }
-
- /* Setup and evaluate shader. */
- shader_setup_from_ray(kg, &sd, &isect, &ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- shader_eval_surface(kg, &sd, &state, buffer, state.flag);
- shader_merge_closures(&sd);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, &state, &ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* transparency termination */
- if (state.flag & PATH_RAY_TRANSPARENT) {
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, &state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, &state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput);
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object */
- if (sd.flag & SD_BSSRDF) {
- kernel_branched_path_subsurface_scatter(
- kg, &sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif /* __SUBSURFACE__ */
-
- PathState hit_state = state;
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &sd, emission_sd, &hit_state, throughput, 1.0f, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L);
-
- /* continue in case of transparency */
- throughput *= shader_bsdf_transparency(kg, &sd);
-
- if (is_zero(throughput))
- break;
-
- /* Update Path State */
- path_state_next(kg, &state, LABEL_TRANSPARENT);
-
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, &state)) {
- break;
- }
- }
-# endif
-
- ray.P = ray_offset(sd.P, -sd.Ng);
- ray.t -= sd.ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
- ray.dD.dx = -sd.dI.dx;
- ray.dD.dy = -sd.dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-# endif /* __VOLUME__ */
- }
-}
-
-ccl_device void kernel_branched_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* initialize random numbers and ray */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- /* integrate */
- PathRadiance L;
-
- if (ray.t != 0.0f) {
- kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
- kernel_write_result(kg, buffer, sample, &L);
- }
-}
-
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
deleted file mode 100644
index 815767595a9..00000000000
--- a/intern/cycles/kernel/kernel_path_common.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_hash.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_path_trace_setup(
- KernelGlobals *kg, int sample, int x, int y, uint *rng_hash, ccl_addr_space Ray *ray)
-{
- float filter_u;
- float filter_v;
-
- int num_samples = kernel_data.integrator.aa_samples;
-
- path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
-
- /* sample camera ray */
-
- float lens_u = 0.0f, lens_v = 0.0f;
-
- if (kernel_data.cam.aperturesize > 0.0f)
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
- float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
- if (kernel_data.cam.shuttertime != -1.0f)
- time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
-#endif
-
- camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index bf601580cd0..ebb2c0df4f1 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -14,99 +14,116 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
+#pragma once
-ccl_device_inline void path_state_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space PathState *state,
- uint rng_hash,
- int sample,
- ccl_addr_space Ray *ray)
-{
- state->flag = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | PATH_RAY_TRANSPARENT_BACKGROUND;
+#include "kernel_random.h"
- state->rng_hash = rng_hash;
- state->rng_offset = PRNG_BASE_NUM;
- state->sample = sample;
- state->num_samples = kernel_data.integrator.aa_samples;
- state->branch_factor = 1.0f;
+CCL_NAMESPACE_BEGIN
- state->bounce = 0;
- state->diffuse_bounce = 0;
- state->glossy_bounce = 0;
- state->transmission_bounce = 0;
- state->transparent_bounce = 0;
+/* Initialize queues, so that this path is considered terminated.
+ * Used for early outputs in the camera ray initialization, as well as for the initialization of
+ * split states for the shadow catcher. */
+ccl_device_inline void path_state_init_queues(INTEGRATOR_STATE_ARGS)
+{
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
- state->flag |= PATH_RAY_STORE_SHADOW_INFO;
- state->denoising_feature_weight = 1.0f;
- state->denoising_feature_throughput = one_float3();
- }
- else {
- state->denoising_feature_weight = 0.0f;
- state->denoising_feature_throughput = zero_float3();
- }
-#endif /* __DENOISING_FEATURES__ */
+/* Minimal initialization of the path state, needed so that the early outputs in the integrator
+ * initialization work. */
+ccl_device_inline void path_state_init(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ const int x,
+ const int y)
+{
+ const uint render_pixel_index = (uint)tile->offset + x + y * tile->stride;
- state->min_ray_pdf = FLT_MAX;
- state->ray_pdf = 0.0f;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
+ INTEGRATOR_STATE_WRITE(path, render_pixel_index) = render_pixel_index;
-#ifdef __VOLUME__
- state->volume_bounce = 0;
- state->volume_bounds_bounce = 0;
+ path_state_init_queues(INTEGRATOR_STATE_PASS);
+}
- if (kernel_data.integrator.use_volumes) {
- /* Initialize volume stack with volume we are inside of. */
- kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
+/* Initialize the rest of the path state needed to continue the path integration. */
+ccl_device_inline void path_state_init_integrator(INTEGRATOR_STATE_ARGS,
+ const int sample,
+ const uint rng_hash)
+{
+ INTEGRATOR_STATE_WRITE(path, sample) = sample;
+ INTEGRATOR_STATE_WRITE(path, bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, rng_hash) = rng_hash;
+ INTEGRATOR_STATE_WRITE(path, rng_offset) = PRNG_BASE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
+ PATH_RAY_TRANSPARENT_BACKGROUND;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = kernel_data.background.volume_shader;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
}
- else {
- state->volume_stack[0].shader = SHADER_NONE;
+
+#ifdef __DENOISING_FEATURES__
+ if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_DENOISING_FEATURES;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) = one_float3();
}
#endif
}
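A hypothetical call-order sketch, not part of the patch, of how the two-stage initialization above would be used by a camera-ray kernel: the minimal init runs first so early-out paths are already marked terminated, and the full init only runs once a sample is actually generated.

/* Sketch only: INTEGRATOR_STATE_PASS, tile, x, y, sample and rng_hash are assumed to be
 * provided by the surrounding camera-ray kernel; sample_is_valid is a hypothetical condition. */
path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
if (!sample_is_valid) {
  return; /* queued_kernel is zero, so the path already counts as terminated */
}
path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);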
-ccl_device_inline void path_state_next(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- int label)
+ccl_device_inline void path_state_next(INTEGRATOR_STATE_ARGS, int label)
{
+ uint32_t flag = INTEGRATOR_STATE(path, flag);
+
/* ray through transparent keeps same flags from previous ray and is
* not counted as a regular bounce, transparent has separate max */
if (label & LABEL_TRANSPARENT) {
- state->flag |= PATH_RAY_TRANSPARENT;
- state->transparent_bounce++;
- if (state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_IMMEDIATE;
+ uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
+
+ flag |= PATH_RAY_TRANSPARENT;
+ if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+ flag |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
}
if (!kernel_data.integrator.transparent_shadows)
- state->flag |= PATH_RAY_MIS_SKIP;
-
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ flag |= PATH_RAY_MIS_SKIP;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
return;
}
- state->bounce++;
- if (state->bounce >= kernel_data.integrator.max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ uint32_t bounce = INTEGRATOR_STATE(path, bounce) + 1;
+ if (bounce >= kernel_data.integrator.max_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
- state->flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
+ flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
#ifdef __VOLUME__
if (label & LABEL_VOLUME_SCATTER) {
/* volume scatter */
- state->flag |= PATH_RAY_VOLUME_SCATTER;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_VOLUME_SCATTER;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ if (bounce == 1) {
+ flag |= PATH_RAY_VOLUME_PASS;
+ }
- state->volume_bounce++;
- if (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int volume_bounce = INTEGRATOR_STATE(path, volume_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = volume_bounce;
+ if (volume_bounce >= kernel_data.integrator.max_volume_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else
@@ -114,163 +131,237 @@ ccl_device_inline void path_state_next(KernelGlobals *kg,
{
/* surface reflection/transmission */
if (label & LABEL_REFLECT) {
- state->flag |= PATH_RAY_REFLECT;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_REFLECT;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
if (label & LABEL_DIFFUSE) {
- state->diffuse_bounce++;
- if (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int diffuse_bounce = INTEGRATOR_STATE(path, diffuse_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = diffuse_bounce;
+ if (diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else {
- state->glossy_bounce++;
- if (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int glossy_bounce = INTEGRATOR_STATE(path, glossy_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = glossy_bounce;
+ if (glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
}
else {
kernel_assert(label & LABEL_TRANSMIT);
- state->flag |= PATH_RAY_TRANSMIT;
+ flag |= PATH_RAY_TRANSMIT;
if (!(label & LABEL_TRANSMIT_TRANSPARENT)) {
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
}
- state->transmission_bounce++;
- if (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int transmission_bounce = INTEGRATOR_STATE(path, transmission_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = transmission_bounce;
+ if (transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
/* diffuse/glossy/singular */
if (label & LABEL_DIFFUSE) {
- state->flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
+ flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
}
else if (label & LABEL_GLOSSY) {
- state->flag |= PATH_RAY_GLOSSY;
+ flag |= PATH_RAY_GLOSSY;
}
else {
kernel_assert(label & LABEL_SINGULAR);
- state->flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ }
+
+ /* Render pass categories. */
+ if (bounce == 1) {
+ flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
}
}
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, bounce) = bounce;
-#ifdef __DENOISING_FEATURES__
- if ((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
- state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
- }
-#endif
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
#ifdef __VOLUME__
-ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state)
+ccl_device_inline bool path_state_volume_next(INTEGRATOR_STATE_ARGS)
{
/* For volume bounding meshes we pass through without counting transparent
* bounces, only sanity check in case self intersection gets us stuck. */
- state->volume_bounds_bounce++;
- if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
+ uint32_t volume_bounds_bounce = INTEGRATOR_STATE(path, volume_bounds_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = volume_bounds_bounce;
+ if (volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
return false;
}
/* Random number generator next bounce. */
- if (state->volume_bounds_bounce > 1) {
- state->rng_offset += PRNG_BOUNCE_NUM;
+ if (volume_bounds_bounce > 1) {
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
return true;
}
#endif
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg,
- ccl_addr_space PathState *state)
+ccl_device_inline uint path_state_ray_visibility(INTEGRATOR_STATE_CONST_ARGS)
{
- uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
- /* for visibility, diffuse/glossy are for reflection only */
- if (flag & PATH_RAY_TRANSMIT)
- flag &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
- /* todo: this is not supported as its own ray visibility yet */
- if (state->flag & PATH_RAY_VOLUME_SCATTER)
- flag |= PATH_RAY_DIFFUSE;
+ uint32_t visibility = path_flag & PATH_RAY_ALL_VISIBILITY;
- return flag;
+ /* For visibility, diffuse/glossy are for reflection only. */
+ if (visibility & PATH_RAY_TRANSMIT) {
+ visibility &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
+ }
+
+ /* todo: this is not supported as its own ray visibility yet. */
+ if (path_flag & PATH_RAY_VOLUME_SCATTER) {
+ visibility |= PATH_RAY_DIFFUSE;
+ }
+
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+
+ return visibility;
}
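
A brief usage sketch (not part of this patch): the mask above is meant to be computed right before the next scene intersection and passed as the ray visibility; the surrounding kernel and the intersection call itself are assumed and elided here.

  /* Compute visibility for the next intersection; e.g. a singular bounce keeps
   * PATH_RAY_GLOSSY set so glossy-only visibility settings apply to it. */
  const uint visibility = path_state_ray_visibility(INTEGRATOR_STATE_PASS);
  /* ... pass 'visibility' to the BVH/scene intersection call ... */
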
-ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(INTEGRATOR_STATE_CONST_ARGS,
+ const uint32_t path_flag)
{
- if (state->flag & PATH_RAY_TERMINATE_IMMEDIATE) {
- /* Ray is to be terminated immediately. */
- return 0.0f;
- }
- else if (state->flag & PATH_RAY_TRANSPARENT) {
+ if (path_flag & PATH_RAY_TRANSPARENT) {
+ const uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
/* Do at least specified number of bounces without RR. */
- if (state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
- return 1.0f;
- }
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+ if (transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
return 1.0f;
}
-#endif
}
else {
+ const uint32_t bounce = INTEGRATOR_STATE(path, bounce);
/* Do at least specified number of bounces without RR. */
- if (state->bounce <= kernel_data.integrator.min_bounce) {
+ if (bounce <= kernel_data.integrator.min_bounce) {
return 1.0f;
}
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
- return 1.0f;
- }
-#endif
}
/* Probabilistic termination: use sqrt() to roughly match typical view
* transform and do path termination a bit later on average. */
- return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
+ return min(sqrtf(max3(fabs(INTEGRATOR_STATE(path, throughput)))), 1.0f);
}
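
For context, a minimal Russian-roulette sketch using the probability above; the early-return style and the surrounding rng_state and path_flag variables are assumptions about the calling kernel, not part of this patch.

  const float probability = path_state_continuation_probability(INTEGRATOR_STATE_PASS, path_flag);
  if (probability == 0.0f) {
    return false; /* Terminate the path. */
  }
  else if (probability != 1.0f) {
    /* Use the termination dimension of this bounce to decide survival. */
    const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
    if (terminate >= probability) {
      return false; /* Terminate the path. */
    }
    /* Keep the estimator unbiased by boosting the surviving path. */
    const float3 throughput = INTEGRATOR_STATE(path, throughput);
    INTEGRATOR_STATE_WRITE(path, throughput) = throughput / probability;
  }
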
-/* TODO(DingTo): Find more meaningful name for this */
-ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, bool increase)
+ccl_device_inline bool path_state_ao_bounce(INTEGRATOR_STATE_CONST_ARGS)
{
- /* Modify bounce temporarily for shader eval */
- if (increase)
- state->bounce += 1;
- else
- state->bounce -= 1;
-}
-
-ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
-{
- if (state->bounce <= kernel_data.integrator.ao_bounces) {
+ if (!kernel_data.integrator.ao_bounces) {
return false;
}
- int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+ const int bounce = INTEGRATOR_STATE(path, bounce) - INTEGRATOR_STATE(path, transmission_bounce) -
+ (INTEGRATOR_STATE(path, glossy_bounce) > 0) + 1;
return (bounce > kernel_data.integrator.ao_bounces);
}
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
- int branch,
- int num_branches)
+/* Random Number Sampling Utility Functions
+ *
+ * For each random number in each step of the path we must have a unique
+ * dimension to avoid using the same sequence twice.
+ *
+ * For branches in the path we must be careful not to reuse the same number
+ * in a sequence, offsetting accordingly.
+ */
+
+/* RNG State loaded onto stack. */
+typedef struct RNGState {
+ uint rng_hash;
+ uint rng_offset;
+ int sample;
+} RNGState;
+
+ccl_device_inline void path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset);
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline void shadow_path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ const uint shadow_bounces = INTEGRATOR_STATE(shadow_path, transparent_bounce) -
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset) + PRNG_BOUNCE_NUM * shadow_bounces;
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline float path_state_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int dimension)
+{
+ return path_rng_1D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_state_rng_2D(
+ const KernelGlobals *kg, const RNGState *rng_state, int dimension, float *fx, float *fy)
+{
+ path_rng_2D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy);
+}
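
A small usage sketch (not part of this patch) for the two helpers above, assuming a surface-shading kernel where INTEGRATOR_STATE_ARGS is available; PRNG_BSDF_U is the dimension the previous path kernels used for BSDF sampling.

  RNGState rng_state;
  path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);

  float bsdf_u, bsdf_v;
  path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
  /* (bsdf_u, bsdf_v) come from a unique dimension pair for this bounce. */
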
+
+ccl_device_inline float path_state_rng_1D_hash(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ uint hash)
+{
+  /* Use a hash instead of a dimension; this is not great, but it avoids adding
+   * more dimensions to each bounce, which would reduce the quality of the
+   * dimensions we are already using. */
+ return path_rng_1D(
+ kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset);
+}
+
+ccl_device_inline float path_branched_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension)
+{
+ return path_rng_1D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_branched_rng_2D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension,
+ float *fx,
+ float *fy)
+{
+ path_rng_2D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension,
+ fx,
+ fy);
+}
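
To illustrate the branched variants, the sketch below (not part of this patch) draws decorrelated 2D samples for a hypothetical loop over num_samples light samples at one path vertex; the loop and num_samples are assumptions.

  for (int j = 0; j < num_samples; j++) {
    float light_u, light_v;
    path_branched_rng_2D(kg, &rng_state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
    /* Each branch j draws sample * num_samples + j of the sequence, so no
     * branch repeats another branch's random numbers. */
  }
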
+
+/* Utility function to get the light termination value,
+ * since it might not be needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(const KernelGlobals *kg,
+ const RNGState *state)
{
- if (num_branches > 1) {
- /* Path is splitting into a branch, adjust so that each branch
- * still gets a unique sample from the same sequence. */
- state->sample = state->sample * num_branches + branch;
- state->num_samples = state->num_samples * num_branches;
- state->branch_factor *= num_branches;
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
}
+ return 0.0f;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
deleted file mode 100644
index 97d3f292ca3..00000000000
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- bool
- kernel_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- ccl_addr_space float3 *throughput,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- PROFILING_INIT(kg, PROFILING_SUBSURFACE);
-
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
-
- /* do bssrdf scatter step if we picked a bssrdf closure */
- if (sc) {
- /* We should never have two consecutive BSSRDF bounces,
- * the second one should be converted to a diffuse BSDF to
- * avoid this.
- */
- kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
-
- uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
-
- LocalIntersection ss_isect;
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, state, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-# ifdef __VOLUME__
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- /* NOTE: We reuse the existing ShaderData, we assume the path
- * integration loop stops when this function returns true.
- */
- subsurface_scatter_multi_setup(kg, &ss_isect, hit, sd, state, bssrdf_type, bssrdf_roughness);
-
- kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
- ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
- ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
- PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
-
- *hit_state = *state;
- *hit_ray = *ray;
- *hit_tp = *throughput;
- *hit_L_state = L->state;
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
- if (kernel_path_surface_bounce(kg, sd, hit_tp, hit_state, hit_L_state, hit_ray)) {
-# ifdef __LAMP_MIS__
- hit_state->ray_t = 0.0f;
-# endif /* __LAMP_MIS__ */
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- Ray volume_ray = *ray;
- /* Setup ray from previous surface point to the new one. */
- volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, &volume_ray.t);
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
- ss_indirect->num_rays++;
- }
- }
- return true;
- }
- return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_setup_indirect(
- KernelGlobals *kg,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput)
-{
- /* Setup state, ray and throughput for indirect SSS rays. */
- ss_indirect->num_rays--;
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- *state = ss_indirect->state[ss_indirect->num_rays];
- *ray = ss_indirect->rays[ss_indirect->num_rays];
- L->state = ss_indirect->L_state[ss_indirect->num_rays];
- *throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
- state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif /* __SUBSURFACE__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
deleted file mode 100644
index ba48c0bdfc4..00000000000
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || \
- defined(__BAKING__)
-/* branched path tracing: connect path directly to position on one or more lights and add it to L
- */
-ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
- KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- float3 throughput,
- float num_samples_adjust,
- PathRadiance *L,
- int sample_all_lights)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 0;
- if (kernel_data.integrator.use_direct_light) {
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
- else {
- num_lights = 1;
- }
- }
-
- for (int i = 0; i < num_lights; i++) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = ceil_to_int(num_samples_adjust * light_select_num_samples(kg, i));
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = ceil_to_int(num_samples_adjust * kernel_data.integrator.mesh_light_samples);
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = num_samples_adjust / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_branched_rng_light_termination(
- kg, lamp_rng_hash, state, j, num_samples);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- /* The sampling probability returned by lamp_light_sample assumes that all lights were
- * sampled. However, this code only samples lamps, so if the scene also had mesh lights,
- * the real probability is twice as high. */
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg,
- L,
- state,
- throughput * num_samples_inv,
- &L_light,
- shadow,
- num_samples_inv,
- is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput * num_samples_inv, &L_light);
- }
- }
- }
- }
-# endif
-}
-
-/* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderClosure *sc,
- int sample,
- int num_samples,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray,
- float sum_sample_weight)
-{
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample_closure(
- kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
-# ifdef __DENOISING_FEATURES__
- state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
-# endif
-
- /* modify path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
- ray->t = FLT_MAX;
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-# endif
-# ifdef __OBJECT_MOTION__
- ray->time = sd->time;
-# endif
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif
-
- /* branch RNG state */
- path_state_branch(state, sample, num_samples);
-
- /* set MIS state */
- state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
- state->ray_pdf = bsdf_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
-
- return true;
-}
-
-#endif
-
-/* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT);
-
-#ifdef __EMISSION__
-# ifdef __SHADOW_TRICKS__
- int all = (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# else
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-# endif
-#endif
-}
-
-/* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE);
-
- /* no BSDF? we can stop here */
- if (sd->flag & SD_BSDF) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample(
- kg, sd, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
- /* set labels */
- if (!(label & LABEL_TRANSPARENT)) {
- state->ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
- state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
- }
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-#endif
- return true;
- }
-#ifdef __VOLUME__
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- if (!path_state_volume_next(kg, state)) {
- return false;
- }
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
- /* setup ray position, direction stays unchanged */
- ray->P = ray_offset(sd->P, -sd->Ng);
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
-# endif
-
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
- return true;
- }
-#endif
- else {
- /* no bsdf or volume? */
- return false;
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
deleted file mode 100644
index a787910e65c..00000000000
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME_SCATTER__
-
-ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- /* connect to light from given point where shader has been evaluated */
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
-# endif /* __EMISSION__ */
-}
-
-ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- /* sample phase function */
- float phase_pdf;
- BsdfEval phase_eval ccl_optional_struct_init;
- float3 phase_omega_in ccl_optional_struct_init;
- differential3 phase_domega_in ccl_optional_struct_init;
- float phase_u, phase_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
- int label;
-
- label = shader_volume_phase_sample(
- kg, sd, phase_u, phase_v, &phase_eval, &phase_omega_in, &phase_domega_in, &phase_pdf);
-
- if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
-
- /* set labels */
- state->ray_pdf = phase_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
- state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* Russian roulette termination of volume ray scattering. */
- float probability = path_state_continuation_probability(kg, state, *throughput);
-
- if (probability == 0.0f) {
- return false;
- }
- else if (probability != 1.0f) {
- /* Use dimension from the previous bounce, has not been used yet. */
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM);
-
- if (terminate >= probability) {
- return false;
- }
-
- *throughput /= probability;
- }
-
- /* setup ray */
- ray->P = sd->P;
- ray->D = phase_omega_in;
- ray->t = FLT_MAX;
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = phase_domega_in;
-# endif
-
- return true;
-}
-
-# if !defined(__SPLIT_KERNEL__) && (defined(__BRANCHED_PATH__) || defined(__VOLUME_DECOUPLED__))
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- bool sample_all_lights,
- Ray *ray,
- const VolumeSegment *segment)
-{
-# ifdef __EMISSION__
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 1;
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
-
- for (int i = 0; i < num_lights; ++i) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = light_select_num_samples(kg, i);
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = kernel_data.integrator.mesh_light_samples;
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = 1.0f / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- float3 tp = throughput;
-
- if (kernel_data.integrator.use_direct_light) {
- /* sample random position on random light/triangle */
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- light_sample(kg, lamp, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
-
- /* sample position on volume segment */
- float rphase = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
- float rscatter = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
- state,
- ray,
- sd,
- &tp,
- rphase,
- rscatter,
- segment,
- (ls.t != FLT_MAX) ? &ls.P :
- NULL,
- false);
-
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: split up light_sample so we don't have to call it again with new position */
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- /* sample random light */
- float terminate = path_branched_rng_light_termination(
- kg, state->rng_hash, state, j, num_samples);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(
- kg, L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
- }
- }
- }
-# endif /* __EMISSION__ */
-}
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __VOLUME_SCATTER__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h
index 780830879d8..db8644005ea 100644
--- a/intern/cycles/kernel/kernel_profiling.h
+++ b/intern/cycles/kernel/kernel_profiling.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_PROFILING_H__
-#define __KERNEL_PROFILING_H__
+#pragma once
#ifdef __KERNEL_CPU__
# include "util/util_profiling.h"
@@ -24,23 +23,18 @@
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_CPU__
-# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event)
+# define PROFILING_INIT(kg, event) \
+ ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event)
# define PROFILING_EVENT(event) profiling_helper.set_event(event)
-# define PROFILING_SHADER(shader) \
- if ((shader) != SHADER_NONE) { \
- profiling_helper.set_shader((shader)&SHADER_MASK); \
- }
-# define PROFILING_OBJECT(object) \
- if ((object) != PRIM_NONE) { \
- profiling_helper.set_object(object); \
- }
+# define PROFILING_INIT_FOR_SHADER(kg, event) \
+ ProfilingWithShaderHelper profiling_helper((ProfilingState *)&kg->profiler, event)
+# define PROFILING_SHADER(object, shader) \
+ profiling_helper.set_shader(object, (shader)&SHADER_MASK);
#else
# define PROFILING_INIT(kg, event)
# define PROFILING_EVENT(event)
-# define PROFILING_SHADER(shader)
-# define PROFILING_OBJECT(object)
+# define PROFILING_INIT_FOR_SHADER(kg, event)
+# define PROFILING_SHADER(object, shader)
#endif /* __KERNEL_CPU__ */
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROFILING_H__ */
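
A brief sketch (not part of this patch) of how the reworked macros are intended to be used; the function name is hypothetical, and on non-CPU kernels these macros compile to nothing.

ccl_device void example_shade_surface(const KernelGlobals *kg, const ShaderData *sd)
{
  PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADER_SETUP);

  /* ... evaluate the shader ... */

  /* Attribute the time spent to this object/shader pair. */
  PROFILING_SHADER(sd->object, sd->shader);
}
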
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index c33d7150b5c..192bf7ca5aa 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_PROJECTION_CL__
-#define __KERNEL_PROJECTION_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -257,5 +256,3 @@ ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROJECTION_CL__ */
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
deleted file mode 100644
index d8cc08b3e85..00000000000
--- a/intern/cycles/kernel/kernel_queues.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_QUEUE_H__
-#define __KERNEL_QUEUE_H__
-
-CCL_NAMESPACE_BEGIN
-
-/*
- * Queue utility functions for split kernel
- */
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#endif
-
-/*
- * Enqueue ray index into the queue
- */
-ccl_device void enqueue_ray_index(
- int ray_index, /* Ray index to be enqueued. */
- int queue_number, /* Queue in which the ray index should be enqueued. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queue_size, /* Size of each queue. */
- ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
-{
- /* This thread's queue index. */
- int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint *)&queue_index[queue_number]) +
- (queue_number * queue_size);
- queues[my_queue_index] = ray_index;
-}
-
-/*
- * Get the ray index for this thread
- * Returns a positive ray_index for threads that have to do some work;
- * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work
- * i.e All ray's in the queue has been successfully allocated and there
- * is no more ray to allocate to other threads.
- */
-ccl_device int get_ray_index(
- KernelGlobals *kg,
- int thread_index, /* Global thread index. */
- int queue_number, /* Queue to operate on. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queuesize, /* Size of a queue. */
- int empty_queue) /* Empty the queue slot as soon as we fetch the ray index. */
-{
- int ray_index = queues[queue_number * queuesize + thread_index];
- if (empty_queue && ray_index != QUEUE_EMPTY_SLOT) {
- queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- return ray_index;
-}
-
-/* The following functions are to realize Local memory variant of enqueue ray index function. */
-
-/* All threads should call this function. */
-ccl_device void enqueue_ray_index_local(
- int ray_index, /* Ray index to enqueue. */
- int queue_number, /* Queue in which to enqueue ray index. */
- char enqueue_flag, /* True for threads whose ray index has to be enqueued. */
- int queuesize, /* queue size. */
- ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */
- ccl_global int *Queue_data, /* Queues. */
- ccl_global int *Queue_index) /* To do global queue atomics. */
-{
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
-
- /* Get local queue id. */
- unsigned int lqidx;
- if (enqueue_flag) {
- lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue offset. */
- if (lidx == 0) {
- *local_queue_atomics = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&Queue_index[queue_number], *local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue index and enqueue ray. */
- if (enqueue_flag) {
- unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
- Queue_data[my_gqidx] = ray_index;
- }
-}
-
-ccl_device unsigned int get_local_queue_index(
- int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
- ccl_local_param unsigned int *local_queue_atomics)
-{
- int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
- return my_lqidx;
-}
-
-ccl_device unsigned int get_global_per_queue_offset(
- int queue_number,
- ccl_local_param unsigned int *local_queue_atomics,
- ccl_global int *global_queue_atomics)
-{
- unsigned int queue_offset = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&global_queue_atomics[queue_number], local_queue_atomics[queue_number]);
- return queue_offset;
-}
-
-ccl_device unsigned int get_global_queue_index(
- int queue_number,
- int queuesize,
- unsigned int lqidx,
- ccl_local_param unsigned int *global_per_queue_offset)
-{
- int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
- return my_gqidx;
-}
-
-ccl_device int dequeue_ray_index(int queue_number,
- ccl_global int *queues,
- int queue_size,
- ccl_global int *queue_index)
-{
- int index = atomic_fetch_and_dec_uint32((ccl_global uint *)&queue_index[queue_number]) - 1;
-
- if (index < 0) {
- return QUEUE_EMPTY_SLOT;
- }
-
- return queues[index + queue_number * queue_size];
-}
-
-CCL_NAMESPACE_END
-
-#endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 49e5e25c2e0..41b7d76230a 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#pragma once
#include "kernel/kernel_jitter.h"
#include "util/util_hash.h"
@@ -37,38 +38,34 @@ CCL_NAMESPACE_BEGIN
*/
# define SOBOL_SKIP 64
-ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
+ccl_device uint sobol_dimension(const KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
uint i = index + SOBOL_SKIP;
for (int j = 0, x; (x = find_first_set(i)); i >>= x) {
j += x;
- result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1);
+ result ^= __float_as_uint(kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1));
}
return result;
}
#endif /* __SOBOL__ */
-ccl_device_forceinline float path_rng_1D(
- KernelGlobals *kg, uint rng_hash, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(const KernelGlobals *kg,
+ uint rng_hash,
+ int sample,
+ int dimension)
{
#ifdef __DEBUG_CORRELATION__
return (float)drand48();
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- return pmj_sample_1D(kg, sample, rng_hash, dimension);
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- return cmj_sample_1D(sample, num_samples, p);
+ return pmj_sample_1D(kg, sample, rng_hash, dimension);
}
-#endif
#ifdef __SOBOL__
/* Sobol sequence value using direction vectors. */
@@ -88,68 +85,72 @@ ccl_device_forceinline float path_rng_1D(
#endif
}
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- int num_samples,
- int dimension,
- float *fx,
- float *fy)
+ccl_device_forceinline void path_rng_2D(
+ const KernelGlobals *kg, uint rng_hash, int sample, int dimension, float *fx, float *fy)
{
#ifdef __DEBUG_CORRELATION__
*fx = (float)drand48();
*fy = (float)drand48();
return;
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- const float2 f = pmj_sample_2D(kg, sample, rng_hash, dimension);
- *fx = f.x;
- *fy = f.y;
- return;
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- cmj_sample_2D(sample, num_samples, p, fx, fy);
+ pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy);
+
return;
}
-#endif
#ifdef __SOBOL__
/* Sobol. */
- *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
- *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
+ *fx = path_rng_1D(kg, rng_hash, sample, dimension);
+ *fy = path_rng_1D(kg, rng_hash, sample, dimension + 1);
#endif
}
-ccl_device_inline void path_rng_init(KernelGlobals *kg,
- int sample,
- int num_samples,
- uint *rng_hash,
- int x,
- int y,
- float *fx,
- float *fy)
+/**
+ * 1D hash recommended in "Hash Functions for GPU Rendering", JCGT Vol. 9, No. 3, 2020.
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqint1(uint n)
+{
+ n = (n << 13U) ^ n;
+ n = n * (n * n * 15731U + 789221U) + 1376312589U;
+
+ return n;
+}
+
+/**
+ * 2D hash recommended in "Hash Functions for GPU Rendering", JCGT Vol. 9, No. 3, 2020.
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqnt2d(const uint x, const uint y)
{
- /* load state */
- *rng_hash = hash_uint2(x, y);
- *rng_hash ^= kernel_data.integrator.seed;
+ const uint qx = 1103515245U * ((x >> 1U) ^ (y));
+ const uint qy = 1103515245U * ((y >> 1U) ^ (x));
+ const uint n = 1103515245U * ((qx) ^ (qy >> 3U));
+
+ return n;
+}
+
+ccl_device_inline uint path_rng_hash_init(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y)
+{
+ const uint rng_hash = hash_iqnt2d(x, y) ^ kernel_data.integrator.seed;
#ifdef __DEBUG_CORRELATION__
- srand48(*rng_hash + sample);
+ srand48(rng_hash + sample);
+#else
+ (void)sample;
#endif
- if (sample == 0) {
- *fx = 0.5f;
- *fy = 0.5f;
- }
- else {
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
- }
+ return rng_hash;
}
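
A short sketch (not part of this patch) of the intended per-pixel seeding, assuming x/y are the pixel coordinates and sample is the current sample index; using PRNG_FILTER_U for the pixel filter offset mirrors the code this replaces.

  const uint rng_hash = path_rng_hash_init(kg, sample, x, y);

  float filter_u, filter_v;
  path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v);
  /* (filter_u, filter_v) jitter the ray position inside the pixel. */
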
/* Linear Congruential Generator */
@@ -175,113 +176,12 @@ ccl_device uint lcg_init(uint seed)
return rng;
}
-/* Path Tracing Utility Functions
- *
- * For each random number in each step of the path we must have a unique
- * dimension to avoid using the same sequence twice.
- *
- * For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly.
- */
-
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- int dimension)
-{
- return path_rng_1D(
- kg, state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_state_rng_2D(
- KernelGlobals *kg, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
-{
- path_rng_2D(kg,
- state->rng_hash,
- state->sample,
- state->num_samples,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- uint hash)
-{
- /* Use a hash instead of dimension, this is not great but avoids adding
- * more dimensions to each bounce which reduces quality of dimensions we
- * are already using. */
- return path_rng_1D(kg,
- cmj_hash_simple(state->rng_hash, hash),
- state->sample,
- state->num_samples,
- state->rng_offset);
-}
-
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension)
-{
- return path_rng_1D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension,
- float *fx,
- float *fy)
-{
- path_rng_2D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-/* Utility functions to get light termination value,
- * since it might not be needed in many cases.
- */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg,
- const ccl_addr_space PathState *state)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D(kg, rng_hash, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline uint lcg_state_init(PathState *state, uint scramble)
-{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
-}
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(const uint rng_hash,
+ const uint rng_offset,
+ const uint sample,
+ const uint scramble)
{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
+ return lcg_init(rng_hash + rng_offset + sample * scramble);
}
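
For reference, a sketch (not part of this patch) of seeding the LCG from an RNGState, as callers of the removed lcg_state_init_addrspace() would now do; the scramble constant is the one used by the previous subsurface code and is only an example.

  uint lcg_state = lcg_state_init(
      rng_state.rng_hash, rng_state.rng_offset, rng_state.sample, 0x68bc21eb);
  const float r = lcg_step_float_addrspace(&lcg_state);
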
ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
@@ -301,8 +201,6 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
#elif defined(__NVCC__)
return __popc(sample & 0xaaaaaaaa) & 1;
-#elif defined(__KERNEL_OPENCL__)
- return popcount(sample & 0xaaaaaaaa) & 1;
#else
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
int i = sample & 0xaaaaaaaa;
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 7f02e6fc7b3..3052bb53040 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -14,14 +14,9 @@
* limitations under the License.
*/
-/*
- * ShaderData, used in four steps:
- *
- * Setup from incoming ray, sampled position and background.
- * Execute for surface, volume or displacement.
- * Evaluate one or more closures.
- * Release.
- */
+/* Functions to evaluate shaders and use the resulting shader closures. */
+
+#pragma once
// clang-format off
#include "kernel/closure/alloc.h"
@@ -30,479 +25,39 @@
#include "kernel/closure/emissive.h"
// clang-format on
+#include "kernel/kernel_accumulate.h"
#include "kernel/svm/svm.h"
-CCL_NAMESPACE_BEGIN
-
-/* ShaderData setup from incoming ray */
-
-#ifdef __OBJECT_MOTION__
-ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
-{
- if (sd->object_flag & SD_OBJECT_MOTION) {
- sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
- sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
- }
- else {
- sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
- sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- }
-}
-#endif
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
- void
- shader_setup_from_ray(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
- isect->object;
- sd->lamp = LAMP_NONE;
-
- sd->type = isect->type;
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
-
- /* matrices and time */
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, ray->time);
-#endif
- sd->time = ray->time;
-
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->ray_length = isect->t;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
-#ifdef __HAIR__
- if (sd->type & PRIMITIVE_ALL_CURVE) {
- /* curve */
- curve_shader_setup(kg, sd, isect, ray);
- }
- else
-#endif
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* static triangle */
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* vectors */
- sd->P = triangle_refine(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-#endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, false);
- }
-
- sd->I = -ray->D;
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-#ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-#endif
- }
-
- /* backfacing test */
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
- differential_incoming(&sd->dI, ray->dD);
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from BSSRDF scatter */
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- void
- shader_setup_from_subsurface(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- const bool backfacing = sd->flag & SD_BACKFACING;
-
- /* object, matrices, time, ray_length stay the same */
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->type = isect->type;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
- /* fetch triangle data */
- if (sd->type == PRIMITIVE_TRIANGLE) {
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* static triangle */
- sd->P = triangle_refine_local(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-# endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, true);
- }
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-# ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-# endif
- }
-
- /* backfacing test */
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-# ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-# endif
- }
-
- /* should not get used in principle as the shading will only use a diffuse
- * BSDF, but the shader might still access it */
- sd->I = sd->N;
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
- /* don't modify dP and dI */
-# endif
-
- PROFILING_SHADER(sd->shader);
-}
-#endif
-
-/* ShaderData setup from position sampled on mesh */
-
-ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
- ShaderData *sd,
- const float3 P,
- const float3 Ng,
- const float3 I,
- int shader,
- int object,
- int prim,
- float u,
- float v,
- float t,
- float time,
- bool object_space,
- int lamp)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = P;
- sd->N = Ng;
- sd->Ng = Ng;
- sd->I = I;
- sd->shader = shader;
- if (prim != PRIM_NONE)
- sd->type = PRIMITIVE_TRIANGLE;
- else if (lamp != LAMP_NONE)
- sd->type = PRIMITIVE_LAMP;
- else
- sd->type = PRIMITIVE_NONE;
-
- /* primitive */
- sd->object = object;
- sd->lamp = LAMP_NONE;
- /* Currently no access to bvh prim index for strand sd->prim. */
- sd->prim = prim;
- sd->u = u;
- sd->v = v;
- sd->time = time;
- sd->ray_length = t;
-
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- if (sd->object != OBJECT_NONE) {
- sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
-
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, time);
- }
- else if (lamp != LAMP_NONE) {
- sd->ob_tfm = lamp_fetch_transform(kg, lamp, false);
- sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
- sd->lamp = lamp;
-#else
- }
- else if (lamp != LAMP_NONE) {
- sd->lamp = lamp;
-#endif
- }
-
- /* transform into world space */
- if (object_space) {
- object_position_transform_auto(kg, sd, &sd->P);
- object_normal_transform_auto(kg, sd, &sd->Ng);
- sd->N = sd->Ng;
- object_dir_transform_auto(kg, sd, &sd->I);
- }
-
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_normal_transform_auto(kg, sd, &sd->N);
- }
- }
-
- /* dPdu/dPdv */
-#ifdef __DPDU__
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
- }
-#endif
- }
- else {
-#ifdef __DPDU__
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
- }
-
- /* backfacing test */
- if (sd->prim != PRIM_NONE) {
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* no ray differentials here yet */
- sd->dP = differential3_zero();
- sd->dI = differential3_zero();
- sd->du = differential_zero();
- sd->dv = differential_zero();
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup for displacement */
-
-ccl_device void shader_setup_from_displace(
- KernelGlobals *kg, ShaderData *sd, int object, int prim, float u, float v)
-{
- float3 P, Ng, I = zero_float3();
- int shader;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- /* force smooth shading for displacement */
- shader |= SHADER_SMOOTH_NORMAL;
-
- shader_setup_from_sample(
- kg,
- sd,
- P,
- Ng,
- I,
- shader,
- object,
- prim,
- u,
- v,
- 0.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
-}
-
-/* ShaderData setup from ray into background */
-
-ccl_device_inline void shader_setup_from_background(KernelGlobals *kg,
- ShaderData *sd,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->D;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = kernel_data.background.surface_shader;
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f;
-
- sd->object = OBJECT_NONE;
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
+#ifdef __OSL__
+# include "kernel/osl/osl_shader.h"
#endif
- /* for NDC coordinates */
- sd->ray_P = ray->P;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from point inside volume */
-
-#ifdef __VOLUME__
-ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->P;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = SHADER_NONE;
- sd->flag = 0;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
-
- sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->type = PRIMITIVE_NONE;
-
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
-# endif
-
- /* for NDC coordinates */
- sd->ray_P = ray->P;
- sd->ray_dP = ray->dP;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-#endif /* __VOLUME__ */
+CCL_NAMESPACE_BEGIN
/* Merging */
-#if defined(__BRANCHED_PATH__) || defined(__VOLUME__)
-ccl_device_inline void shader_merge_closures(ShaderData *sd)
+#if defined(__VOLUME__)
+ccl_device_inline void shader_merge_volume_closures(ShaderData *sd)
{
- /* merge identical closures, better when we sample a single closure at a time */
+ /* Merge identical closures to save closure space with stacked volumes. */
for (int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sci = &sd->closure[i];
+ if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ continue;
+ }
+
for (int j = i + 1; j < sd->num_closure; j++) {
ShaderClosure *scj = &sd->closure[j];
-
- if (sci->type != scj->type)
+ if (sci->type != scj->type) {
continue;
- if (!bsdf_merge(sci, scj))
+ }
+
+ const HenyeyGreensteinVolume *hgi = (const HenyeyGreensteinVolume *)sci;
+ const HenyeyGreensteinVolume *hgj = (const HenyeyGreensteinVolume *)scj;
+ if (!(hgi->g == hgj->g)) {
continue;
+ }
sci->weight += scj->weight;
sci->sample_weight += scj->sample_weight;
@@ -520,16 +75,40 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
}
}
}
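/* Illustrative, simplified sketch of the merge above, standalone and outside the kernel:
 * identical Henyey-Greenstein closures (same anisotropy g) from stacked volumes are folded
 * together by summing their weights, so they occupy only one closure slot. The types and the
 * erase-based bookkeeping are hypothetical stand-ins for the kernel's ShaderClosure array. */
#include <vector>

struct HGClosure {
  float g;             /* Henyey-Greenstein anisotropy. */
  float weight;        /* Color weight, reduced to a scalar for brevity. */
  float sample_weight; /* Importance used when picking a closure to sample. */
};

static void merge_identical_hg(std::vector<HGClosure> &closures)
{
  for (size_t i = 0; i < closures.size(); i++) {
    /* Walk backwards so erasing does not invalidate the indices still to visit. */
    for (size_t j = closures.size() - 1; j > i; j--) {
      if (closures[j].g == closures[i].g) {
        closures[i].weight += closures[j].weight;
        closures[i].sample_weight += closures[j].sample_weight;
        closures.erase(closures.begin() + j);
      }
    }
  }
}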
-#endif /* __BRANCHED_PATH__ || __VOLUME__ */
-/* Defensive sampling. */
+ccl_device_inline void shader_copy_volume_phases(ShaderVolumePhases *ccl_restrict phases,
+ const ShaderData *ccl_restrict sd)
+{
+ phases->num_closure = 0;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *from_sc = &sd->closure[i];
+ const HenyeyGreensteinVolume *from_hg = (const HenyeyGreensteinVolume *)from_sc;
+
+ if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
+
+ to_sc->weight = from_sc->weight;
+ to_sc->sample_weight = from_sc->sample_weight;
+ to_sc->g = from_hg->g;
+ phases->num_closure++;
+ if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
+ break;
+ }
+ }
+ }
+}
+#endif /* __VOLUME__ */
-ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space PathState *state)
+ccl_device_inline void shader_prepare_surface_closures(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
- /* We can likely also do defensive sampling at deeper bounces, particularly
+ /* Defensive sampling.
+ *
+ * We can likely also do defensive sampling at deeper bounces, particularly
* for cases like a perfect mirror but possibly also others. This will need
* a good heuristic. */
- if (state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+ if (INTEGRATOR_STATE(path, bounce) + INTEGRATOR_STATE(path, transparent_bounce) == 0 &&
+ sd->num_closure > 1) {
float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
@@ -546,98 +125,119 @@ ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space Pa
}
}
}
+
+ /* Filter glossy.
+ *
+ * Blurring of bsdf after bounces, for rays that have a small likelihood
+ * of following this particular path (diffuse, rough glossy) */
+ if (kernel_data.integrator.filter_glossy != FLT_MAX) {
+ float blur_pdf = kernel_data.integrator.filter_glossy * INTEGRATOR_STATE(path, min_ray_pdf);
+
+ if (blur_pdf < 1.0f) {
+ float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if (CLOSURE_IS_BSDF(sc->type)) {
+ bsdf_blur(kg, sc, blur_roughness);
+ }
+ }
+ }
+ }
}
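/* Illustrative, standalone sketch of the filter-glossy heuristic used above: the integrator's
 * filter_glossy factor times the smallest ray pdf seen along the path gives blur_pdf, and when
 * that is below one the BSDFs are blurred with a roughness floor derived from it. Function and
 * parameter names here are hypothetical; only the arithmetic mirrors the kernel code. */
#include <cmath>

static float filter_glossy_blur_roughness(float filter_glossy, float min_ray_pdf)
{
  const float blur_pdf = filter_glossy * min_ray_pdf;
  if (blur_pdf >= 1.0f) {
    /* The path is likely enough; keep the closures sharp. */
    return 0.0f;
  }
  return std::sqrt(1.0f - blur_pdf) * 0.5f;
}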
/* BSDF */
-ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- const ShaderClosure *skip_sc,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline bool shader_bsdf_is_transmission(const ShaderData *sd, const float3 omega_in)
+{
+ return dot(sd->N, omega_in) < 0.0f;
+}
+
+ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags)
+{
+ if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
+ return false;
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
+ if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
+ if (CLOSURE_IS_BSDF_GLOSSY(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
+ if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
+ return true;
+ }
+ }
+ return false;
+}
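/* Illustrative, standalone sketch of the exclusion test above using plain enums. The EXCLUDE_*
 * flags and the closure categories are hypothetical stand-ins for SHADER_EXCLUDE_* and the
 * CLOSURE_IS_* macros; in the kernel, BSSRDF closures also count as diffuse. */
#include <cstdint>

enum : uint32_t {
  EXCLUDE_DIFFUSE = 1u << 0,
  EXCLUDE_GLOSSY = 1u << 1,
  EXCLUDE_TRANSMIT = 1u << 2,
  EXCLUDE_ANY = EXCLUDE_DIFFUSE | EXCLUDE_GLOSSY | EXCLUDE_TRANSMIT,
};

enum class ClosureCategory { Diffuse, Glossy, Transmission, Other };

static bool exclude_closure(ClosureCategory cat, uint32_t light_flags)
{
  if (!(light_flags & EXCLUDE_ANY)) {
    return false;
  }
  if ((light_flags & EXCLUDE_DIFFUSE) && cat == ClosureCategory::Diffuse) {
    return true;
  }
  if ((light_flags & EXCLUDE_GLOSSY) && cat == ClosureCategory::Glossy) {
    return true;
  }
  return (light_flags & EXCLUDE_TRANSMIT) && cat == ClosureCategory::Transmission;
}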
+
+ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 omega_in,
+ const bool is_transmission,
+ const ShaderClosure *skip_sc,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight,
+ const uint light_shader_flags)
{
  /* This is the Veach one-sample model with balance heuristic; some pdf
   * factors drop out when using balance heuristic weighting. */
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if (sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
+ if (sc == skip_sc) {
+ continue;
+ }
+
+ if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) {
+ float bsdf_pdf = 0.0f;
+ float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, 1.0f);
- sum_pdf += bsdf_pdf * sc->sample_weight;
+ if (bsdf_pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
+ sum_pdf += bsdf_pdf * sc->sample_weight;
+ }
}
sum_sample_weight += sc->sample_weight;
}
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- BsdfEval *result_eval,
- float light_pdf,
- bool use_mis)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
- if (CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- float mis_weight = use_mis ? power_heuristic(light_pdf, bsdf_pdf) : 1.0f;
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, mis_weight);
- }
- }
- }
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
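/* Illustrative, standalone sketch of the pdf returned above: with the one-sample strategy
 * (pick a closure proportionally to sample_weight, then sample it) and the balance heuristic,
 * the combined pdf is the sample-weight-weighted average of the per-closure pdfs. Types are
 * hypothetical stand-ins; eval accumulation and exclusion flags are left out. */
#include <vector>

struct ClosurePdf {
  float pdf;           /* Pdf of the chosen direction under this closure. */
  float sample_weight; /* Weight used when picking a closure to sample. */
};

static float one_sample_pdf(const std::vector<ClosurePdf> &closures)
{
  float sum_pdf = 0.0f;
  float sum_sample_weight = 0.0f;
  for (const ClosurePdf &c : closures) {
    sum_pdf += c.pdf * c.sample_weight;
    sum_sample_weight += c.sample_weight;
  }
  return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}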
-#endif /* __BRANCHED_PATH__ */
#ifndef __KERNEL_CUDA__
ccl_device
#else
ccl_device_inline
#endif
- void
- shader_bsdf_eval(KernelGlobals *kg,
+ float
+ shader_bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const float3 omega_in,
- BsdfEval *eval,
- float light_pdf,
- bool use_mis)
+ const bool is_transmission,
+ BsdfEval *bsdf_eval,
+ const uint light_shader_flags)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL);
-
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
+ bsdf_eval_init(bsdf_eval, false, zero_float3());
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched)
- _shader_bsdf_multi_eval_branched(kg, sd, omega_in, eval, light_pdf, use_mis);
- else
-#endif
- {
- float pdf;
- _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
- if (use_mis) {
- float weight = power_heuristic(light_pdf, pdf);
- bsdf_eval_mis(eval, weight);
- }
- }
+ return _shader_bsdf_multi_eval(
+ kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
}
-ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *randu)
+/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
+ccl_device_inline const ShaderClosure *shader_bsdf_bssrdf_pick(const ShaderData *ccl_restrict sd,
+ float *randu)
{
- /* Note the sampling here must match shader_bssrdf_pick,
- * since we reuse the same random number. */
int sampled = 0;
if (sd->num_closure > 1) {
@@ -674,106 +274,33 @@ ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *r
}
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSDF(sc->type) ? sc : NULL;
+ return &sd->closure[sampled];
}
-ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
- ccl_addr_space float3 *throughput,
- float *randu)
+/* Return weight for picked BSSRDF. */
+ccl_device_inline float3 shader_bssrdf_sample_weight(const ShaderData *ccl_restrict sd,
+ const ShaderClosure *ccl_restrict bssrdf_sc)
{
- /* Note the sampling here must match shader_bsdf_pick,
- * since we reuse the same random number. */
- int sampled = 0;
+ float3 weight = bssrdf_sc->weight;
if (sd->num_closure > 1) {
- /* Pick a BSDF or BSSRDF or based on sample weights. */
- float sum_bsdf = 0.0f;
- float sum_bssrdf = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type)) {
- sum_bsdf += sc->sample_weight;
- }
- else if (CLOSURE_IS_BSSRDF(sc->type)) {
- sum_bssrdf += sc->sample_weight;
- }
- }
-
- float r = (*randu) * (sum_bsdf + sum_bssrdf);
- float partial_sum = 0.0f;
-
+ float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r < next_sum) {
- if (CLOSURE_IS_BSDF(sc->type)) {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
- return NULL;
- }
- else {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
- sampled = i;
-
- /* Rescale to reuse for direction sample, to better preserve stratification. */
- *randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
- }
-
- partial_sum = next_sum;
+ sum += sc->sample_weight;
}
}
+ weight *= sum / bssrdf_sc->sample_weight;
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL;
-}
-
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
- ShaderData *sd,
- float randu,
- float randv,
- BsdfEval *bsdf_eval,
- float3 *omega_in,
- differential3 *domega_in,
- float *pdf)
-{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
-
- const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
- if (sc == NULL) {
- *pdf = 0.0f;
- return LABEL_NONE;
- }
-
- /* BSSRDF should already have been handled elsewhere. */
- kernel_assert(CLOSURE_IS_BSDF(sc->type));
-
- int label;
- float3 eval = zero_float3();
-
- *pdf = 0.0f;
- label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
- if (*pdf != 0.0f) {
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
-
- if (sd->num_closure > 1) {
- float sweight = sc->sample_weight;
- _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf * sweight, sweight);
- }
- }
-
- return label;
+ return weight;
}
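/* Illustrative, standalone sketch of the rescaling above: when a closure is picked with
 * probability sample_weight_i / sum(sample_weight), its contribution is multiplied by the
 * inverse of that probability so the estimator stays unbiased. Scalar weights and names are
 * hypothetical; the kernel applies this to the picked BSSRDF's color weight. */
#include <vector>

static float picked_closure_weight(const std::vector<float> &sample_weights,
                                   size_t picked,
                                   float picked_weight)
{
  float sum = 0.0f;
  for (float w : sample_weights) {
    sum += w;
  }
  /* Assumes sample_weights[picked] > 0, since it was just picked proportionally to it. */
  return picked_weight * (sum / sample_weights[picked]);
}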
-ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
+/* Sample direction for picked BSDF, and return evaluation and pdf for all
+ * BSDFs combined using MIS. */
+ccl_device int shader_bsdf_sample_closure(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -783,7 +310,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+ /* BSSRDF should already have been handled elsewhere. */
+ kernel_assert(CLOSURE_IS_BSDF(sc->type));
int label;
float3 eval = zero_float3();
@@ -791,19 +319,29 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
*pdf = 0.0f;
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
- if (*pdf != 0.0f)
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
+ if (*pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
+
+ if (sd->num_closure > 1) {
+ const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
+ float sweight = sc->sample_weight;
+ *pdf = _shader_bsdf_multi_eval(
+ kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
+ }
+ }
return label;
}
-ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
+ccl_device float shader_bsdf_average_roughness(const ShaderData *sd)
{
float roughness = 0.0f;
float sum_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF(sc->type)) {
/* sqrt once to undo the squaring from multiplying roughness on the
@@ -817,17 +355,7 @@ ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
}
-ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type))
- bsdf_blur(kg, sc, roughness);
- }
-}
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return one_float3();
@@ -840,7 +368,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *
}
}
-ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device void shader_bsdf_disable_transparency(const KernelGlobals *kg, ShaderData *sd)
{
if (sd->flag & SD_TRANSPARENT) {
for (int i = 0; i < sd->num_closure; i++) {
@@ -856,7 +384,7 @@ ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *
}
}
-ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_alpha(const KernelGlobals *kg, const ShaderData *sd)
{
float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd);
@@ -866,12 +394,12 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
return alpha;
}
-ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_diffuse(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) ||
CLOSURE_IS_BSDF_BSSRDF(sc->type))
@@ -881,12 +409,12 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_glossy(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
@@ -895,12 +423,12 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transmission(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
@@ -909,12 +437,12 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_average_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
N += sc->N * fabsf(average(sc->weight));
}
@@ -922,59 +450,44 @@ ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
return (is_zero(N)) ? sd->N : normalize(N);
}
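/* Illustrative, standalone sketch of the weight-averaged normal above, with float3 replaced by
 * a tiny vector struct. Each closure normal is weighted by the absolute average of its color
 * weight; if the sum degenerates to zero, the shading normal passed as fallback is used. */
#include <cmath>
#include <vector>

struct Vec3 {
  float x, y, z;
};

static Vec3 average_closure_normal(const std::vector<Vec3> &normals,
                                   const std::vector<float> &weights,
                                   const Vec3 &fallback)
{
  Vec3 n = {0.0f, 0.0f, 0.0f};
  for (size_t i = 0; i < normals.size() && i < weights.size(); i++) {
    const float w = std::fabs(weights[i]);
    n.x += normals[i].x * w;
    n.y += normals[i].y * w;
    n.z += normals[i].z * w;
  }
  const float len = std::sqrt(n.x * n.x + n.y * n.y + n.z * n.z);
  return (len == 0.0f) ? fallback : Vec3{n.x / len, n.y / len, n.z / len};
}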
-ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
+ccl_device float3 shader_bsdf_ao_normal(const KernelGlobals *kg, const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
const DiffuseBsdf *bsdf = (const DiffuseBsdf *)sc;
- eval += sc->weight * ao_factor;
N += bsdf->N * fabsf(average(sc->weight));
}
}
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#ifdef __SUBSURFACE__
-ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
+ccl_device float3 shader_bssrdf_normal(const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
- float texture_blur = 0.0f, weight_sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSSRDF(sc->type)) {
const Bssrdf *bssrdf = (const Bssrdf *)sc;
float avg_weight = fabsf(average(sc->weight));
N += bssrdf->N * avg_weight;
- eval += sc->weight;
- texture_blur += bssrdf->texture_blur * avg_weight;
- weight_sum += avg_weight;
}
}
- if (N_)
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
-
- if (texture_blur_)
- *texture_blur_ = safe_divide(texture_blur, weight_sum);
-
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#endif /* __SUBSURFACE__ */
/* Constant emission optimization */
-ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, float3 *eval)
+ccl_device bool shader_constant_emission_eval(const KernelGlobals *kg, int shader, float3 *eval)
{
int shader_index = shader & SHADER_MASK;
int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags;
@@ -992,7 +505,7 @@ ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, flo
/* Background */
-ccl_device float3 shader_background_eval(ShaderData *sd)
+ccl_device float3 shader_background_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return sd->closure_emission_background;
@@ -1004,7 +517,7 @@ ccl_device float3 shader_background_eval(ShaderData *sd)
/* Emission */
-ccl_device float3 shader_emissive_eval(ShaderData *sd)
+ccl_device float3 shader_emissive_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
@@ -1016,7 +529,7 @@ ccl_device float3 shader_emissive_eval(ShaderData *sd)
/* Holdout */
-ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_holdout_apply(const KernelGlobals *kg, ShaderData *sd)
{
float3 weight = zero_float3();
@@ -1041,7 +554,7 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
}
else {
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_HOLDOUT(sc->type)) {
weight += sc->weight;
}
@@ -1053,14 +566,12 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
/* Surface Evaluation */
-ccl_device void shader_eval_surface(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
+template<uint node_feature_mask>
+ccl_device void shader_eval_surface(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ ccl_global float *ccl_restrict buffer,
int path_flag)
{
- PROFILING_INIT(kg, PROFILING_SHADER_EVAL);
-
  /* If the path is being terminated, we are tracing a shadow ray, or we are
   * evaluating emission, then we don't need to store closures. The emission and shadow
* shader data also do not have a closure array to save GPU memory. */
@@ -1069,7 +580,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
sd->num_closure = 0;
@@ -1078,17 +589,18 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __OSL__
if (kg->osl) {
if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
- OSLShader::eval_background(kg, sd, state, path_flag);
+ OSLShader::eval_background(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else {
- OSLShader::eval_surface(kg, sd, state, path_flag);
+ OSLShader::eval_surface(INTEGRATOR_STATE_PASS, sd, path_flag);
}
}
else
#endif
{
#ifdef __SVM__
- svm_eval_nodes(kg, sd, state, buffer, SHADER_TYPE_SURFACE, path_flag);
+ svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(
+ INTEGRATOR_STATE_PASS, sd, buffer, path_flag);
#else
if (sd->object == OBJECT_NONE) {
sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
@@ -1105,8 +617,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#endif
}
- if (sd->flag & SD_BSDF_NEEDS_LCG) {
- sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
+ if (KERNEL_NODES_FEATURE(BSDF) && (sd->flag & SD_BSDF_NEEDS_LCG)) {
+ sd->lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0xb4bc3953);
}
}
@@ -1114,48 +629,47 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __VOLUME__
-ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- int skip_phase,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline float _shader_volume_phase_multi_eval(const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ int skip_phase,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight)
{
- for (int i = 0; i < sd->num_closure; i++) {
+ for (int i = 0; i < phases->num_closure; i++) {
if (i == skip_phase)
continue;
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_PHASE(sc->type)) {
- float phase_pdf = 0.0f;
- float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
+ const ShaderVolumeClosure *svc = &phases->closure[i];
+ float phase_pdf = 0.0f;
+ float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
- if (phase_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
- sum_pdf += phase_pdf * sc->sample_weight;
- }
-
- sum_sample_weight += sc->sample_weight;
+ if (phase_pdf != 0.0f) {
+ bsdf_eval_accum(result_eval, false, eval, 1.0f);
+ sum_pdf += phase_pdf * svc->sample_weight;
}
+
+ sum_sample_weight += svc->sample_weight;
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
-ccl_device void shader_volume_phase_eval(
- KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf)
+ccl_device float shader_volume_phase_eval(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ BsdfEval *phase_eval)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL);
+ bsdf_eval_init(phase_eval, false, zero_float3());
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
-
- _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
+ return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
}
-ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
+ccl_device int shader_volume_phase_sample(const KernelGlobals *kg,
const ShaderData *sd,
+ const ShaderVolumePhases *phases,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1163,41 +677,34 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int sampled = 0;
- if (sd->num_closure > 1) {
+ if (phases->num_closure > 1) {
/* pick a phase closure based on sample weights */
float sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
- if (CLOSURE_IS_PHASE(sc->type))
- sum += sc->sample_weight;
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ sum += svc->sample_weight;
}
float r = randu * sum;
float partial_sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ float next_sum = partial_sum + svc->sample_weight;
- if (CLOSURE_IS_PHASE(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r <= next_sum) {
- /* Rescale to reuse for BSDF direction sample. */
- randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
-
- partial_sum = next_sum;
+ if (r <= next_sum) {
+ /* Rescale to reuse for BSDF direction sample. */
+ randu = (r - partial_sum) / svc->sample_weight;
+ break;
}
+
+ partial_sum = next_sum;
}
- if (sampled == sd->num_closure) {
+ if (sampled == phases->num_closure) {
*pdf = 0.0f;
return LABEL_NONE;
}
@@ -1205,23 +712,23 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
/* todo: this isn't quite correct, we don't weight anisotropy properly
* depending on color channels, even if this is perhaps not a common case */
- const ShaderClosure *sc = &sd->closure[sampled];
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
int label;
float3 eval = zero_float3();
*pdf = 0.0f;
- label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+ label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f) {
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
}
return label;
}
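/* Illustrative, standalone sketch of the proportional pick above: a closure is chosen with
 * probability sample_weight_i / sum, and the random number is rescaled back into [0, 1) so it
 * can be reused for the direction sample without losing stratification. Names are hypothetical;
 * -1 is returned when nothing can be picked (mirroring the LABEL_NONE case). */
#include <vector>

static int pick_closure_proportional(const std::vector<float> &sample_weights, float &randu)
{
  float sum = 0.0f;
  for (float w : sample_weights) {
    sum += w;
  }
  if (sum <= 0.0f) {
    return -1;
  }
  const float r = randu * sum;
  float partial_sum = 0.0f;
  for (size_t i = 0; i < sample_weights.size(); i++) {
    const float next_sum = partial_sum + sample_weights[i];
    if (r <= next_sum && sample_weights[i] > 0.0f) {
      randu = (r - partial_sum) / sample_weights[i]; /* Rescale for reuse. */
      return (int)i;
    }
    partial_sum = next_sum;
  }
  return -1;
}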
-ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
+ccl_device int shader_phase_sample_closure(const KernelGlobals *kg,
const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *sc,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1229,8 +736,6 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int label;
float3 eval = zero_float3();
@@ -1238,18 +743,18 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f)
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
return label;
}
/* Volume Evaluation */
-ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space VolumeStack *stack,
- int path_flag)
+template<typename StackReadOp>
+ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ const int path_flag,
+ StackReadOp stack_read)
{
  /* If the path is being terminated, we are tracing a shadow ray, or we are
   * evaluating emission, then we don't need to store closures. The emission and shadow
@@ -1259,7 +764,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
/* reset closures once at the start, we will be accumulating the closures
@@ -1268,14 +773,18 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
sd->num_closure_left = max_closures;
sd->flag = 0;
sd->object_flag = 0;
- sd->type = PRIMITIVE_VOLUME;
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
+ for (int i = 0;; i++) {
+ const VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
/* setup shaderdata from stack. it's mostly setup already in
* shader_setup_from_volume, this switching should be quick */
- sd->object = stack[i].object;
+ sd->object = entry.object;
sd->lamp = LAMP_NONE;
- sd->shader = stack[i].shader;
+ sd->shader = entry.shader;
sd->flag &= ~SD_SHADER_FLAGS;
sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
@@ -1295,18 +804,19 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
# ifdef __SVM__
# ifdef __OSL__
if (kg->osl) {
- OSLShader::eval_volume(kg, sd, state, path_flag);
+ OSLShader::eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_VOLUME, path_flag);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
+ INTEGRATOR_STATE_PASS, sd, NULL, path_flag);
}
# endif
- /* merge closures to avoid exceeding number of closures limit */
+ /* Merge closures to avoid exceeding number of closures limit. */
if (i > 0)
- shader_merge_closures(sd);
+ shader_merge_volume_closures(sd);
}
}
@@ -1314,9 +824,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
/* Displacement Evaluation */
-ccl_device void shader_eval_displacement(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state)
+ccl_device void shader_eval_displacement(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
sd->num_closure = 0;
sd->num_closure_left = 0;
@@ -1325,11 +833,12 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
#ifdef __SVM__
# ifdef __OSL__
if (kg->osl)
- OSLShader::eval_displacement(kg, sd, state);
+ OSLShader::eval_displacement(INTEGRATOR_STATE_PASS, sd);
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_DISPLACEMENT, 0);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
+ INTEGRATOR_STATE_PASS, sd, NULL, 0);
}
#endif
}
@@ -1337,29 +846,13 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
/* Transparent Shadows */
#ifdef __TRANSPARENT_SHADOWS__
-ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect)
+ccl_device bool shader_transparent_shadow(const KernelGlobals *kg, Intersection *isect)
{
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- int shader = 0;
-
-# ifdef __HAIR__
- if (isect->type & PRIMITIVE_ALL_TRIANGLE) {
-# endif
- shader = kernel_tex_fetch(__tri_shader, prim);
-# ifdef __HAIR__
- }
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-# endif
- int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
- return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
+ return (intersection_get_shader_flags(kg, isect) & SD_HAS_TRANSPARENT_SHADOW) != 0;
}
#endif /* __TRANSPARENT_SHADOWS__ */
-ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader)
+ccl_device float shader_cryptomatte_id(const KernelGlobals *kg, int shader)
{
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
}
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
deleted file mode 100644
index 3b124122fba..00000000000
--- a/intern/cycles/kernel/kernel_shadow.h
+++ /dev/null
@@ -1,466 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME__
-/* Get PathState ready for use for volume stack evaluation. */
-# ifdef __SPLIT_KERNEL__
-ccl_addr_space
-# endif
- ccl_device_inline PathState *
- shadow_blocked_volume_path_state(KernelGlobals *kg,
- VolumeState *volume_state,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray)
-{
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space PathState *ps =
- &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-# else
- PathState *ps = &volume_state->ps;
-# endif
- *ps = *state;
-  /* We are checking for shadow on the "other" side of the surface, so we need
-   * to discard the volume we are currently in.
- */
- if (dot(sd->Ng, ray->D) < 0.0f) {
- kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
- }
- return ps;
-}
-#endif /* __VOLUME__ */
-
-/* Attenuate throughput accordingly to the given intersection event.
- * Returns true if the throughput is zero and traversal can be aborted.
- */
-ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
-#ifdef __VOLUME__
- ccl_addr_space PathState *volume_state,
-#endif
- Intersection *isect,
- Ray *ray,
- float3 *throughput)
-{
-#ifdef __VOLUME__
- /* Attenuation between last surface and next surface. */
- if (volume_state->volume_stack[0].shader != SHADER_NONE) {
- Ray segment_ray = *ray;
- segment_ray.t = isect->t;
- kernel_volume_shadow(kg, shadow_sd, volume_state, &segment_ray, throughput);
- }
-#endif
- /* Setup shader data at surface. */
- shader_setup_from_ray(kg, shadow_sd, isect, ray);
- /* Attenuation from transparent surface. */
- if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, shadow_sd, state, NULL, PATH_RAY_SHADOW);
- path_state_modify_bounce(state, false);
- *throughput *= shader_bsdf_transparency(kg, shadow_sd);
- }
- /* Stop if all light is blocked. */
- if (is_zero(*throughput)) {
- return true;
- }
-#ifdef __VOLUME__
- /* Exit/enter volume. */
- kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
-#endif
- return false;
-}
-
-/* Special version which only handles opaque shadows. */
-ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
-#ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
- kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
- }
-#endif
- return blocked;
-}
-
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
-/* Shadow function to compute how much light is blocked,
- *
- * We trace a single ray. If it hits any opaque surface, or more than a given
- * number of transparent surfaces is hit, then we consider the geometry to be
- * entirely blocked. If not, all transparent surfaces will be recorded and we
- * will shade them one by one to determine how much light is blocked. This all
- * happens in one scene intersection function.
- *
- * Recording all hits works well in some cases but may be slower in others. If
- * we have many semi-transparent hairs, one intersection may be faster because
-  * you'd be re-intersecting the same hairs a lot with each step otherwise. If
- * however there is mostly binary transparency then we may be recording many
- * unnecessary intersections when one of the first surfaces blocks all light.
- *
- * From tests in real scenes it seems the performance loss is either minimal,
- * or there is a performance increase anyway due to avoiding the need to send
- * two rays with transparent shadows.
- *
- * On CPU it'll handle all transparent bounces (by allocating storage for
- * intersections when they don't fit into the stack storage).
- *
- * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
-  * is something to keep an eye on.
- */
-
-# define SHADOW_STACK_MAX_HITS 64
-
-/* Actual logic with traversal loop implementation which is free from device
- * specific tweaks.
- *
- * Note that hits array should be as big as max_hits+1.
- */
-ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *hits,
- uint max_hits,
- float3 *shadow)
-{
- /* Intersect to find an opaque surface, or record all transparent
- * surface hits.
- */
- uint num_hits;
- const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits);
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- /* If no opaque surface found but we did find transparent hits,
- * shade them.
- */
- if (!blocked && num_hits > 0) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- float last_t = 0.0f;
- int bounce = state->transparent_bounce;
- Intersection *isect = hits;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- sort_intersections(hits, num_hits);
- for (int hit = 0; hit < num_hits; hit++, isect++) {
- /* Adjust intersection distance for moving ray forward. */
- float new_t = isect->t;
- isect->t -= last_t;
- /* Skip hit if we did not move forward, step by step raytracing
- * would have skipped it as well then.
- */
- if (last_t == new_t) {
- continue;
- }
- last_t = new_t;
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = shadow_sd->P;
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow = throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-/* Here we do all device specific trickery before invoking actual traversal
- * loop to help readability of the actual logic.
- */
-ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- uint max_hits,
- float3 *shadow)
-{
-# ifdef __SPLIT_KERNEL__
- Intersection hits_[SHADOW_STACK_MAX_HITS];
- Intersection *hits = &hits_[0];
-# elif defined(__KERNEL_CUDA__)
- Intersection *hits = kg->hits_stack;
-# else
- Intersection hits_stack[SHADOW_STACK_MAX_HITS];
- Intersection *hits = hits_stack;
-# endif
-# ifndef __KERNEL_GPU__
-  /* Prefer the stack, but fall back to dynamic allocation when max hits is too deep;
-   * we need max_hits + 1 storage space due to the logic in
-   * scene_intersect_shadow_all, which will first store and then check if
-   * the limit is exceeded.
- *
- * Ignore this on GPU because of slow/unavailable malloc().
- */
- if (max_hits + 1 > SHADOW_STACK_MAX_HITS) {
- if (kg->transparent_shadow_intersections == NULL) {
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- kg->transparent_shadow_intersections = (Intersection *)malloc(sizeof(Intersection) *
- (transparent_max_bounce + 1));
- }
- hits = kg->transparent_shadow_intersections;
- }
-# endif /* __KERNEL_GPU__ */
- /* Invoke actual traversal. */
- return shadow_blocked_transparent_all_loop(
- kg, sd, shadow_sd, state, visibility, ray, hits, max_hits, shadow);
-}
-# endif /* __SHADOW_RECORD_ALL__ */
-
-# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
-/* Shadow function to compute how much light is blocked,
- *
- * Here we raytrace from one transparent surface to the next step by step.
- * To minimize overhead in cases where we don't need transparent shadows, we
- * first trace a regular shadow ray. We check if the hit primitive was
-  * potentially transparent, and only in that case start marching. This gives
-  * one extra ray cast for the cases where we do want transparency.
- */
-
-/* This function is only implementing device-independent traversal logic
- * which requires some precalculation done.
- */
-ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- const bool blocked,
- const bool is_transparent_isect,
- float3 *shadow)
-{
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- if (blocked && is_transparent_isect) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- int bounce = state->transparent_bounce;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- for (;;) {
- if (bounce >= kernel_data.integrator.transparent_max_bounce) {
- return true;
- }
- if (!scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect)) {
- break;
- }
- if (!shader_transparent_shadow(kg, isect)) {
- return true;
- }
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow *= throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-ccl_device bool shadow_blocked_transparent_stepped(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
- bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, isect) : false;
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, isect, blocked, is_transparent_isect, shadow);
-}
-
-# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *shadow)
-{
- *shadow = one_float3();
-#if !defined(__KERNEL_OPTIX__)
- /* Some common early checks.
- * Avoid conditional trace call in OptiX though, since those hurt performance there.
- */
- if (ray->t == 0.0f) {
- return false;
- }
-#endif
-#ifdef __SHADOW_TRICKS__
- const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER :
- PATH_RAY_SHADOW;
-#else
- const uint visibility = PATH_RAY_SHADOW;
-#endif
- /* Do actual shadow shading.
- * First of all, we check if integrator requires transparent shadows.
-  * If not, we use the simplest and fastest way to calculate occlusion.
- * Do not do this in OptiX to avoid the additional trace call.
- */
-#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__)
- Intersection isect;
-# ifdef __TRANSPARENT_SHADOWS__
- if (!kernel_data.integrator.transparent_shadows)
-# endif
- {
- return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow);
- }
-#endif
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
- /* For the transparent shadows we try to use record-all logic on the
- * devices which supports this.
- */
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- /* Check transparent bounces here, for volume scatter which can do
- * lighting before surface path termination is checked.
- */
- if (state->transparent_bounce >= transparent_max_bounce) {
- return true;
- }
- uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-# if defined(__KERNEL_OPTIX__)
- /* Always use record-all behavior in OptiX, but ensure there are no out of bounds
- * accesses to the hit stack.
- */
- max_hits = min(max_hits, SHADOW_STACK_MAX_HITS - 1);
-# elif defined(__KERNEL_GPU__)
-  * On the GPU we use a trick of tracing an opaque ray first, which avoids speed
- * regressions in some files.
- *
- * TODO(sergey): Check why using record-all behavior causes slowdown in such
- * cases. Could that be caused by a higher spill pressure?
- */
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect);
- const bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, &isect) : false;
- if (!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) {
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, &isect, blocked, is_transparent_isect, shadow);
- }
-# endif /* __KERNEL_GPU__ */
- return shadow_blocked_transparent_all(
- kg, sd, shadow_sd, state, visibility, ray, max_hits, shadow);
-# else /* __SHADOW_RECORD_ALL__ */
- /* Fallback to a slowest version which works on all devices. */
- return shadow_blocked_transparent_stepped(
- kg, sd, shadow_sd, state, visibility, ray, &isect, shadow);
-# endif /* __SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-}
-
-#undef SHADOW_STACK_MAX_HITS
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shadow_catcher.h b/intern/cycles/kernel/kernel_shadow_catcher.h
new file mode 100644
index 00000000000..824749818a4
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shadow_catcher.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state_util.h"
+#include "kernel/kernel_path_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Check whether current surface bounce is where path is to be split for the shadow catcher. */
+ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_ARGS,
+ const int object_flag)
+{
+#ifdef __SHADOW_CATCHER__
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
+
+  /* Check the flag first, avoiding fetches from global memory. */
+ if ((object_flag & SD_OBJECT_SHADOW_CATCHER) == 0) {
+ return false;
+ }
+ if (object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if ((path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) == 0) {
+    /* Split only on primary rays; on secondary bounces the shadow catcher is treated as a
+     * regular object. */
+ return false;
+ }
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ return false;
+ }
+
+ return true;
+#else
+ (void)object_flag;
+ return false;
+#endif
+}
+
+/* Check whether the current path can still split. */
+ccl_device_inline bool kernel_shadow_catcher_path_can_split(INTEGRATOR_STATE_CONST_ARGS)
+{
+ if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
+ /* Shadow catcher was already hit and the state was split. No further split is allowed. */
+ return false;
+ }
+
+ return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
+}
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. */
+ccl_device_inline bool kernel_shadow_catcher_split(INTEGRATOR_STATE_ARGS, const int object_flags)
+{
+#ifdef __SHADOW_CATCHER__
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, object_flags)) {
+ return false;
+ }
+
+ /* The split is to be done. Mark the current state as such, so that it stops contributing to the
+ * shadow catcher matte pass, but keeps contributing to the combined pass. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+ /* Split new state from the current one. This new state will only track contribution of shadow
+ * catcher objects ignoring non-catcher objects. */
+ integrator_state_shadow_catcher_split(INTEGRATOR_STATE_PASS);
+
+ return true;
+#else
+ (void)object_flags;
+ return false;
+#endif
+}
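/* Illustrative, standalone sketch of the split decision above, reduced to plain flag tests.
 * The OBJECT_* and PATH_* constants are hypothetical stand-ins for the SD_OBJECT_* and
 * PATH_RAY_* flags used by the kernel. */
#include <cstdint>

enum : uint32_t {
  OBJECT_SHADOW_CATCHER = 1u << 0,
  OBJECT_HOLDOUT_MASK = 1u << 1,
};

enum : uint32_t {
  PATH_TRANSPARENT_BACKGROUND = 1u << 0,
  PATH_SHADOW_CATCHER_PASS = 1u << 1,
};

static bool should_split_for_shadow_catcher(bool scene_has_catcher,
                                            uint32_t object_flag,
                                            uint32_t path_flag)
{
  if (!scene_has_catcher) {
    return false;
  }
  if (!(object_flag & OBJECT_SHADOW_CATCHER) || (object_flag & OBJECT_HOLDOUT_MASK)) {
    return false;
  }
  /* Only primary rays that still see the transparent background may split, and a path that is
   * already the dedicated shadow-catcher pass never splits again. */
  if (!(path_flag & PATH_TRANSPARENT_BACKGROUND)) {
    return false;
  }
  return !(path_flag & PATH_SHADOW_CATCHER_PASS);
}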
+
+#ifdef __SHADOW_CATCHER__
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_HIT) == 0;
+}
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_PASS;
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
deleted file mode 100644
index 677504a4045..00000000000
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* BSSRDF using disk based importance sampling.
- *
- * BSSRDF Importance Sampling, SIGGRAPH 2013
- * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
- */
-
-ccl_device_inline float3
-subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, float r, bool all)
-{
- /* This is the Veach one-sample model with balance heuristic, some pdf
- * factors drop out when using balance heuristic weighting. For branched
- * path tracing (all) we sample all closure and don't use MIS. */
- float3 eval_sum = zero_float3();
- float pdf_sum = 0.0f;
- float sample_weight_inv = 0.0f;
-
- if (!all) {
- float sample_weight_sum = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- sample_weight_sum += sc->sample_weight;
- }
- }
-
- sample_weight_inv = 1.0f / sample_weight_sum;
- }
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- /* in case of branched path integrate we sample all bssrdf's once,
- * for path trace we pick one, so adjust pdf for that */
- float sample_weight = (all) ? 1.0f : sc->sample_weight * sample_weight_inv;
-
- /* compute pdf */
- float3 eval = bssrdf_eval(sc, r);
- float pdf = bssrdf_pdf(sc, disk_r);
-
- eval_sum += sc->weight * eval;
- pdf_sum += sample_weight * pdf;
- }
- }
-
- return (pdf_sum > 0.0f) ? eval_sum / pdf_sum : zero_float3();
-}
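For reference, the estimator this computes: if closure i is chosen with probability q_i = sample_weight_i / Σ_j sample_weight_j and the disk radius is drawn from its pdf p_i, the balance-heuristic weight q_i p_i / Σ_j q_j p_j cancels the q_i p_i selection/sampling factor of the one-sample estimator, leaving f / Σ_j q_j p_j, which is exactly eval_sum / pdf_sum above (and with `all` set, every q_j is simply 1).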
-
-ccl_device_inline float3 subsurface_scatter_walk_eval(ShaderData *sd,
- const ShaderClosure *sc,
- float3 throughput,
- bool all)
-{
- /* This is the Veach one-sample model with balance heuristic, some pdf
- * factors drop out when using balance heuristic weighting. For branched
- * path tracing (all) we sample all closure and don't use MIS. */
- if (!all) {
- float bssrdf_weight = 0.0f;
- float weight = sc->sample_weight;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSSRDF(sc->type)) {
- bssrdf_weight += sc->sample_weight;
- }
- }
- throughput *= bssrdf_weight / weight;
- }
- return throughput;
-}
-
-/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(
- KernelGlobals *kg, ShaderData *sd, ClosureType type, float roughness, float3 weight, float3 N)
-{
- sd->flag &= ~SD_CLOSURE_FLAGS;
- sd->num_closure = 0;
- sd->num_closure_left = kernel_data.integrator.max_closures;
-
-#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
- sd, sizeof(PrincipledDiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- bsdf->roughness = roughness;
- sd->flag |= bsdf_principled_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular Disney principled diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
- }
- }
- else if (CLOSURE_IS_BSDF_BSSRDF(type) || CLOSURE_IS_BSSRDF(type))
-#endif /* __PRINCIPLED__ */
- {
- DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- sd->flag |= bsdf_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
- }
- }
-}
-
-/* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */
-ccl_device float3 subsurface_color_pow(float3 color, float exponent)
-{
- color = max(color, zero_float3());
-
- if (exponent == 1.0f) {
- /* nothing to do */
- }
- else if (exponent == 0.5f) {
- color.x = sqrtf(color.x);
- color.y = sqrtf(color.y);
- color.z = sqrtf(color.z);
- }
- else {
- color.x = powf(color.x, exponent);
- color.y = powf(color.y, exponent);
- color.z = powf(color.z, exponent);
- }
-
- return color;
-}
-
-ccl_device void subsurface_color_bump_blur(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float3 *eval, float3 *N)
-{
- /* average color and texture blur at outgoing point */
- float texture_blur;
- float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur);
-
- /* do we have bump mapping? */
- bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0;
-
- if (bump || texture_blur > 0.0f) {
- /* average color and normal at incoming point */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- float3 in_color = shader_bssrdf_sum(sd, (bump) ? N : NULL, NULL);
-
- /* we simply divide out the average color and multiply with the average
- * of the other one. we could try to do this per closure but it's quite
- * tricky to match closures between shader evaluations, their number and
- * order may change, this is simpler */
- if (texture_blur > 0.0f) {
- out_color = subsurface_color_pow(out_color, texture_blur);
- in_color = subsurface_color_pow(in_color, texture_blur);
-
- *eval *= safe_divide_color(in_color, out_color);
- }
- }
-}
-
-/* Subsurface scattering step, from a point on the surface to other
- * nearby points on the same object.
- */
-ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- const ShaderClosure *sc,
- uint *lcg_state,
- float disk_u,
- float disk_v,
- bool all)
-{
- /* pick random axis in local frame and point on disk */
- float3 disk_N, disk_T, disk_B;
- float pick_pdf_N, pick_pdf_T, pick_pdf_B;
-
- disk_N = sd->Ng;
- make_orthonormals(disk_N, &disk_T, &disk_B);
-
- if (disk_v < 0.5f) {
- pick_pdf_N = 0.5f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.25f;
- disk_v *= 2.0f;
- }
- else if (disk_v < 0.75f) {
- float3 tmp = disk_N;
- disk_N = disk_T;
- disk_T = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.5f;
- pick_pdf_B = 0.25f;
- disk_v = (disk_v - 0.5f) * 4.0f;
- }
- else {
- float3 tmp = disk_N;
- disk_N = disk_B;
- disk_B = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.5f;
- disk_v = (disk_v - 0.75f) * 4.0f;
- }
-
- /* sample point on disk */
- float phi = M_2PI_F * disk_v;
- float disk_height, disk_r;
-
- bssrdf_sample(sc, disk_u, &disk_r, &disk_height);
-
- float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
-
- /* create ray */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = sd->P + disk_N * disk_height + disk_P;
- ray->D = -disk_N;
- ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
- ray->time = sd->time;
-
- /* intersect with the same object. if multiple intersections are found it
- * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
- scene_intersect_local(kg, ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS);
- int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
-
- for (int hit = 0; hit < num_eval_hits; hit++) {
- /* Quickly retrieve P and Ng without setting up ShaderData. */
- float3 hit_P;
- if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray);
- }
-#ifdef __OBJECT_MOTION__
- else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
- float3 verts[3];
- motion_triangle_vertices(kg,
- sd->object,
- kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
- sd->time,
- verts);
- hit_P = motion_triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray, verts);
- }
-#endif /* __OBJECT_MOTION__ */
- else {
- ss_isect->weight[hit] = zero_float3();
- continue;
- }
-
- float3 hit_Ng = ss_isect->Ng[hit];
- if (ss_isect->hits[hit].object != OBJECT_NONE) {
- object_normal_transform(kg, sd, &hit_Ng);
- }
-
- /* Probability densities for local frame axes. */
- float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
- float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
- float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
-
- /* Multiple importance sample between 3 axes, power heuristic
- * found to be slightly better than balance heuristic. pdf_N
- * in the MIS weight and denominator cancelled out. */
- float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
- if (ss_isect->num_hits > BSSRDF_MAX_HITS) {
- w *= ss_isect->num_hits / (float)BSSRDF_MAX_HITS;
- }
-
- /* Real distance to sampled point. */
- float r = len(hit_P - sd->P);
-
- /* Evaluate profiles. */
- float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
-
- ss_isect->weight[hit] = eval;
- }
-
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return num_eval_hits;
-}
-
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
- optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness);
-}
-extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup(
-#else
-ccl_device_noinline void subsurface_scatter_multi_setup(
-#endif
- KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
-
- /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
-#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
- kernel_split_params.dummy_sd_flag = sd->flag;
-#endif
-
- /* Setup new shading point. */
- shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
-
- /* Optionally blur colors and bump mapping. */
- float3 weight = ss_isect->weight[hit];
- float3 N = sd->N;
- subsurface_color_bump_blur(kg, sd, state, &weight, &N);
-
- /* Setup diffuse BSDF. */
- subsurface_scatter_setup_diffuse_bsdf(kg, sd, type, roughness, weight, N);
-}
-
-/* Random walk subsurface scattering.
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
-
-ccl_device void subsurface_random_walk_remap(const float A,
- const float d,
- float *sigma_t,
- float *alpha)
-{
- /* Compute attenuation and scattering coefficients from albedo. */
- *alpha = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f)));
- const float s = 1.9f - A + 3.5f * sqr(A - 0.8f);
-
- *sigma_t = 1.0f / fmaxf(d * s, 1e-16f);
-}
-
-ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc,
- float3 *sigma_t,
- float3 *alpha,
- float3 *weight)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- const float3 A = bssrdf->albedo;
- const float3 d = bssrdf->radius;
- float sigma_t_x, sigma_t_y, sigma_t_z;
- float alpha_x, alpha_y, alpha_z;
-
- subsurface_random_walk_remap(A.x, d.x, &sigma_t_x, &alpha_x);
- subsurface_random_walk_remap(A.y, d.y, &sigma_t_y, &alpha_y);
- subsurface_random_walk_remap(A.z, d.z, &sigma_t_z, &alpha_z);
-
- *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
- *alpha = make_float3(alpha_x, alpha_y, alpha_z);
-
- /* Closure mixing and Fresnel weights separate from albedo. */
- *weight = safe_divide_color(bssrdf->weight, A);
-}
-
-/* References for Dwivedi sampling:
- *
- * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
- * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
- * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
- *
- * [2] "Improving the Dwivedi Sampling Scheme"
- * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
- * https://cg.ivd.kit.edu/1951.php
- *
- * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
- * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
- * https://iliyan.com/publications/RenderingCourse2020
- */
-
-ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
-{
- /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
- return 1.0f / ((v - cos_theta) * phase_log);
-}
-
-ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
-{
- /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
- * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
- * we can implement the power function like this. */
- return v - (v + 1) * expf(-rand * phase_log);
-}
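The rewrite above relies only on the identity pow(x, rand) = exp(rand * log(x)) with x = (v - 1) / (v + 1): since log((v - 1) / (v + 1)) = -phase_log, the term pow((v - 1) / (v + 1), rand) equals expf(-rand * phase_log), which is the form used in the return statement.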
-
-ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
-{
- /* Eq. 67 from [3] */
- return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
-}
-
-ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
-{
- float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
- float phi = M_2PI_F * randv;
- float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
-
- float3 T, B;
- make_orthonormals(D, &T, &B);
- return dir.x * T + dir.y * B + dir.z * D;
-}
-
-ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
- float t,
- bool hit,
- float3 *transmittance)
-{
- float3 T = volume_color_transmittance(sigma_t, t);
- if (transmittance) {
- *transmittance = T;
- }
- return hit ? T : sigma_t * T;
-}
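Here T = exp(-sigma_t * t) per channel (Beer-Lambert transmittance). When the step exited through the surface (`hit`), the relevant probability is that of sampling any free-flight distance beyond the boundary, which is the transmittance itself; otherwise it is the free-flight pdf sigma_t * exp(-sigma_t * t) of scattering at exactly distance t.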
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- bool
- subsurface_random_walk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- const float bssrdf_u,
- const float bssrdf_v,
- bool all)
-{
- /* Sample diffuse surface scatter into the object. */
- float3 D;
- float pdf;
- sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf);
- if (dot(-sd->Ng, D) <= 0.0f) {
- return 0;
- }
-
- /* Convert subsurface to volume coefficients.
- * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
- float3 sigma_t, alpha;
- float3 throughput = one_float3();
- subsurface_random_walk_coefficients(sc, &sigma_t, &alpha, &throughput);
- float3 sigma_s = sigma_t * alpha;
-
- /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
- * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
- * for making the code significantly more complex and slower (if direction sampling depends on
- * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
- *
- * Since the strength of the guided sampling increases as alpha gets lower, using a value that
- * is too low results in fireflies while one that's too high just gives a bit more noise.
- * Therefore, the code here uses the highest of the three albedos to be safe. */
- float diffusion_length = diffusion_length_dwivedi(max3(alpha));
- /* Precompute term for phase sampling. */
- float phase_log = logf((diffusion_length + 1) / (diffusion_length - 1));
-
- /* Setup ray. */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->D = D;
- ray->t = FLT_MAX;
- ray->time = sd->time;
-
- /* Modify state for RNGs, decorrelated from other paths. */
- uint prev_rng_offset = state->rng_offset;
- uint prev_rng_hash = state->rng_hash;
- state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef);
-
- /* Random walk until we hit the surface again. */
- bool hit = false;
- bool have_opposite_interface = false;
- float opposite_distance = 0.0f;
-
- /* Todo: Disable for alpha>0.999 or so? */
- const float guided_fraction = 0.75f;
-
- for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
- /* Advance random number offset. */
- state->rng_offset += PRNG_BOUNCE_NUM;
-
- /* Sample color channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float randt = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- /* We need the result of the raycast to compute the full guided PDF, so just remember the
- * relevant terms to avoid recomputing them later. */
- float backward_fraction = 0.0f;
- float forward_pdf_factor = 0.0f;
- float forward_stretching = 1.0f;
- float backward_pdf_factor = 0.0f;
- float backward_stretching = 1.0f;
-
- /* For the initial ray, we already know the direction, so just do classic distance sampling. */
- if (bounce > 0) {
- /* Decide whether we should use guided or classic sampling. */
- bool guided = (path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE) < guided_fraction);
-
- /* Determine if we want to sample away from the incoming interface.
- * This only happens if we found a nearby opposite interface, and the probability for it
- * depends on how close we are to it already.
- * This probability term comes from the recorded presentation of [3]. */
- bool guide_backward = false;
- if (have_opposite_interface) {
- /* Compute distance of the random walk between the tangent plane at the starting point
- * and the assumed opposite interface (the parallel plane that contains the point we
- * found in our ray query for the opposite side). */
- float x = clamp(dot(ray->P - sd->P, -sd->N), 0.0f, opposite_distance);
- backward_fraction = 1.0f / (1.0f + expf((opposite_distance - 2 * x) / diffusion_length));
- guide_backward = path_state_rng_1D(kg, state, PRNG_TERMINATE) < backward_fraction;
- }
-
- /* Sample scattering direction. */
- float scatter_u, scatter_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v);
- float cos_theta;
- if (guided) {
- cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
- /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
- * sign here is enough to sample from that instead. */
- if (guide_backward) {
- cos_theta = -cos_theta;
- }
- }
- else {
- cos_theta = 2.0f * scatter_u - 1.0f;
- }
- ray->D = direction_from_cosine(sd->N, cos_theta, scatter_v);
-
- /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
- * Since phase sampling is channel-independent, we can get away with applying a factor
- * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
- * it cancel with an equivalent term in the numerator of the full estimator.
- * For the backward PDF, we again reuse the same probability distribution with a sign swap.
- */
- forward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta);
- backward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta);
-
- /* Prepare distance sampling.
- * For the backwards case, this also needs the sign swapped since now directions against
- * sd->N (and therefore with negative cos_theta) are preferred. */
- forward_stretching = (1.0f - cos_theta / diffusion_length);
- backward_stretching = (1.0f + cos_theta / diffusion_length);
- if (guided) {
- sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
- }
- }
-
- /* Sample direction along ray. */
- float t = -logf(1.0f - randt) / sample_sigma_t;
-
- /* On the first bounce, we use the raycast to check if the opposite side is nearby.
- * If yes, we will later use backwards guided sampling in order to have a decent
- * chance of connecting to it.
- * Todo: Maybe use less than 10 times the mean free path? */
- ray->t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
- scene_intersect_local(kg, ray, ss_isect, sd->object, NULL, 1);
- hit = (ss_isect->num_hits > 0);
-
- if (hit) {
-#ifdef __KERNEL_OPTIX__
- /* t is always in world space with OptiX. */
- ray->t = ss_isect->hits[0].t;
-#else
- /* Compute world space distance to surface hit. */
- float3 D = ray->D;
- object_inverse_dir_transform(kg, sd, &D);
- D = normalize(D) * ss_isect->hits[0].t;
- object_dir_transform(kg, sd, &D);
- ray->t = len(D);
-#endif
- }
-
- if (bounce == 0) {
- /* Check if we hit the opposite side. */
- if (hit) {
- have_opposite_interface = true;
- opposite_distance = dot(ray->P + ray->t * ray->D - sd->P, -sd->N);
- }
- /* Apart from the opposite side check, we were supposed to only trace up to distance t,
- * so check if there would have been a hit in that case. */
- hit = ray->t < t;
- }
-
- /* Use the distance to the exit point for the throughput update if we found one. */
- if (hit) {
- t = ray->t;
- }
- else if (bounce == 0) {
- /* Restore original position if nothing was hit after the first bounce,
- * without the ray_offset() that was added to avoid self-intersection.
- * Otherwise if that offset is relatively large compared to the scattering
- * radius, we never go back up high enough to exit the surface. */
- ray->P = sd->P;
- }
-
- /* Advance to new scatter location. */
- ray->P += t * ray->D;
-
- float3 transmittance;
- float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
- if (bounce > 0) {
- /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
- float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
-
- if (have_opposite_interface) {
- /* First step of MIS: Depending on geometry we might have two methods for guided
- * sampling, so perform MIS between them. */
- float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
- guided_pdf = mix(
- guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
- }
- else {
- /* Just include phase sampling factor otherwise. */
- guided_pdf *= forward_pdf_factor;
- }
-
- /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
- pdf = mix(pdf, guided_pdf, guided_fraction);
- }
-
- /* Finally, we're applying MIS again to combine the three color channels.
- * Altogether, the MIS computation combines up to nine different estimators:
- * {classic, guided, backward_guided} x {r, g, b} */
- throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
-
- if (hit) {
- /* If we hit the surface, we are done. */
- break;
- }
- else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
- throughput.y < VOLUME_THROUGHPUT_EPSILON &&
- throughput.z < VOLUME_THROUGHPUT_EPSILON) {
- /* Avoid unnecessary work and precision issue when throughput gets really small. */
- break;
- }
- }
-
- kernel_assert(isfinite_safe(throughput.x) && isfinite_safe(throughput.y) &&
- isfinite_safe(throughput.z));
-
- state->rng_offset = prev_rng_offset;
- state->rng_hash = prev_rng_hash;
-
- /* Return number of hits in ss_isect. */
- if (!hit) {
- return 0;
- }
-
- /* TODO: gain back performance lost from merging with disk BSSRDF. We
- * only need to return on hit so this indirect ray push/pop overhead
- * is not actually needed, but it does keep the code simpler. */
- ss_isect->weight[0] = subsurface_scatter_walk_eval(sd, sc, throughput, all);
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return 1;
-}
-
-ccl_device_inline int subsurface_scatter_multi_intersect(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- uint *lcg_state,
- float bssrdf_u,
- float bssrdf_v,
- bool all)
-{
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- return subsurface_scatter_disk(kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all);
- }
- else {
- return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v, all);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index c8e01677d09..bf9b94c1753 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -78,7 +78,7 @@ KERNEL_TEX(KernelShader, __shaders)
KERNEL_TEX(float, __lookup_table)
/* sobol */
-KERNEL_TEX(uint, __sample_pattern_lut)
+KERNEL_TEX(float, __sample_pattern_lut)
/* image textures */
KERNEL_TEX(TextureInfo, __texture_info)
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 7cbe18acf28..927e60e8729 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_TYPES_H__
-#define __KERNEL_TYPES_H__
+#pragma once
#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
# include <embree3/rtcore.h>
@@ -60,27 +59,9 @@ CCL_NAMESPACE_BEGIN
#define PRIM_NONE (~0)
#define LAMP_NONE (~0)
#define ID_NONE (0.0f)
+#define PASS_UNUSED (~0)
-#define VOLUME_STACK_SIZE 32
-
-/* Split kernel constants */
-#define WORK_POOL_SIZE_GPU 64
-#define WORK_POOL_SIZE_CPU 1
-#ifdef __KERNEL_GPU__
-# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU
-#else
-# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
-#endif
-
-#define SHADER_SORT_BLOCK_SIZE 2048
-
-#ifdef __KERNEL_OPENCL__
-# define SHADER_SORT_LOCAL_SIZE 64
-#elif defined(__KERNEL_CUDA__)
-# define SHADER_SORT_LOCAL_SIZE 32
-#else
-# define SHADER_SORT_LOCAL_SIZE 1
-#endif
+#define VOLUME_STACK_SIZE 4
/* Kernel features */
#define __SOBOL__
@@ -93,7 +74,7 @@ CCL_NAMESPACE_BEGIN
#define __INTERSECTION_REFINE__
#define __CLAMP_SAMPLE__
#define __PATCH_EVAL__
-#define __SHADOW_TRICKS__
+#define __SHADOW_CATCHER__
#define __DENOISING_FEATURES__
#define __SHADER_RAYTRACE__
#define __AO__
@@ -102,7 +83,6 @@ CCL_NAMESPACE_BEGIN
#define __SVM__
#define __EMISSION__
#define __HOLDOUT__
-#define __MULTI_CLOSURE__
#define __TRANSPARENT_SHADOWS__
#define __BACKGROUND_MIS__
#define __LAMP_MIS__
@@ -112,7 +92,6 @@ CCL_NAMESPACE_BEGIN
#define __PRINCIPLED__
#define __SUBSURFACE__
#define __VOLUME__
-#define __VOLUME_SCATTER__
#define __CMJ__
#define __SHADOW_RECORD_ALL__
#define __BRANCHED_PATH__
@@ -122,106 +101,60 @@ CCL_NAMESPACE_BEGIN
# ifdef WITH_OSL
# define __OSL__
# endif
-# define __VOLUME_DECOUPLED__
# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
-#ifdef __KERNEL_CUDA__
-# ifdef __SPLIT_KERNEL__
-# undef __BRANCHED_PATH__
-# endif
-#endif /* __KERNEL_CUDA__ */
-
#ifdef __KERNEL_OPTIX__
# undef __BAKING__
-# undef __BRANCHED_PATH__
#endif /* __KERNEL_OPTIX__ */
-#ifdef __KERNEL_OPENCL__
-#endif /* __KERNEL_OPENCL__ */
-
/* Scene-based selective features compilation. */
-#ifdef __NO_CAMERA_MOTION__
-# undef __CAMERA_MOTION__
-#endif
-#ifdef __NO_OBJECT_MOTION__
-# undef __OBJECT_MOTION__
-#endif
-#ifdef __NO_HAIR__
-# undef __HAIR__
-#endif
-#ifdef __NO_VOLUME__
-# undef __VOLUME__
-# undef __VOLUME_SCATTER__
-#endif
-#ifdef __NO_SUBSURFACE__
-# undef __SUBSURFACE__
-#endif
-#ifdef __NO_BAKING__
-# undef __BAKING__
-#endif
-#ifdef __NO_BRANCHED_PATH__
-# undef __BRANCHED_PATH__
-#endif
-#ifdef __NO_PATCH_EVAL__
-# undef __PATCH_EVAL__
-#endif
-#ifdef __NO_TRANSPARENT__
-# undef __TRANSPARENT_SHADOWS__
-#endif
-#ifdef __NO_SHADOW_TRICKS__
-# undef __SHADOW_TRICKS__
-#endif
-#ifdef __NO_PRINCIPLED__
-# undef __PRINCIPLED__
-#endif
-#ifdef __NO_DENOISING__
-# undef __DENOISING_FEATURES__
-#endif
-#ifdef __NO_SHADER_RAYTRACE__
-# undef __SHADER_RAYTRACE__
+#ifdef __KERNEL_FEATURES__
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_CAMERA_MOTION)
+# undef __CAMERA_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_OBJECT_MOTION)
+# undef __OBJECT_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_HAIR)
+# undef __HAIR__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_VOLUME)
+# undef __VOLUME__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE)
+# undef __SUBSURFACE__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING)
+# undef __BAKING__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION)
+# undef __PATCH_EVAL__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_TRANSPARENT)
+# undef __TRANSPARENT_SHADOWS__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SHADOW_CATCHER)
+# undef __SHADOW_CATCHER__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PRINCIPLED)
+# undef __PRINCIPLED__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_DENOISING)
+# undef __DENOISING_FEATURES__
+# endif
#endif
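As an illustration (hypothetical feature mask, not taken from the build scripts), a kernel specialized for a scene that only needs camera motion blur and hair could be compiled with something like

  #define __KERNEL_FEATURES__
  #define __KERNEL_FEATURES (KERNEL_FEATURE_CAMERA_MOTION | KERNEL_FEATURE_HAIR)

in which case the block above keeps __CAMERA_MOTION__ and __HAIR__ and #undefs the remaining feature macros such as __VOLUME__ and __SUBSURFACE__.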
#ifdef WITH_CYCLES_DEBUG_NAN
# define __KERNEL_DEBUG_NAN__
#endif
+/* Features that enable others */
+
#if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__)
# define __BVH_LOCAL__
#endif
-/* Shader Evaluation */
-
-typedef enum ShaderEvalType {
- SHADER_EVAL_DISPLACE,
- SHADER_EVAL_BACKGROUND,
- /* bake types */
- SHADER_EVAL_BAKE, /* no real shade, it's used in the code to
- * differentiate the type of shader eval from the above
- */
- /* data passes */
- SHADER_EVAL_NORMAL,
- SHADER_EVAL_UV,
- SHADER_EVAL_ROUGHNESS,
- SHADER_EVAL_DIFFUSE_COLOR,
- SHADER_EVAL_GLOSSY_COLOR,
- SHADER_EVAL_TRANSMISSION_COLOR,
- SHADER_EVAL_EMISSION,
- SHADER_EVAL_AOV_COLOR,
- SHADER_EVAL_AOV_VALUE,
-
- /* light passes */
- SHADER_EVAL_AO,
- SHADER_EVAL_COMBINED,
- SHADER_EVAL_SHADOW,
- SHADER_EVAL_DIFFUSE,
- SHADER_EVAL_GLOSSY,
- SHADER_EVAL_TRANSMISSION,
-
- /* extra */
- SHADER_EVAL_ENVIRONMENT,
-} ShaderEvalType;
-
/* Path Tracing
* note we need to keep the u/v pairs at even values */
@@ -252,8 +185,7 @@ enum PathTraceDimension {
enum SamplingPattern {
SAMPLING_PATTERN_SOBOL = 0,
- SAMPLING_PATTERN_CMJ = 1,
- SAMPLING_PATTERN_PMJ = 2,
+ SAMPLING_PATTERN_PMJ = 1,
SAMPLING_NUM_PATTERNS,
};
@@ -261,7 +193,12 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- /* Ray visibility. */
+ /* --------------------------------------------------------------------
+ * Ray visibility.
+ *
+ * NOTE: Recalculated after a surface bounce.
+ */
+
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
@@ -269,57 +206,106 @@ enum PathRayFlag {
PATH_RAY_GLOSSY = (1 << 4),
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
+ PATH_RAY_VOLUME_SCATTER = (1 << 7),
/* Shadow ray visibility. */
- PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7),
- PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8),
- PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | PATH_RAY_SHADOW_OPAQUE_CATCHER),
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9),
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10),
- PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
- PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+ PATH_RAY_SHADOW_OPAQUE = (1 << 8),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 9),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE | PATH_RAY_SHADOW_TRANSPARENT),
- /* Unused, free to reuse. */
- PATH_RAY_UNUSED = (1 << 11),
+ /* Special flag to tag unaligned BVH nodes.
+ * Only set and used in BVH nodes to distinguish how to interpret bounding box information stored
+   * in the node (whether it should be intersected as an AABB or as an OBB). */
+ PATH_RAY_NODE_UNALIGNED = (1 << 10),
- /* Ray visibility for volume scattering. */
- PATH_RAY_VOLUME_SCATTER = (1 << 12),
-
- /* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = (1 << 13),
+  /* Subset of flags used for ray visibility during intersection.
+ *
+ * NOTE: SHADOW_CATCHER macros below assume there are no more than
+ * 16 visibility bits. */
+ PATH_RAY_ALL_VISIBILITY = ((1 << 11) - 1),
- PATH_RAY_ALL_VISIBILITY = ((1 << 14) - 1),
+ /* --------------------------------------------------------------------
+ * Path flags.
+ */
/* Don't apply multiple importance sampling weights to emission from
* lamp or surface hits, because they were not direct light sampled. */
- PATH_RAY_MIS_SKIP = (1 << 14),
+ PATH_RAY_MIS_SKIP = (1 << 11),
+
/* Diffuse bounce earlier in the path, skip SSS to improve performance
* and avoid branching twice with disk sampling SSS. */
- PATH_RAY_DIFFUSE_ANCESTOR = (1 << 15),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 12),
+
/* Single pass has been written. */
- PATH_RAY_SINGLE_PASS_DONE = (1 << 16),
- /* Ray is behind a shadow catcher. */
- PATH_RAY_SHADOW_CATCHER = (1 << 17),
- /* Store shadow data for shadow catcher or denoising. */
- PATH_RAY_STORE_SHADOW_INFO = (1 << 18),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 13),
+
/* Zero background alpha, for camera or transparent glass rays. */
- PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 19),
+ PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 14),
+
/* Terminate ray immediately at next bounce. */
- PATH_RAY_TERMINATE_IMMEDIATE = (1 << 20),
+ PATH_RAY_TERMINATE_ON_NEXT_SURFACE = (1 << 15),
+ PATH_RAY_TERMINATE_IN_NEXT_VOLUME = (1 << 16),
+
/* Ray is to be terminated, but continue with transparent bounces and
* emission as long as we encounter them. This is required to make the
* MIS between direct and indirect light rays match, as shadow rays go
* through transparent surfaces to reach emission too. */
- PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21),
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 17),
+
+ /* Terminate ray immediately after volume shading. */
+ PATH_RAY_TERMINATE_AFTER_VOLUME = (1 << 18),
+
/* Ray is to be terminated. */
- PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE | PATH_RAY_TERMINATE_AFTER_TRANSPARENT),
+ PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_ON_NEXT_SURFACE | PATH_RAY_TERMINATE_IN_NEXT_VOLUME |
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT | PATH_RAY_TERMINATE_AFTER_VOLUME),
+
/* Path and shader is being evaluated for direct lighting emission. */
- PATH_RAY_EMISSION = (1 << 22)
+ PATH_RAY_EMISSION = (1 << 19),
+
+ /* Perform subsurface scattering. */
+ PATH_RAY_SUBSURFACE = (1 << 20),
+
+ /* Contribute to denoising features. */
+ PATH_RAY_DENOISING_FEATURES = (1 << 21),
+
+ /* Render pass categories. */
+ PATH_RAY_REFLECT_PASS = (1 << 22),
+ PATH_RAY_TRANSMISSION_PASS = (1 << 23),
+ PATH_RAY_VOLUME_PASS = (1 << 24),
+ PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
+
+ /* Shadow ray is for a light or surface. */
+ PATH_RAY_SHADOW_FOR_LIGHT = (1 << 25),
+
+ /* A shadow catcher object was hit and the path was split into two. */
+ PATH_RAY_SHADOW_CATCHER_HIT = (1 << 26),
+
+ /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
+ * their dedicated pass for later division.
+ *
+   * NOTE: Is not covered by `PATH_RAY_ANY_PASS` because the shadow catcher requires special
+   * handling which is separate from the light passes. */
+ PATH_RAY_SHADOW_CATCHER_PASS = (1 << 27),
+
+ /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
+ PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1 << 28),
};
+/* Configure ray visibility bits for rays and objects respectively,
+ * to make shadow catchers work.
+ *
+ * On shadow catcher paths we want to ignore any intersections with non-catchers,
+ * whereas on regular paths we want to intersect all objects. */
+
+#define SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) ((visibility) << 16)
+
+#define SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility) \
+ (((path_flag)&PATH_RAY_SHADOW_CATCHER_PASS) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : \
+ (visibility))
+
+#define SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility) \
+ (((is_shadow_catcher) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : 0) | (visibility))
+
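A quick worked example, assuming the intersection test accepts a hit when (ray_visibility & object_visibility) != 0: with visibility = PATH_RAY_CAMERA = (1 << 0), a shadow catcher object gets SHADOW_CATCHER_OBJECT_VISIBILITY(true, 1) = 0x00010001 while a regular object keeps 0x00000001; a ray on the catcher pass (PATH_RAY_SHADOW_CATCHER_PASS set) gets SHADOW_CATCHER_PATH_VISIBILITY(...) = 0x00010000 and therefore overlaps only the catcher object, whereas a regular ray keeps 0x00000001 and overlaps both.
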
/* Closure Label */
typedef enum ClosureLabel {
@@ -332,6 +318,7 @@ typedef enum ClosureLabel {
LABEL_TRANSPARENT = 32,
LABEL_VOLUME_SCATTER = 64,
LABEL_TRANSMIT_TRANSPARENT = 128,
+ LABEL_SUBSURFACE_SCATTER = 256,
} ClosureLabel;
/* Render Passes */
@@ -339,17 +326,35 @@ typedef enum ClosureLabel {
#define PASS_NAME_JOIN(a, b) a##_##b
#define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32))
-#define PASSMASK_COMPONENT(comp) \
- (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) | PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \
- PASSMASK(PASS_NAME_JOIN(comp, COLOR)))
-
+// NOTE: Keep in sync with `Pass::get_type_enum()`.
typedef enum PassType {
PASS_NONE = 0,
- /* Main passes */
+ /* Light Passes */
PASS_COMBINED = 1,
- PASS_DEPTH,
+ PASS_EMISSION,
+ PASS_BACKGROUND,
+ PASS_AO,
+ PASS_SHADOW,
+ PASS_DIFFUSE,
+ PASS_DIFFUSE_DIRECT,
+ PASS_DIFFUSE_INDIRECT,
+ PASS_GLOSSY,
+ PASS_GLOSSY_DIRECT,
+ PASS_GLOSSY_INDIRECT,
+ PASS_TRANSMISSION,
+ PASS_TRANSMISSION_DIRECT,
+ PASS_TRANSMISSION_INDIRECT,
+ PASS_VOLUME,
+ PASS_VOLUME_DIRECT,
+ PASS_VOLUME_INDIRECT,
+ PASS_CATEGORY_LIGHT_END = 31,
+
+ /* Data passes */
+ PASS_DEPTH = 32,
+ PASS_POSITION,
PASS_NORMAL,
+ PASS_ROUGHNESS,
PASS_UV,
PASS_OBJECT_ID,
PASS_MATERIAL_ID,
@@ -361,31 +366,35 @@ typedef enum PassType {
PASS_AOV_VALUE,
PASS_ADAPTIVE_AUX_BUFFER,
PASS_SAMPLE_COUNT,
- PASS_CATEGORY_MAIN_END = 31,
-
- PASS_MIST = 32,
- PASS_EMISSION,
- PASS_BACKGROUND,
- PASS_AO,
- PASS_SHADOW,
- PASS_LIGHT, /* no real pass, used to force use_light_pass */
- PASS_DIFFUSE_DIRECT,
- PASS_DIFFUSE_INDIRECT,
PASS_DIFFUSE_COLOR,
- PASS_GLOSSY_DIRECT,
- PASS_GLOSSY_INDIRECT,
PASS_GLOSSY_COLOR,
- PASS_TRANSMISSION_DIRECT,
- PASS_TRANSMISSION_INDIRECT,
PASS_TRANSMISSION_COLOR,
- PASS_VOLUME_DIRECT = 50,
- PASS_VOLUME_INDIRECT,
/* No Scatter color since it's tricky to define what it would even mean. */
- PASS_CATEGORY_LIGHT_END = 63,
+ PASS_MIST,
+ PASS_DENOISING_NORMAL,
+ PASS_DENOISING_ALBEDO,
+
+  /* PASS_SHADOW_CATCHER accumulates the contribution of the shadow catcher object as it would be
+   * without any other object affecting it. The pass accessor will divide the combined pass by the
+   * shadow catcher. The result of this division is then to be multiplied with the backdrop. The
+   * alpha channel of this pass contains the number of samples which contributed to the color
+   * components of the pass.
+   *
+   * PASS_SHADOW_CATCHER_SAMPLE_COUNT contains the number of samples for which the path split
+   * happened.
+   *
+   * PASS_SHADOW_CATCHER_MATTE contains the non-catcher objects. This pass is to be alpha-overed
+   * onto the backdrop after the multiplication above (see the compositing sketch after this
+   * enum). */
+ PASS_SHADOW_CATCHER,
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ PASS_CATEGORY_DATA_END = 63,
PASS_BAKE_PRIMITIVE,
PASS_BAKE_DIFFERENTIAL,
- PASS_CATEGORY_BAKE_END = 95
+ PASS_CATEGORY_BAKE_END = 95,
+
+ PASS_NUM,
} PassType;
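A minimal sketch of the compositing described in the PASS_SHADOW_CATCHER comment above; the variable names and the safe_divide helper are hypothetical placeholders rather than the actual Cycles pass accessor API:

/* Approximate shadow catcher compositing (illustrative only). */
float3 shadow   = safe_divide(combined_rgb, shadow_catcher_rgb); /* light kept vs. blocked  */
float3 backdrop = backdrop_rgb * shadow;                         /* darken the real footage */
float3 result   = matte_rgb + (1.0f - matte_alpha) * backdrop;   /* alpha-over the CG matte */
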
#define PASS_ANY (~0)
@@ -398,158 +407,9 @@ typedef enum CryptomatteType {
CRYPT_ACCURATE = (1 << 3),
} CryptomatteType;
-typedef enum DenoisingPassOffsets {
- DENOISING_PASS_NORMAL = 0,
- DENOISING_PASS_NORMAL_VAR = 3,
- DENOISING_PASS_ALBEDO = 6,
- DENOISING_PASS_ALBEDO_VAR = 9,
- DENOISING_PASS_DEPTH = 12,
- DENOISING_PASS_DEPTH_VAR = 13,
- DENOISING_PASS_SHADOW_A = 14,
- DENOISING_PASS_SHADOW_B = 17,
- DENOISING_PASS_COLOR = 20,
- DENOISING_PASS_COLOR_VAR = 23,
- DENOISING_PASS_CLEAN = 26,
-
- DENOISING_PASS_PREFILTERED_DEPTH = 0,
- DENOISING_PASS_PREFILTERED_NORMAL = 1,
- DENOISING_PASS_PREFILTERED_SHADOWING = 4,
- DENOISING_PASS_PREFILTERED_ALBEDO = 5,
- DENOISING_PASS_PREFILTERED_COLOR = 8,
- DENOISING_PASS_PREFILTERED_VARIANCE = 11,
- DENOISING_PASS_PREFILTERED_INTENSITY = 14,
-
- DENOISING_PASS_SIZE_BASE = 26,
- DENOISING_PASS_SIZE_CLEAN = 3,
- DENOISING_PASS_SIZE_PREFILTERED = 15,
-} DenoisingPassOffsets;
-
-typedef enum eBakePassFilter {
- BAKE_FILTER_NONE = 0,
- BAKE_FILTER_DIRECT = (1 << 0),
- BAKE_FILTER_INDIRECT = (1 << 1),
- BAKE_FILTER_COLOR = (1 << 2),
- BAKE_FILTER_DIFFUSE = (1 << 3),
- BAKE_FILTER_GLOSSY = (1 << 4),
- BAKE_FILTER_TRANSMISSION = (1 << 5),
- BAKE_FILTER_EMISSION = (1 << 6),
- BAKE_FILTER_AO = (1 << 7),
-} eBakePassFilter;
-
-typedef enum BakePassFilterCombos {
- BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE |
- BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION |
- BAKE_FILTER_AO),
- BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION),
- BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION),
-} BakePassFilterCombos;
-
-typedef enum DenoiseFlag {
- DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
- DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
- DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
- DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
- DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
- DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
- DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1,
-} DenoiseFlag;
-
-typedef ccl_addr_space struct PathRadianceState {
-#ifdef __PASSES__
- float3 diffuse;
- float3 glossy;
- float3 transmission;
- float3 volume;
-
- float3 direct;
-#endif
-} PathRadianceState;
-
-typedef ccl_addr_space struct PathRadiance {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
- float transparent;
- float3 emission;
-#ifdef __PASSES__
- float3 background;
- float3 ao;
-
- float3 indirect;
- float3 direct_emission;
-
- float3 color_diffuse;
- float3 color_glossy;
- float3 color_transmission;
-
- float3 direct_diffuse;
- float3 direct_glossy;
- float3 direct_transmission;
- float3 direct_volume;
-
- float3 indirect_diffuse;
- float3 indirect_glossy;
- float3 indirect_transmission;
- float3 indirect_volume;
-
- float3 shadow;
- float mist;
-#endif
-
- struct PathRadianceState state;
-
-#ifdef __SHADOW_TRICKS__
- /* Total light reachable across the path, ignoring shadow blocked queries. */
- float3 path_total;
- /* Total light reachable across the path with shadow blocked queries
- * applied here.
- *
- * Dividing this figure by path_total will give estimate of shadow pass.
- */
- float3 path_total_shaded;
-
- /* Color of the background on which shadow is alpha-overed. */
- float3 shadow_background_color;
-
- /* Path radiance sum and throughput at the moment when ray hits shadow
- * catcher object.
- */
- float shadow_throughput;
-
- /* Accumulated transparency along the path after shadow catcher bounce. */
- float shadow_transparency;
-
- /* Indicate if any shadow catcher data is set. */
- int has_shadow_catcher;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- float3 denoising_normal;
- float3 denoising_albedo;
- float denoising_depth;
-#endif /* __DENOISING_FEATURES__ */
-} PathRadiance;
-
typedef struct BsdfEval {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
float3 diffuse;
-#ifdef __PASSES__
float3 glossy;
- float3 transmission;
- float3 transparent;
- float3 volume;
-#endif
-#ifdef __SHADOW_TRICKS__
- float3 sum_no_mis;
-#endif
} BsdfEval;
/* Shader Flag */
@@ -564,8 +424,10 @@ typedef enum ShaderFlag {
SHADER_EXCLUDE_TRANSMIT = (1 << 25),
SHADER_EXCLUDE_CAMERA = (1 << 24),
SHADER_EXCLUDE_SCATTER = (1 << 23),
+ SHADER_EXCLUDE_SHADOW_CATCHER = (1 << 22),
SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE | SHADER_EXCLUDE_GLOSSY | SHADER_EXCLUDE_TRANSMIT |
- SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER),
+ SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER |
+ SHADER_EXCLUDE_SHADOW_CATCHER),
SHADER_MASK = ~(SHADER_SMOOTH_NORMAL | SHADER_CAST_SHADOW | SHADER_AREA_LIGHT | SHADER_USE_MIS |
SHADER_EXCLUDE_ANY)
@@ -612,29 +474,14 @@ typedef struct differential {
/* Ray */
typedef struct Ray {
-/* TODO(sergey): This is only needed because current AMD
- * compiler has hard time building the kernel with this
- * reshuffle. And at the same time reshuffle will cause
- * less optimal CPU code in certain places.
- *
- * We'll get rid of this nasty exception once AMD compiler
- * is fixed.
- */
-#ifndef __KERNEL_OPENCL_AMD__
float3 P; /* origin */
float3 D; /* direction */
float t; /* length of the ray */
float time; /* time (for motion blur) */
-#else
- float t; /* length of the ray */
- float time; /* time (for motion blur) */
- float3 P; /* origin */
- float3 D; /* direction */
-#endif
#ifdef __RAY_DIFFERENTIALS__
- differential3 dP;
- differential3 dD;
+ float dP;
+ float dD;
#endif
} Ray;
@@ -661,9 +508,6 @@ typedef enum PrimitiveType {
PRIMITIVE_CURVE_RIBBON = (1 << 4),
PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5),
PRIMITIVE_VOLUME = (1 << 6),
- /* Lamp primitive is not included below on purpose,
- * since it is no real traceable primitive.
- */
PRIMITIVE_LAMP = (1 << 7),
PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE),
@@ -672,16 +516,14 @@ typedef enum PrimitiveType {
PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME),
PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK |
PRIMITIVE_MOTION_CURVE_RIBBON),
- PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME),
+ PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME |
+ PRIMITIVE_LAMP),
- /* Total number of different traceable primitives.
- * NOTE: This is an actual value, not a bitflag.
- */
- PRIMITIVE_NUM_TOTAL = 7,
+ PRIMITIVE_NUM = 8,
} PrimitiveType;
-#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type))
-#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL)
+#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM) | (type))
+#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM)
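For example, with PRIMITIVE_NUM = 8 and PRIMITIVE_CURVE_RIBBON = (1 << 4), PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE_RIBBON, 3) = (3 << 8) | (1 << 4) = 0x310 and PRIMITIVE_UNPACK_SEGMENT(0x310) = 3; segment indices now start above bit 7 instead of bit 6, since PRIMITIVE_LAMP became a traceable primitive.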
typedef enum CurveShapeType {
CURVE_RIBBON = 0,
@@ -760,20 +602,14 @@ typedef struct AttributeDescriptor {
/* Closure data */
-#ifdef __MULTI_CLOSURE__
-# ifdef __SPLIT_KERNEL__
-# define MAX_CLOSURE 1
-# else
-# ifndef __MAX_CLOSURE__
-# define MAX_CLOSURE 64
-# else
-# define MAX_CLOSURE __MAX_CLOSURE__
-# endif
-# endif
+#ifndef __MAX_CLOSURE__
+# define MAX_CLOSURE 64
#else
-# define MAX_CLOSURE 1
+# define MAX_CLOSURE __MAX_CLOSURE__
#endif
+#define MAX_VOLUME_CLOSURE 8
+
/* This struct is the base class for all closures. The common members are
* duplicated in all derived classes since we don't have C++ in the kernel
* yet, and because it lets us lay out the members to minimize padding. The
@@ -866,11 +702,14 @@ enum ShaderDataFlag {
SD_NEED_VOLUME_ATTRIBUTES = (1 << 28),
/* Shader has emission */
SD_HAS_EMISSION = (1 << 29),
+ /* Shader has raytracing */
+ SD_HAS_RAYTRACE = (1 << 30),
SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME |
SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR |
SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT |
- SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES)
+ SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES | SD_HAS_EMISSION |
+ SD_HAS_RAYTRACE)
};
/* Object flags. */
@@ -955,19 +794,19 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
#endif
#ifdef __OBJECT_MOTION__
- /* object <-> world space transformations, cached to avoid
- * re-interpolating them constantly for shading */
- Transform ob_tfm;
- Transform ob_itfm;
+ /* Object <-> world space transformations for motion blur, cached to avoid
+ * re-interpolating them constantly for shading. */
+ Transform ob_tfm_motion;
+ Transform ob_itfm_motion;
#endif
/* ray start position, only set for backgrounds */
float3 ray_P;
- differential3 ray_dP;
+ float ray_dP;
#ifdef __OSL__
- struct KernelGlobals *osl_globals;
- struct PathState *osl_path_state;
+ const struct KernelGlobals *osl_globals;
+ const struct IntegratorStateCPU *osl_path_state;
#endif
/* LCG state for closures that require additional random numbers. */
@@ -976,7 +815,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
/* Closure data, we store a fixed array of closures */
int num_closure;
int num_closure_left;
- float randb_closure;
float3 svm_closure_weight;
/* Closure weights summed directly, so we can evaluate
@@ -998,7 +836,22 @@ typedef ccl_addr_space struct ccl_align(16) ShaderDataTinyStorage
ShaderDataTinyStorage;
#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData *)shader_data_tiny_storage)
-/* Path State */
+/* Compact volume closure storage.
+ *
+ * Used for decoupled direct/indirect light closure storage. */
+
+ccl_addr_space struct ShaderVolumeClosure {
+ float3 weight;
+ float sample_weight;
+ float g;
+};
+
+ccl_addr_space struct ShaderVolumePhases {
+ ShaderVolumeClosure closure[MAX_VOLUME_CLOSURE];
+ int num_closure;
+};
+
+/* Volume Stack */
#ifdef __VOLUME__
typedef struct VolumeStack {
@@ -1007,53 +860,6 @@ typedef struct VolumeStack {
} VolumeStack;
#endif
-typedef struct PathState {
- /* see enum PathRayFlag */
- int flag;
-
- /* random number generator state */
- uint rng_hash; /* per pixel hash */
- int rng_offset; /* dimension offset */
- int sample; /* path sample number */
- int num_samples; /* total number of times this path will be sampled */
- float branch_factor; /* number of branches in indirect paths */
-
- /* bounce counting */
- int bounce;
- int diffuse_bounce;
- int glossy_bounce;
- int transmission_bounce;
- int transparent_bounce;
-
-#ifdef __DENOISING_FEATURES__
- float denoising_feature_weight;
- float3 denoising_feature_throughput;
-#endif /* __DENOISING_FEATURES__ */
-
- /* multiple importance sampling */
- float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
- float ray_pdf; /* last bounce pdf */
-#ifdef __LAMP_MIS__
- float ray_t; /* accumulated distance through transparent surfaces */
-#endif
-
- /* volume rendering */
-#ifdef __VOLUME__
- int volume_bounce;
- int volume_bounds_bounce;
- VolumeStack volume_stack[VOLUME_STACK_SIZE];
-#endif
-} PathState;
-
-#ifdef __VOLUME__
-typedef struct VolumeState {
-# ifdef __SPLIT_KERNEL__
-# else
- PathState ps;
-# endif
-} VolumeState;
-#endif
-
/* Struct to gather multiple nearby intersections. */
typedef struct LocalIntersection {
Ray ray;
@@ -1064,20 +870,6 @@ typedef struct LocalIntersection {
float3 Ng[LOCAL_MAX_HITS];
} LocalIntersection;
-/* Subsurface */
-
-/* Struct to gather SSS indirect rays and delay tracing them. */
-typedef struct SubsurfaceIndirectRays {
- PathState state[BSSRDF_MAX_HITS];
-
- int num_rays;
-
- struct Ray rays[BSSRDF_MAX_HITS];
- float3 throughputs[BSSRDF_MAX_HITS];
- struct PathRadianceState L_state[BSSRDF_MAX_HITS];
-} SubsurfaceIndirectRays;
-static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high.");
-
/* Constant Kernel Data
*
* These structs are passed from CPU to various devices, and the struct layout
@@ -1128,7 +920,7 @@ typedef struct KernelCamera {
/* render size */
float width, height;
- int resolution;
+ int pad1;
/* anamorphic lens bokeh */
float inv_aperture_ratio;
@@ -1169,11 +961,12 @@ typedef struct KernelFilm {
int light_pass_flag;
int pass_stride;
- int use_light_pass;
int pass_combined;
int pass_depth;
+ int pass_position;
int pass_normal;
+ int pass_roughness;
int pass_motion;
int pass_motion_weight;
@@ -1202,7 +995,13 @@ typedef struct KernelFilm {
int pass_shadow;
float pass_shadow_scale;
+
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+
int filter_table_offset;
+
int cryptomatte_passes;
int cryptomatte_depth;
int pass_cryptomatte;
@@ -1215,15 +1014,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
- int pass_denoising_data;
- int pass_denoising_clean;
- int denoising_flags;
+ int pass_denoising_normal;
+ int pass_denoising_albedo;
int pass_aov_color;
int pass_aov_value;
- int pass_aov_color_num;
- int pass_aov_value_num;
- int pad1, pad2, pad3;
/* XYZ to rendering color space transform. float4 instead of float3 to
* ensure consistent padding/alignment across devices. */
@@ -1234,19 +1029,54 @@ typedef struct KernelFilm {
int pass_bake_primitive;
int pass_bake_differential;
- int pad;
- /* viewport rendering options */
- int display_pass_stride;
- int display_pass_components;
- int display_divide_pass_stride;
- int use_display_exposure;
- int use_display_pass_alpha;
+ int use_approximate_shadow_catcher;
- int pad4, pad5, pad6;
+ int pad1, pad2, pad3;
} KernelFilm;
static_assert_align(KernelFilm, 16);
+typedef struct KernelFilmConvert {
+ int pass_offset;
+ int pass_stride;
+
+ int pass_use_exposure;
+ int pass_use_filter;
+
+ int pass_divide;
+ int pass_indirect;
+
+ int pass_combined;
+ int pass_sample_count;
+ int pass_adaptive_aux_buffer;
+ int pass_motion_weight;
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+ int pass_background;
+
+ float scale;
+ float exposure;
+ float scale_exposure;
+
+ int use_approximate_shadow_catcher;
+ int use_approximate_shadow_catcher_background;
+ int show_active_pixels;
+
+ /* Number of components to write to. */
+ int num_components;
+
+  /* Number of floats per pixel. When zero, it is the same as `num_components`.
+   * NOTE: Ignored for a half4 destination. */
+ int pixel_stride;
+
+ int is_denoised;
+
+ /* Padding. */
+ int pad1;
+} KernelFilmConvert;
+static_assert_align(KernelFilmConvert, 16);
+
typedef struct KernelBackground {
/* only shader index */
int surface_shader;
@@ -1255,11 +1085,6 @@ typedef struct KernelBackground {
int transparent;
float transparent_roughness_squared_threshold;
- /* ambient occlusion */
- float ao_factor;
- float ao_distance;
- float ao_bounces_factor;
-
/* portal sampling */
float portal_weight;
int num_portals;
@@ -1277,13 +1102,15 @@ typedef struct KernelBackground {
int map_res_y;
int use_mis;
+
+ /* Padding */
+ int pad1, pad2, pad3;
} KernelBackground;
static_assert_align(KernelBackground, 16);
typedef struct KernelIntegrator {
/* emission */
int use_direct_light;
- int use_ambient_occlusion;
int num_distribution;
int num_all_lights;
float pdf_triangles;
@@ -1299,7 +1126,10 @@ typedef struct KernelIntegrator {
int max_transmission_bounce;
int max_volume_bounce;
+ /* AO bounces */
int ao_bounces;
+ float ao_bounces_distance;
+ float ao_bounces_factor;
/* transparent */
int transparent_min_bounce;
@@ -1318,39 +1148,20 @@ typedef struct KernelIntegrator {
float sample_clamp_direct;
float sample_clamp_indirect;
- /* branched path */
- int branched;
- int volume_decoupled;
- int diffuse_samples;
- int glossy_samples;
- int transmission_samples;
- int ao_samples;
- int mesh_light_samples;
- int subsurface_samples;
- int sample_all_lights_direct;
- int sample_all_lights_indirect;
-
/* mis */
int use_lamp_mis;
/* sampler */
int sampling_pattern;
- int aa_samples;
- int adaptive_min_samples;
- int adaptive_step;
- int adaptive_stop_per_sample;
- float adaptive_threshold;
/* volume render */
int use_volumes;
int volume_max_steps;
float volume_step_rate;
- int volume_samples;
-
- int start_sample;
- int max_closures;
+ int has_shadow_catcher;
+ /* padding */
int pad1, pad2;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1401,14 +1212,19 @@ typedef struct KernelTables {
static_assert_align(KernelTables, 16);
typedef struct KernelBake {
+ int use;
int object_index;
int tri_offset;
- int type;
- int pass_filter;
+ int pad1;
} KernelBake;
static_assert_align(KernelBake, 16);
typedef struct KernelData {
+ uint kernel_features;
+ uint max_closures;
+ uint max_shaders;
+ uint pad;
+
KernelCamera cam;
KernelFilm film;
KernelBackground background;
@@ -1485,11 +1301,10 @@ typedef struct KernelLight {
int type;
float co[3];
int shader_id;
- int samples;
float max_bounces;
float random;
float strength[3];
- float pad1;
+ float pad1, pad2;
Transform tfm;
Transform itfm;
union {
@@ -1539,110 +1354,6 @@ typedef struct KernelShader {
} KernelShader;
static_assert_align(KernelShader, 16);
-/* Declarations required for split kernel */
-
-/* Macro for queues */
-/* Value marking queue's empty slot */
-#define QUEUE_EMPTY_SLOT -1
-
-/*
- * Queue 1 - Active rays
- * Queue 2 - Background queue
- * Queue 3 - Shadow ray cast kernel - AO
- * Queue 4 - Shadow ray cast kernel - direct lighting
- */
-
-/* Queue names */
-enum QueueNumber {
- /* All active rays and regenerated rays are enqueued here. */
- QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0,
-
- /* All
- * 1. Background-hit rays,
- * 2. Rays that has exited path-iteration but needs to update output buffer
- * 3. Rays to be regenerated
- * are enqueued here.
- */
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contribution for AO are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contributing for direct lighting are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-
- /* Rays sorted according to shader->id */
- QUEUE_SHADER_SORTED_RAYS,
-
-#ifdef __BRANCHED_PATH__
- /* All rays moving to next iteration of the indirect loop for light */
- QUEUE_LIGHT_INDIRECT_ITER,
- /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
- QUEUE_INACTIVE_RAYS,
-# ifdef __VOLUME__
- /* All rays moving to next iteration of the indirect loop for volumes */
- QUEUE_VOLUME_INDIRECT_ITER,
-# endif
-# ifdef __SUBSURFACE__
- /* All rays moving to next iteration of the indirect loop for subsurface */
- QUEUE_SUBSURFACE_INDIRECT_ITER,
-# endif
-#endif /* __BRANCHED_PATH__ */
-
- NUM_QUEUES
-};
-
-/* We use RAY_STATE_MASK to get ray_state */
-#define RAY_STATE_MASK 0x0F
-#define RAY_FLAG_MASK 0xF0
-enum RayState {
- RAY_INVALID = 0,
- /* Denotes ray is actively involved in path-iteration. */
- RAY_ACTIVE,
- /* Denotes ray has completed processing all samples and is inactive. */
- RAY_INACTIVE,
- /* Denotes ray has exited path-iteration and needs to update output buffer. */
- RAY_UPDATE_BUFFER,
- /* Denotes ray needs to skip most surface shader work. */
- RAY_HAS_ONLY_VOLUME,
- /* Denotes ray has hit background */
- RAY_HIT_BACKGROUND,
- /* Denotes ray has to be regenerated */
- RAY_TO_REGENERATE,
- /* Denotes ray has been regenerated */
- RAY_REGENERATED,
- /* Denotes ray is moving to next iteration of the branched indirect loop */
- RAY_LIGHT_INDIRECT_NEXT_ITER,
- RAY_VOLUME_INDIRECT_NEXT_ITER,
- RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
-
- /* Ray flags */
-
- /* Flags to denote that the ray is currently evaluating the branched indirect loop */
- RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
- RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
- RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
- RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT |
- RAY_BRANCHED_SUBSURFACE_INDIRECT),
-
- /* Ray is evaluating an iteration of an indirect loop for another thread */
- RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
-};
-
-#define ASSIGN_RAY_STATE(ray_state, ray_index, state) \
- (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) \
- ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
-#define ADD_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] | flag))
-#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
-#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
-
/* Patches */
#define PATCH_MAX_CONTROL_VERTS 16
@@ -1655,7 +1366,7 @@ enum RayState {
/* Work Tiles */
-typedef struct WorkTile {
+typedef struct KernelWorkTile {
uint x, y, w, h;
uint start_sample;
@@ -1664,13 +1375,172 @@ typedef struct WorkTile {
int offset;
uint stride;
- ccl_global float *buffer;
-} WorkTile;
+ /* Precalculated parameters used by init_from_camera kernel on GPU. */
+ int path_index_offset;
+ int work_size;
+} KernelWorkTile;
+
+/* Shader Evaluation.
+ *
+ * Position on a primitive on an object at which we want to evaluate the
+ * shader for e.g. mesh displacement or light importance map. */
+
+typedef struct KernelShaderEvalInput {
+ int object;
+ int prim;
+ float u, v;
+} KernelShaderEvalInput;
+static_assert_align(KernelShaderEvalInput, 16);
/* Pre-computed sample table sizes for PMJ02 sampler. */
-#define NUM_PMJ_SAMPLES (64 * 64)
-#define NUM_PMJ_PATTERNS 48
+#define NUM_PMJ_DIVISIONS 32
+#define NUM_PMJ_SAMPLES ((NUM_PMJ_DIVISIONS) * (NUM_PMJ_DIVISIONS))
+#define NUM_PMJ_PATTERNS 1
-CCL_NAMESPACE_END
+/* Device kernels.
+ *
+ * Identifier for kernels that can be executed in device queues.
+ *
+ * Some implementation details.
+ *
+ * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified.
+ * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. */
+
+typedef enum DeviceKernel {
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0,
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL,
+
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+ DEVICE_KERNEL_INTEGRATOR_RESET,
+ DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,
+
+ DEVICE_KERNEL_SHADER_EVAL_DISPLACE,
+ DEVICE_KERNEL_SHADER_EVAL_BACKGROUND,
+
+#define DECLARE_FILM_CONVERT_KERNEL(variant) \
+ DEVICE_KERNEL_FILM_CONVERT_##variant, DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA
+
+ DECLARE_FILM_CONVERT_KERNEL(DEPTH),
+ DECLARE_FILM_CONVERT_KERNEL(MIST),
+ DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT),
+ DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT3),
+ DECLARE_FILM_CONVERT_KERNEL(MOTION),
+ DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER_MATTE_WITH_SHADOW),
+ DECLARE_FILM_CONVERT_KERNEL(COMBINED),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT4),
+
+#undef DECLARE_FILM_CONVERT_KERNEL
+
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y,
+
+ DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS,
+ DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO,
+ DEVICE_KERNEL_FILTER_COLOR_PREPROCESS,
+ DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS,
+
+ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS,
+
+ DEVICE_KERNEL_PREFIX_SUM,
+
+ DEVICE_KERNEL_NUM,
+} DeviceKernel;
+
+enum {
+ DEVICE_KERNEL_INTEGRATOR_NUM = DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL + 1,
+};
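For orientation (editor's note, not part of the patch): host code dispatches on this enum when enqueuing work on a device queue. The helper below is a hypothetical sketch of such a mapping, not the actual Cycles API.

/* Hypothetical sketch: map a DeviceKernel to a printable name for logging. */
static const char *device_kernel_name_example(DeviceKernel kernel)
{
  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
      return "integrator_init_from_camera";
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
      return "integrator_intersect_closest";
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
      return "integrator_shade_surface";
    default:
      return "unknown";
  }
}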
+
+/* Kernel Features */
+
+enum KernelFeatureFlag : unsigned int {
+ /* Shader nodes. */
+ KERNEL_FEATURE_NODE_BSDF = (1U << 0U),
+ KERNEL_FEATURE_NODE_EMISSION = (1U << 1U),
+ KERNEL_FEATURE_NODE_VOLUME = (1U << 2U),
+ KERNEL_FEATURE_NODE_HAIR = (1U << 3U),
+ KERNEL_FEATURE_NODE_BUMP = (1U << 4U),
+ KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 5U),
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 6U),
+ KERNEL_FEATURE_NODE_RAYTRACE = (1U << 7U),
+
+ /* Use denoising kernels and output denoising passes. */
+ KERNEL_FEATURE_DENOISING = (1U << 8U),
+
+ /* Use path tracing kernels. */
+ KERNEL_FEATURE_PATH_TRACING = (1U << 9U),
-#endif /* __KERNEL_TYPES_H__ */
+ /* BVH/sampling kernel features. */
+ KERNEL_FEATURE_HAIR = (1U << 10U),
+ KERNEL_FEATURE_HAIR_THICK = (1U << 11U),
+ KERNEL_FEATURE_OBJECT_MOTION = (1U << 12U),
+ KERNEL_FEATURE_CAMERA_MOTION = (1U << 13U),
+
+ /* Denotes whether baking functionality is needed. */
+ KERNEL_FEATURE_BAKING = (1U << 14U),
+
+ /* Use subsurface scattering materials. */
+ KERNEL_FEATURE_SUBSURFACE = (1U << 15U),
+
+ /* Use volume materials. */
+ KERNEL_FEATURE_VOLUME = (1U << 16U),
+
+ /* Use OpenSubdiv patch evaluation */
+ KERNEL_FEATURE_PATCH_EVALUATION = (1U << 17U),
+
+ /* Use Transparent shadows */
+ KERNEL_FEATURE_TRANSPARENT = (1U << 18U),
+
+ /* Use shadow catcher. */
+ KERNEL_FEATURE_SHADOW_CATCHER = (1U << 19U),
+
+ /* Per-uber shader usage flags. */
+ KERNEL_FEATURE_PRINCIPLED = (1U << 20U),
+
+ /* Light render passes. */
+ KERNEL_FEATURE_LIGHT_PASSES = (1U << 21U),
+
+ /* Shadow render pass. */
+ KERNEL_FEATURE_SHADOW_PASS = (1U << 22U),
+};
+
+/* Shader node feature mask, to specialize shader evaluation for kernels. */
+
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \
+ (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \
+ KERNEL_FEATURE_NODE_HAIR | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE \
+ (KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW | KERNEL_FEATURE_NODE_RAYTRACE)
+#define KERNEL_FEATURE_NODE_MASK_VOLUME \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_DISPLACEMENT \
+ (KERNEL_FEATURE_NODE_VORONOI_EXTRA | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE)
+#define KERNEL_FEATURE_NODE_MASK_BUMP KERNEL_FEATURE_NODE_MASK_DISPLACEMENT
+
+#define KERNEL_NODES_FEATURE(feature) ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
+
+CCL_NAMESPACE_END
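As a reading aid (editor's note, not part of the patch): the node feature masks above let each kernel variant compile out node groups it does not need. A minimal hypothetical sketch of how such a mask could be consumed:

/* Hypothetical sketch: shader evaluation specialized by a compile-time feature mask. */
template<unsigned int node_feature_mask> void eval_nodes_example()
{
  if (KERNEL_NODES_FEATURE(VOLUME)) {
    /* Volume nodes are only compiled in when the mask requests them. */
  }
  if (KERNEL_NODES_FEATURE(RAYTRACE)) {
    /* Raytrace-dependent nodes (AO, bevel) only in the dedicated raytrace kernel. */
  }
}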
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
deleted file mode 100644
index f6b34be040e..00000000000
--- a/intern/cycles/kernel/kernel_volume.h
+++ /dev/null
@@ -1,1440 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
- * and precision issues.
- * todo: this value could be tweaked or turned into a probability to avoid unnecessary
- * work in volumes and subsurface scattering. */
-#define VOLUME_THROUGHPUT_EPSILON 1e-6f
-
-/* Events for probabilistic scattering */
-
-typedef enum VolumeIntegrateResult {
- VOLUME_PATH_SCATTERED = 0,
- VOLUME_PATH_ATTENUATED = 1,
- VOLUME_PATH_MISSED = 2
-} VolumeIntegrateResult;
-
-/* Volume shader properties
- *
- * extinction coefficient = absorption coefficient + scattering coefficient
- * sigma_t = sigma_a + sigma_s */
-
-typedef struct VolumeShaderCoefficients {
- float3 sigma_t;
- float3 sigma_s;
- float3 emission;
-} VolumeShaderCoefficients;
-
-#ifdef __VOLUME__
-
-/* evaluate shader to get extinction coefficient at P */
-ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- float3 *extinction)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
-
- if (sd->flag & SD_EXTINCTION) {
- const float density = object_volume_density(kg, sd->object);
- *extinction = sd->closure_transparent_extinction * density;
- return true;
- }
- else {
- return false;
- }
-}
-
-/* evaluate shader to get absorption, scattering and emission at P */
-ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- VolumeShaderCoefficients *coeff)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
-
- if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION)))
- return false;
-
- coeff->sigma_s = zero_float3();
- coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
- coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
-
- if (sd->flag & SD_SCATTER) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_VOLUME(sc->type))
- coeff->sigma_s += sc->weight;
- }
- }
-
- const float density = object_volume_density(kg, sd->object);
- coeff->sigma_s *= density;
- coeff->sigma_t *= density;
- coeff->emission *= density;
-
- return true;
-}
-
-#endif /* __VOLUME__ */
-
-ccl_device float3 volume_color_transmittance(float3 sigma, float t)
-{
- return exp3(-sigma * t);
-}
-
-ccl_device float kernel_volume_channel_get(float3 value, int channel)
-{
- return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
-}
-
-#ifdef __VOLUME__
-
-ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
-{
- float step_size = FLT_MAX;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- bool heterogeneous = false;
-
- if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
- heterogeneous = true;
- }
- else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
- /* We want to render world or objects without any volume grids
- * as homogeneous, but can only verify this at run-time since other
- * heterogeneous volume objects may be using the same shader. */
- int object = stack[i].object;
- if (object != OBJECT_NONE) {
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
- heterogeneous = true;
- }
- }
- }
-
- if (heterogeneous) {
- float object_step_size = object_volume_step_size(kg, stack[i].object);
- object_step_size *= kernel_data.integrator.volume_step_rate;
- step_size = fminf(object_step_size, step_size);
- }
- }
-
- return step_size;
-}
-
-ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
-{
- if (kernel_data.integrator.num_all_lights == 0)
- return 0;
-
- int method = -1;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- if (shader_flag & SD_VOLUME_MIS) {
- return SD_VOLUME_MIS;
- }
- else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
- if (method == 0)
- return SD_VOLUME_MIS;
-
- method = SD_VOLUME_EQUIANGULAR;
- }
- else {
- if (method == SD_VOLUME_EQUIANGULAR)
- return SD_VOLUME_MIS;
-
- method = 0;
- }
- }
-
- return method;
-}
-
-ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float object_step_size,
- float t,
- float *step_size,
- float *step_shade_offset,
- float *steps_offset)
-{
- const int max_steps = kernel_data.integrator.volume_max_steps;
- float step = min(object_step_size, t);
-
- /* compute exact steps in advance for malloc */
- if (t > max_steps * step) {
- step = t / (float)max_steps;
- }
-
- *step_size = step;
-
- /* Perform shading at this offset within a step, to integrate over
- * the entire step segment. */
- *step_shade_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4);
-
- /* Shift starting point of all segments by this random amount to avoid
- * banding artifacts from the volume bounding shape. */
- *steps_offset = path_state_rng_1D_hash(kg, state, 0x3d22c7b3);
-}
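A worked example of the step stretching above (editor's illustration, not from the patch):

/* With object_step_size = 0.1, t = 100 and volume_max_steps = 512, a 0.1 step
 * would need 1000 iterations; it is therefore stretched to t / max_steps =
 * 100 / 512 ~= 0.195 so the loop covers the whole segment within the budget. */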
-
-/* Volume Shadows
- *
- * These functions are used to attenuate shadow rays to lights. Both absorption
- * and scattering will block light, represented by the extinction coefficient. */
-
-/* homogeneous volume: assume shader evaluation at the start gives
- * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput)
-{
- float3 sigma_t = zero_float3();
-
- if (volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t))
- *throughput *= volume_color_transmittance(sigma_t, ray->t);
-}
-
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * For shadows we do not offset all segments, since the starting point is
- * already a random distance inside the volume. It also appears to create
- * banding artifacts for unknown reasons. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, unused;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &unused);
- const float steps_offset = 1.0f;
-
- /* compute extinction at the start */
- float t = 0.0f;
-
- float3 sum = zero_float3();
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- float3 sigma_t = zero_float3();
-
- /* compute attenuation over segment */
- if (volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
- /* Compute expf() only for every Nth step, to save some calculations
- * because exp(a)*exp(b) = exp(a+b), also do a quick VOLUME_THROUGHPUT_EPSILON
- * check then. */
- sum += (-sigma_t * dt);
- if ((i & 0x07) == 0) { /* ToDo: Other interval? */
- tp = *throughput * exp3(sum);
-
- /* stop if nearly all light is blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t) {
- /* Update throughput in case we haven't done it above */
- tp = *throughput * exp3(sum);
- break;
- }
- }
-
- *throughput = tp;
-}
-
-/* get the volume attenuation over line segment defined by ray, with the
- * assumption that there are no surfaces blocking light between the endpoints */
-# if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput);
-}
-extern "C" __device__ void __direct_callable__kernel_volume_shadow(
-# else
-ccl_device_noinline void kernel_volume_shadow(
-# endif
- KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- shader_setup_from_volume(kg, shadow_sd, ray);
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- if (step_size != FLT_MAX)
- kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size);
- else
- kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
-}
-
-#endif /* __VOLUME__ */
-
-/* Equi-angular sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, float xi, float *pdf)
-{
- float t = ray->t;
-
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
- if (UNLIKELY(theta_b == theta_a)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return min(t, delta + t_); /* min is only for float precision errors */
-}
-
-ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t)
-{
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- return 0.0f;
- }
-
- float t = ray->t;
- float t_ = sample_t - delta;
-
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- if (UNLIKELY(theta_b == theta_a)) {
- return 0.0f;
- }
-
- float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return pdf;
-}
-
-/* Distance sampling */
-
-ccl_device float kernel_volume_distance_sample(
- float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
-{
- /* xi is [0, 1[ so log(0) should never happen, division by zero is
- * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float sample_transmittance = kernel_volume_channel_get(full_transmittance, channel);
-
- float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
-
- *transmittance = volume_color_transmittance(sigma_t, sample_t);
- *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
-
- /* todo: optimization: when taken together with hit/miss decision,
- * the full_transmittance cancels out and xi does not
- * need to be remapped */
-
- return sample_t;
-}
-
-ccl_device float3 kernel_volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
-{
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
-
- return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
-}
-
-/* Emission */
-
-ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coeff,
- int closure_flag,
- float3 transmittance,
- float t)
-{
- /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
- * this goes to E * t as sigma_t goes to zero
- *
- * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
- float3 emission = coeff->emission;
-
- if (closure_flag & SD_EXTINCTION) {
- float3 sigma_t = coeff->sigma_t;
-
- emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
- emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
- emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
- }
- else
- emission *= t;
-
- return emission;
-}
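A one-dimensional recap of the closed form used above (editor's sketch, not part of the patch; expf comes from the C math library):

/* integral_0^t E * exp(-sigma * s) ds = E * (1 - exp(-sigma * t)) / sigma,
 * which tends to E * t as sigma -> 0, matching the fallback above. */
static float emission_integral_1d_example(float E, float sigma, float t)
{
  return (sigma > 0.0f) ? E * (1.0f - expf(-sigma * t)) / sigma : E * t;
}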
-
-/* Volume Path */
-
-ccl_device int kernel_volume_sample_channel(float3 albedo,
- float3 throughput,
- float rand,
- float3 *pdf)
-{
- /* Sample color channel proportional to throughput and single scattering
- * albedo, to significantly reduce noise with many bounces, following:
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
- float3 weights = fabs(throughput * albedo);
- float sum_weights = weights.x + weights.y + weights.z;
- float3 weights_pdf;
-
- if (sum_weights > 0.0f) {
- weights_pdf = weights / sum_weights;
- }
- else {
- weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
- }
-
- *pdf = weights_pdf;
-
- /* OpenCL does not support -> on float3, so don't use pdf->x. */
- if (rand < weights_pdf.x) {
- return 0;
- }
- else if (rand < weights_pdf.x + weights_pdf.y) {
- return 1;
- }
- else {
- return 2;
- }
-}
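A worked example of the channel selection above (editor's illustration, not from the patch):

/* With throughput (0.8, 0.4, 0.2) and albedo (0.5, 0.5, 0.5) the weights are
 * (0.4, 0.2, 0.1), so the channel pdf is (4/7, 2/7, 1/7). A random number of
 * 0.6 lies past 4/7 ~= 0.571 but before 6/7 ~= 0.857, selecting channel 1 (green). */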
-
-#ifdef __VOLUME__
-
-/* homogeneous volume: assume shader evaluation at the start gives
- * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- bool probalistic_scatter)
-{
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- if (!volume_shader_sample(kg, sd, state, ray->P, &coeff))
- return VOLUME_PATH_MISSED;
-
- int closure_flag = sd->flag;
- float t = ray->t;
- float3 new_tp;
-
-# ifdef __VOLUME_SCATTER__
- /* randomly scatter, and if we do t is shortened */
- if (closure_flag & SD_SCATTER) {
- /* Sample channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf);
-
- /* decide if we will hit or miss */
- bool scatter = true;
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- if (probalistic_scatter) {
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float sample_transmittance = expf(-sample_sigma_t * t);
-
- if (1.0f - xi >= sample_transmittance) {
- scatter = true;
-
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else
- scatter = false;
- }
-
- if (scatter) {
- /* scattering */
- float3 pdf;
- float3 transmittance;
- float sample_t;
-
- /* distance sampling */
- sample_t = kernel_volume_distance_sample(
- ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- pdf *= one_float3() - volume_color_transmittance(coeff.sigma_t, t);
-
- new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf);
- t = sample_t;
- }
- else {
- /* no scattering */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- float pdf = dot(channel_pdf, transmittance);
- new_tp = *throughput * transmittance / pdf;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- new_tp = *throughput * transmittance;
- }
- else {
- new_tp = *throughput;
- }
-
- /* integrate emission attenuated by extinction */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t);
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, ray->t);
- path_radiance_accum_emission(kg, L, state, *throughput, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- *throughput = new_tp;
-
- /* prepare to scatter to new direction */
- if (t < ray->t) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
- }
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
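A worked example of the probabilistic hit/miss decision and the xi rescale above (editor's illustration, not from the patch):

/* With sample_transmittance = 0.3 the path scatters with probability 0.7.
 * In the scatter case with xi = 0.5, xi is remapped to
 *   1 - (1 - 0.5 - 0.3) / (1 - 0.3) = 1 - 0.2 / 0.7 ~= 0.714,
 * which remains uniformly distributed on [0, 1) for the distance sampling. */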
-
-/* heterogeneous volume distance sampling: integrate stepping through the
- * volume until we reach the end, get absorbed entirely, or run out of
- * iterations. this probabilistically scatters or transmits through, for
- * path tracing where we don't want to branch. */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * Using a different step offset for the first step avoids banding artifacts. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, steps_offset;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
- /* compute coefficients at the start */
- float t = 0.0f;
- float3 accum_transmittance = one_float3();
-
- /* pick random color channel, we use the Veach one-sample
- * model with balance heuristic for the channels */
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- bool has_scatter = false;
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 new_tp;
- float3 transmittance;
- bool scatter = false;
-
- /* distance sampling */
-# ifdef __VOLUME_SCATTER__
- if ((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) {
- has_scatter = true;
-
- /* Sample channel, use MIS with balance heuristic. */
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf);
-
- /* compute transmittance over full step */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
-
- /* decide if we will scatter or continue */
- float sample_transmittance = kernel_volume_channel_get(transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* compute sampling distance */
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float new_dt = -logf(1.0f - xi) / sample_sigma_t;
- new_t = t + new_dt;
-
- /* transmittance and pdf */
- float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
- float3 pdf = coeff.sigma_t * new_transmittance;
-
- /* throughput */
- new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf);
- scatter = true;
- }
- else {
- /* throughput */
- float pdf = dot(channel_pdf, transmittance);
- new_tp = tp * transmittance / pdf;
-
- /* remap xi so we can reuse it and keep things stratified */
- xi = 1.0f - (1.0f - xi) / sample_transmittance;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
- new_tp = tp * transmittance;
- }
- else {
- transmittance = zero_float3();
- new_tp = tp;
- }
-
- /* integrate emission attenuated by absorption */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- path_radiance_accum_emission(kg, L, state, tp, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- tp = new_tp;
-
- /* stop if nearly all light blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON) {
- tp = zero_float3();
- break;
- }
- }
-
- /* prepare to scatter to new direction */
- if (scatter) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + new_t * ray->D;
- *throughput = tp;
-
- return VOLUME_PATH_SCATTERED;
- }
- else {
- /* accumulate transmittance */
- accum_transmittance *= transmittance;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
- }
-
- *throughput = tp;
-
- return VOLUME_PATH_ATTENUATED;
-}
-
-/* get the volume attenuation and emission over line segment defined by
- * ray, with the assumption that there are no surfaces blocking light
- * between the endpoints. distance sampling is used to decide if we will
- * scatter or not. */
-ccl_device_noinline_cpu VolumeIntegrateResult
-kernel_volume_integrate(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- float step_size)
-{
- shader_setup_from_volume(kg, sd, ray);
-
- if (step_size != FLT_MAX)
- return kernel_volume_integrate_heterogeneous_distance(
- kg, state, ray, sd, L, throughput, step_size);
- else
- return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
-}
-
-# ifndef __SPLIT_KERNEL__
-/* Decoupled Volume Sampling
- *
- * VolumeSegment is list of coefficients and transmittance stored at all steps
- * through a volume. This can then later be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media"
- *
- * On the GPU this is only supported (but currently not enabled)
- * for homogeneous volumes (1 step), due to
- * no support for malloc/free and too much stack usage with a fixed-size array. */
-
-typedef struct VolumeStep {
- float3 sigma_s; /* scatter coefficient */
- float3 sigma_t; /* extinction coefficient */
- float3 accum_transmittance; /* accumulated transmittance including this step */
- float3 cdf_distance; /* cumulative density function for distance sampling */
- float t; /* distance at end of this step */
- float shade_t; /* jittered distance where shading was done in step */
- int closure_flag; /* shader evaluation closure flags */
-} VolumeStep;
-
-typedef struct VolumeSegment {
- VolumeStep stack_step; /* stack storage for homogeneous step, to avoid malloc */
- VolumeStep *steps; /* recorded steps */
- int numsteps; /* number of steps */
- int closure_flag; /* accumulated closure flags from all steps */
-
- float3 accum_emission; /* accumulated emission at end of segment */
- float3 accum_transmittance; /* accumulated transmittance at end of segment */
- float3 accum_albedo; /* accumulated average albedo over segment */
-
- int sampling_method; /* volume sampling method */
-} VolumeSegment;
-
-/* record volume steps to the end of the volume.
- *
- * it would be nice if we could only record up to the point that we need to scatter,
- * but the entire segment is needed to always scatter, rather than probabilistically
- * hitting or missing the volume. if we don't know the transmittance at the end of the
- * volume we can't generate stratified distance samples up to that transmittance */
-# ifdef __VOLUME_DECOUPLED__
-ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- VolumeSegment *segment,
- const float object_step_size)
-{
- /* prepare for volume stepping */
- int max_steps;
- float step_size, step_shade_offset, steps_offset;
-
- if (object_step_size != FLT_MAX) {
- max_steps = kernel_data.integrator.volume_max_steps;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
-# ifdef __KERNEL_CPU__
- /* NOTE: For the branched path tracing it's possible to have direct
- * and indirect light integration both having volume segments allocated.
- * We detect this using index in the pre-allocated memory. Currently we
- * only support two segments allocated at a time, if more needed some
- * modifications to the KernelGlobals will be needed.
- *
- * This restricts decoupled record to stack-like usage: a subsequent call of
- * decoupled record must free its memory before its caller frees memory.
- */
- const int index = kg->decoupled_volume_steps_index;
- assert(index < sizeof(kg->decoupled_volume_steps) / sizeof(*kg->decoupled_volume_steps));
- if (kg->decoupled_volume_steps[index] == NULL) {
- kg->decoupled_volume_steps[index] = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
- }
- segment->steps = kg->decoupled_volume_steps[index];
- ++kg->decoupled_volume_steps_index;
-# else
- segment->steps = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
-# endif
- }
- else {
- max_steps = 1;
- step_size = ray->t;
- step_shade_offset = 0.0f;
- steps_offset = 1.0f;
- segment->steps = &segment->stack_step;
- }
-
- /* init accumulation variables */
- float3 accum_emission = zero_float3();
- float3 accum_transmittance = one_float3();
- float3 accum_albedo = zero_float3();
- float3 cdf_distance = zero_float3();
- float t = 0.0f;
-
- segment->numsteps = 0;
- segment->closure_flag = 0;
- bool is_last_step_empty = false;
-
- VolumeStep *step = segment->steps;
-
- for (int i = 0; i < max_steps; i++, step++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 sigma_t = coeff.sigma_t;
-
- /* compute average albedo for channel sampling */
- if (closure_flag & SD_SCATTER) {
- accum_albedo += (dt / ray->t) * safe_divide_color(coeff.sigma_s, sigma_t);
- }
-
- /* compute accumulated transmittance */
- float3 transmittance = volume_color_transmittance(sigma_t, dt);
-
- /* compute emission attenuated by absorption */
- if (closure_flag & SD_EMISSION) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- accum_emission += accum_transmittance * emission;
- }
-
- accum_transmittance *= transmittance;
-
- /* compute pdf for distance sampling */
- float3 pdf_distance = dt * accum_transmittance * coeff.sigma_s;
- cdf_distance = cdf_distance + pdf_distance;
-
- /* write step data */
- step->sigma_t = sigma_t;
- step->sigma_s = coeff.sigma_s;
- step->closure_flag = closure_flag;
-
- segment->closure_flag |= closure_flag;
-
- is_last_step_empty = false;
- segment->numsteps++;
- }
- else {
- if (is_last_step_empty) {
- /* consecutive empty step, merge */
- step--;
- }
- else {
- /* store empty step */
- step->sigma_t = zero_float3();
- step->sigma_s = zero_float3();
- step->closure_flag = 0;
-
- segment->numsteps++;
- is_last_step_empty = true;
- }
- }
-
- step->accum_transmittance = accum_transmittance;
- step->cdf_distance = cdf_distance;
- step->t = new_t;
- step->shade_t = t + dt * step_shade_offset;
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
-
- /* stop if nearly all light blocked */
- if (accum_transmittance.x < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.y < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
-
- /* store total emission and transmittance */
- segment->accum_emission = accum_emission;
- segment->accum_transmittance = accum_transmittance;
- segment->accum_albedo = accum_albedo;
-
- /* normalize cumulative density function for distance sampling */
- VolumeStep *last_step = segment->steps + segment->numsteps - 1;
-
- if (!is_zero(last_step->cdf_distance)) {
- VolumeStep *step = &segment->steps[0];
- int numsteps = segment->numsteps;
- float3 inv_cdf_distance_sum = safe_invert_color(last_step->cdf_distance);
-
- for (int i = 0; i < numsteps; i++, step++)
- step->cdf_distance *= inv_cdf_distance_sum;
- }
-}
-
-ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
-{
- if (segment->steps != &segment->stack_step) {
-# ifdef __KERNEL_CPU__
- /* NOTE: We only allow freeing the last allocated segment.
- * No random order of alloc/free is supported.
- */
- assert(kg->decoupled_volume_steps_index > 0);
- assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]);
- --kg->decoupled_volume_steps_index;
-# else
- free(segment->steps);
-# endif
- }
-}
-# endif /* __VOLUME_DECOUPLED__ */
-
-/* scattering for homogeneous and heterogeneous volumes, using decoupled ray
- * marching.
- *
- * function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
-ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- float rphase,
- float rscatter,
- const VolumeSegment *segment,
- const float3 *light_P,
- bool probalistic_scatter)
-{
- kernel_assert(segment->closure_flag & SD_SCATTER);
-
- /* Sample color channel, use MIS with balance heuristic. */
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(
- segment->accum_albedo, *throughput, rphase, &channel_pdf);
-
- float xi = rscatter;
-
- /* probabilistic scattering decision based on transmittance */
- if (probalistic_scatter) {
- float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else {
- *throughput /= sample_transmittance;
- return VOLUME_PATH_MISSED;
- }
- }
-
- VolumeStep *step;
- float3 transmittance;
- float pdf, sample_t;
- float mis_weight = 1.0f;
- bool distance_sample = true;
- bool use_mis = false;
-
- if (segment->sampling_method && light_P) {
- if (segment->sampling_method == SD_VOLUME_MIS) {
- /* multiple importance sample: randomly pick between
- * equiangular and distance sampling strategy */
- if (xi < 0.5f) {
- xi *= 2.0f;
- }
- else {
- xi = (xi - 0.5f) * 2.0f;
- distance_sample = false;
- }
-
- use_mis = true;
- }
- else {
- /* only equiangular sampling */
- distance_sample = false;
- }
- }
-
- /* distance sampling */
- if (distance_sample) {
- /* find step in cdf */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float prev_cdf = 0.0f;
- float step_cdf = 1.0f;
- float3 prev_cdf_distance = zero_float3();
-
- for (int i = 0;; i++, step++) {
- /* todo: optimize using binary search */
- step_cdf = kernel_volume_channel_get(step->cdf_distance, channel);
-
- if (xi < step_cdf || i == segment->numsteps - 1)
- break;
-
- prev_cdf = step_cdf;
- prev_t = step->t;
- prev_cdf_distance = step->cdf_distance;
- }
-
- /* remap xi so we can reuse it */
- xi = (xi - prev_cdf) / (step_cdf - prev_cdf);
-
- /* pdf for picking step */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
-
- /* sample distance and compute transmittance */
- float3 distance_pdf;
- sample_t = prev_t + kernel_volume_distance_sample(
- step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- distance_pdf *= one_float3() - segment->accum_transmittance;
-
- pdf = dot(channel_pdf, distance_pdf * step_pdf_distance);
-
- /* multiple importance sampling */
- if (use_mis) {
- float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
- mis_weight = 2.0f * power_heuristic(pdf, equi_pdf);
- }
- }
- /* equi-angular sampling */
- else {
- /* sample distance */
- sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
-
- /* find step in which sampled distance is located */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float3 prev_cdf_distance = zero_float3();
-
- int numsteps = segment->numsteps;
- int high = numsteps - 1;
- int low = 0;
- int mid;
-
- while (low < high) {
- mid = (low + high) >> 1;
-
- if (sample_t < step[mid].t)
- high = mid;
- else if (sample_t >= step[mid + 1].t)
- low = mid + 1;
- else {
- /* found our interval in step[mid] .. step[mid+1] */
- prev_t = step[mid].t;
- prev_cdf_distance = step[mid].cdf_distance;
- step += mid + 1;
- break;
- }
- }
-
- if (low >= numsteps - 1) {
- prev_t = step[numsteps - 1].t;
- prev_cdf_distance = step[numsteps - 1].cdf_distance;
- step += numsteps - 1;
- }
-
- /* pdf for picking step with distance sampling */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
- float step_sample_t = sample_t - prev_t;
-
- /* compute transmittance */
- transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
-
- /* multiple importance sampling */
- if (use_mis) {
- float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
- float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance);
- mis_weight = 2.0f * power_heuristic(pdf, distance_pdf);
- }
- }
- if (sample_t < 0.0f || pdf == 0.0f) {
- return VOLUME_PATH_MISSED;
- }
-
- /* compute transmittance up to this step */
- if (step != segment->steps)
- transmittance *= (step - 1)->accum_transmittance;
-
- /* modify throughput */
- *throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
-
- /* evaluate shader to create closures at shading point */
- if (segment->numsteps > 1) {
- sd->P = ray->P + step->shade_t * ray->D;
-
- VolumeShaderCoefficients coeff;
- volume_shader_sample(kg, sd, state, sd->P, &coeff);
- }
-
- /* move to new position */
- sd->P = ray->P + sample_t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
-}
-# endif /* __SPLIT_KERNEL */
-
-/* decide if we need to use decoupled or not */
-ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg,
- bool heterogeneous,
- bool direct,
- int sampling_method)
-{
- /* decoupled ray marching for heterogeneous volumes not supported on the GPU,
- * which also means equiangular and multiple importance sampling are not
- * supported for that case */
- if (!kernel_data.integrator.volume_decoupled)
- return false;
-
-# ifdef __KERNEL_GPU__
- if (heterogeneous)
- return false;
-# endif
-
- /* equiangular and multiple importance sampling only implemented for decoupled */
- if (sampling_method != 0)
- return true;
-
- /* for all light sampling use decoupled, reusing shader evaluations is
- * typically faster in that case */
- if (direct)
- return kernel_data.integrator.sample_all_lights_direct;
- else
- return kernel_data.integrator.sample_all_lights_indirect;
-}
-
-/* Volume Stack
- *
- * This is an array of object/shader IDs that the current segment of the path
- * is inside of. */
-
-ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space const PathState *state,
- ccl_addr_space const Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
- /* NULL ray happens in the baker, does it need proper initialization of
- * camera in volume?
- */
- if (!kernel_data.cam.is_inside_volume || ray == NULL) {
- /* Camera is guaranteed to be in the air, only take background volume
- * into account in this case.
- */
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = PRIM_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[0].shader = SHADER_NONE;
- }
- return;
- }
-
- kernel_assert(state->flag & PATH_RAY_CAMERA);
-
- Ray volume_ray = *ray;
- volume_ray.t = FLT_MAX;
-
- const uint visibility = (state->flag & PATH_RAY_ALL_VISIBILITY);
- int stack_index = 0, enclosed_index = 0;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
- if (num_hits > 0) {
- int enclosed_volumes[VOLUME_STACK_SIZE];
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
- /* If ray exited the volume and never entered that volume
- * it means that the camera is inside such a volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If ray from camera enters the volume, this volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
- }
- }
-# else
- int enclosed_volumes[VOLUME_STACK_SIZE];
- int step = 0;
-
- while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
- step < 2 * VOLUME_STACK_SIZE) {
- Intersection isect;
- if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
- break;
- }
-
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
- /* If ray exited the volume and never entered that volume
- * it means that the camera is inside such a volume.
- */
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
- /* If ray exited the volume and never entered that volume
- * it means that the camera is inside such a volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If ray from camera enters the volume, this volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- ++step;
- }
-# endif
- /* stack_index of 0 means the quick checks outside of the kernel gave a false
- * positive; nothing to worry about, we have just wasted a few ticks to
- * conclude that the camera is in the air.
- *
- * In this case we do the same as above -- check whether the background has
- * volume.
- */
- if (stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = OBJECT_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[stack_index].shader = SHADER_NONE;
- }
-}
-
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space VolumeStack *stack)
-{
- /* todo: we should have some way for objects to indicate if they want the
- * world shader to work inside them. excluding it by default is problematic
- * because non-volume objects can't be assumed to be closed manifolds */
-
- if (!(sd->flag & SD_HAS_VOLUME))
- return;
-
- if (sd->flag & SD_BACKFACING) {
- /* exit volume object: remove from stack */
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- if (stack[i].object == sd->object) {
- /* shift back next stack entries */
- do {
- stack[i] = stack[i + 1];
- i++;
- } while (stack[i].shader != SHADER_NONE);
-
- return;
- }
- }
- }
- else {
- /* enter volume object: add to stack */
- int i;
-
- for (i = 0; stack[i].shader != SHADER_NONE; i++) {
- /* already in the stack? then we have nothing to do */
- if (stack[i].object == sd->object)
- return;
- }
-
- /* if we exceed the stack limit, ignore */
- if (i >= VOLUME_STACK_SIZE - 1)
- return;
-
- /* add to the end of the stack */
- stack[i].shader = sd->shader;
- stack[i].object = sd->object;
- stack[i + 1].shader = SHADER_NONE;
- }
-}
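A short trace of the stack semantics above (editor's illustration, not from the patch):

/* The stack is SHADER_NONE-terminated. A camera ray entering object A, then
 * object B, then leaving A evolves as
 *   [A, NONE] -> [A, B, NONE] -> [B, NONE]
 * where exiting A shifts the later entries down, as implemented above. */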
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
- ShaderData *stack_sd,
- Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
- kernel_assert(kernel_data.integrator.use_volumes);
-
- Ray volume_ray = *ray;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
- if (num_hits > 0) {
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
- }
- }
-# else
- Intersection isect;
- int step = 0;
- float3 Pend = ray->P + ray->D * ray->t;
- while (step < 2 * VOLUME_STACK_SIZE &&
- scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- if (volume_ray.t != FLT_MAX) {
- volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t);
- }
- ++step;
- }
-# endif
-}
-# endif
-
-/* Clean stack after the last bounce.
- *
- * All volumes are expected to be closed manifolds, so when the ray hits nothing
- * (for example, on the last bounce going to the environment) the only volume
- * left in the stack should be the world's one; all other volume entries should
- * have been exited already.
- *
- * This isn't always true because of ray intersection precision issues, which
- * could leave a non-world volume stuck in the stack, causing render
- * artifacts.
- *
- * Use this function after the last bounce to get rid of all volumes apart from
- * the world's one, avoiding such artifacts.
- */
-ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
- ccl_addr_space VolumeStack *volume_stack)
-{
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- /* Keep the world's volume in stack. */
- volume_stack[1].shader = SHADER_NONE;
- }
- else {
- volume_stack[0].shader = SHADER_NONE;
- }
-}
-
-#endif /* __VOLUME__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index d1602744f1d..fab0915c38e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_WORK_STEALING_H__
-#define __KERNEL_WORK_STEALING_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN
*/
/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
uint global_work_index,
ccl_private uint *x,
ccl_private uint *y,
ccl_private uint *sample)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
+#if 0
+ /* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
+#else
+ /* Keep threads for the same pixel together.
+ * This appears to improve performance by a few % on CUDA and OptiX. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#endif
+
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;
@@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
*sample = tile->start_sample + sample_offset;
}
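A worked example of the index mapping above (editor's illustration, not from the patch):

/* With tile->w = 4, tile->h = 2 and tile->num_samples = 8, global_work_index = 21
 * gives sample_offset = 21 % 8 = 5 and pixel_offset = 21 / 8 = 2, hence
 * y_offset = 0 and x_offset = 2: pixel (x + 2, y + 0) of the tile at sample
 * start_sample + 5, so all samples of one pixel land on consecutive threads. */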
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#endif
-
-#ifdef __SPLIT_KERNEL__
-/* Returns true if there is work */
-ccl_device bool get_next_work_item(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- /* With a small amount of work there may be more threads than work due to
- * rounding up of global size, stop such threads immediately. */
- if (ray_index >= total_work_size) {
- return false;
- }
-
- /* Increase atomic work index counter in pool. */
- uint pool = ray_index / WORK_POOL_SIZE;
- uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
-
- /* Map per-pool work index to a global work index. */
- uint global_size = ccl_global_size(0) * ccl_global_size(1);
- kernel_assert(global_size % WORK_POOL_SIZE == 0);
- kernel_assert(ray_index < global_size);
-
- *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
- (work_index % WORK_POOL_SIZE);
-
- /* Test if all work for this pool is done. */
- return (*global_work_index < total_work_size);
-}
-
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- bool got_work = false;
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- do {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- if (got_work) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, *global_work_index, &x, &y, &sample);
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- break;
- }
- }
- } while (got_work);
- }
- else {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- }
- return got_work;
-}
-#endif
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernel_write_passes.h b/intern/cycles/kernel/kernel_write_passes.h
index 410218d91d4..9d379495629 100644
--- a/intern/cycles/kernel/kernel_write_passes.h
+++ b/intern/cycles/kernel/kernel_write_passes.h
@@ -14,23 +14,25 @@
* limitations under the License.
*/
-#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#pragma once
+
+#ifdef __KERNEL_GPU__
# define __ATOMIC_PASS_WRITE__
#endif
CCL_NAMESPACE_BEGIN
-ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
+ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value)
{
- ccl_global float *buf = buffer;
#ifdef __ATOMIC_PASS_WRITE__
- atomic_add_and_fetch_float(buf, value);
+ atomic_add_and_fetch_float(buffer, value);
#else
- *buf += value;
+ *buffer += value;
#endif
}
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer,
+ float3 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -41,12 +43,14 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
atomic_add_and_fetch_float(buf_y, value.y);
atomic_add_and_fetch_float(buf_z, value.z);
#else
- ccl_global float3 *buf = (ccl_global float3 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
#endif
}
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer,
+ float4 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -59,37 +63,26 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
atomic_add_and_fetch_float(buf_z, value.z);
atomic_add_and_fetch_float(buf_w, value.w);
#else
- ccl_global float4 *buf = (ccl_global float4 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
+ buffer[3] += value.w;
#endif
}
-#ifdef __DENOISING_FEATURES__
-ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
+ccl_device_inline float kernel_read_pass_float(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float(buffer, value);
-
- /* The online one-pass variance update that's used for the megakernel can't easily be implemented
- * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
- kernel_write_pass_float(buffer + 1, value * value);
+ return *buffer;
}
-# ifdef __ATOMIC_PASS_WRITE__
-# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
-# else
-ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
+ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer)
{
- buffer[0] += value.x;
- buffer[1] += value.y;
- buffer[2] += value.z;
+ return make_float3(buffer[0], buffer[1], buffer[2]);
}
-# endif
-ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
+ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float3_unaligned(buffer, value);
- kernel_write_pass_float3_unaligned(buffer + 3, value * value);
+ return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
}
-#endif /* __DENOISING_FEATURES__ */
CCL_NAMESPACE_END
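On GPU builds (__ATOMIC_PASS_WRITE__) every component above goes through atomic_add_and_fetch_float so concurrent threads accumulating into the same render pass do not lose writes. As an illustrative sketch only, not the Cycles implementation, a float atomic add is commonly emulated on top of a 32-bit compare-and-swap like this:

#include <atomic>
#include <cstdint>
#include <cstring>

/* Emulate an atomic float add with a CAS loop on the value's bit pattern.
 * Illustrative only; many devices provide a native float atomic instead. */
static float atomic_add_float(std::atomic<uint32_t> *dst, float value)
{
  uint32_t old_bits = dst->load(std::memory_order_relaxed);
  for (;;) {
    float old_val;
    std::memcpy(&old_val, &old_bits, sizeof(float));
    const float new_val = old_val + value;
    uint32_t new_bits;
    std::memcpy(&new_bits, &new_val, sizeof(float));
    /* On failure, compare_exchange_weak reloads old_bits and we retry. */
    if (dst->compare_exchange_weak(old_bits, new_bits)) {
      return new_val;
    }
  }
}

The plain branch keeps the simple per-component "buffer[i] += value" form, since on the CPU a given pixel's pass values are typically written by a single thread and no contention occurs.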
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
deleted file mode 100644
index 145a6b6ac40..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-/* When building kernel for native machine detect kernel features from the flags
- * set by compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
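The WITH_KERNEL_NATIVE block above only translates the compiler's own feature macros (set by flags such as -march=native) into the kernel's __KERNEL_*__ switches. A tiny sketch of the same idea, using a made-up switch name for illustration:

#include <cstdio>

/* When built with e.g. -march=native on an AVX2 machine, the compiler
 * defines __AVX2__; map that onto a project-local switch. The name
 * KERNEL_HAS_AVX2 is hypothetical, used only for this sketch. */
#if defined(__AVX2__)
#  define KERNEL_HAS_AVX2 1
#else
#  define KERNEL_HAS_AVX2 0
#endif

int main()
{
  std::printf("AVX2 code path compiled in: %d\n", KERNEL_HAS_AVX2);
  return 0;
}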
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
deleted file mode 100644
index 012daba62d8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
deleted file mode 100644
index 16351a7f949..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
deleted file mode 100644
index 1423b182ab8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleV,
- float *sampleVV,
- float *bufferV,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tiles,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample);
-
-#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
deleted file mode 100644
index 3d4cb87e104..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that particular .cpp files sets needed optimization flags and
- * simply includes this file without worry of copying actual implementation over.
- */
-
-#include "kernel/kernel_compat_cpu.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-#ifdef KERNEL_STUB
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* Denoise filter */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
-#else
- kernel_filter_divide_shadow(sample,
- tile_info,
- x,
- y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
-#else
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset,
- v_offset,
- x,
- y,
- mean,
- variance,
- scale,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_write_feature);
-#else
- kernel_filter_write_feature(
- sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect));
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
-#else
- kernel_filter_detect_outliers(
- x, y, image, variance, depth, output, load_int4(rect), pass_stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
-#else
- kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tile_info,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *prefilter_rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
-#else
- rank += storage_ofs;
- transform += storage_ofs * TRANSFORM_SIZE;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x,
- y,
- load_int4(prefilter_rect),
- pass_stride,
- frame_stride,
- use_time,
- transform,
- rank,
- radius,
- pca_threshold);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
-#else
- kernel_filter_nlm_calc_difference(dx,
- dy,
- weight_image,
- variance_image,
- scale_image,
- difference_image,
- load_int4(rect),
- stride,
- channel_offset,
- frame_offset,
- a,
- k_2);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
-#else
- kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
-#else
- kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
-#else
- kernel_filter_nlm_update_output(dx,
- dy,
- difference_image,
- image,
- temp_image,
- out_image,
- accum_image,
- load_int4(rect),
- channel_offset,
- stride,
- f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
-#else
- kernel_filter_nlm_construct_gramian(dx,
- dy,
- t,
- difference_image,
- buffer,
- transform,
- rank,
- XtWX,
- XtWY,
- load_int4(rect),
- load_int4(filter_window),
- stride,
- f,
- pass_stride,
- frame_offset,
- use_time);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
-#else
- kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_finalize);
-#else
- XtWX += storage_ofs * XTWX_SIZE;
- XtWY += storage_ofs * XTWY_SIZE;
- rank += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
-#endif
-}
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
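The KERNEL_FUNCTION_FULL_NAME entry points above exist once per architecture because the same implementation file is included repeatedly with a different KERNEL_ARCH. A minimal sketch of the token-pasting pattern that makes this work; the macro names here mirror the idea rather than the exact Cycles spellings:

#include <cstdio>

/* Two-level expansion so KERNEL_ARCH is expanded before token pasting. */
#define KERNEL_NAME_JOIN(a, b) kernel_##a##_##b
#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)

/* A per-architecture translation unit defines KERNEL_ARCH, includes the
 * shared implementation, then undefines it again. */
#define KERNEL_ARCH cpu_avx2
void KERNEL_FUNCTION_FULL_NAME(filter_demo)(int x) /* expands to kernel_cpu_avx2_filter_demo */
{
  std::printf("avx2 filter_demo(%d)\n", x);
}
#undef KERNEL_ARCH

int main()
{
  kernel_cpu_avx2_filter_demo(7);
  return 0;
}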
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
deleted file mode 100644
index 75833d83648..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
deleted file mode 100644
index c998cd54d3a..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
deleted file mode 100644
index fc4ef1fca5b..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
deleted file mode 100644
index ea3103f12c3..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample);
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-/* Split kernels */
-
-void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
-#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * data);
-
-DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
-DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
-DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
-DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-#undef KERNEL_ARCH
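Because this declaration header was included once per architecture, the CPU device ended up with kernel_cpu_path_trace, kernel_cpu_sse2_path_trace, kernel_cpu_avx2_path_trace and so on, and picked one at runtime. A sketch of that kind of dispatch follows; the feature probe and the *_demo function names are assumptions for illustration, not the Cycles device API:

#include <cstdio>

using PathTraceFn = void (*)(int sample);

static void kernel_cpu_path_trace_demo(int s)      { std::printf("scalar path, sample %d\n", s); }
static void kernel_cpu_avx2_path_trace_demo(int s) { std::printf("avx2 path, sample %d\n", s); }

int main()
{
#if defined(__GNUC__) || defined(__clang__)
  /* x86 GCC/Clang builtin; other compilers/architectures need a different probe. */
  const bool has_avx2 = __builtin_cpu_supports("avx2");
#else
  const bool has_avx2 = false;
#endif
  const PathTraceFn fn = has_avx2 ? kernel_cpu_avx2_path_trace_demo : kernel_cpu_path_trace_demo;
  fn(0);
  return 0;
}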
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
deleted file mode 100644
index 51d6c23f72f..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that particular .cpp files sets needed optimization flags and
- * simply includes this file without worry of copying actual implementation over.
- */
-
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-
-#ifndef KERNEL_STUB
-# ifndef __SPLIT_KERNEL__
-# include "kernel/kernel_math.h"
-# include "kernel/kernel_types.h"
-
-# include "kernel/split/kernel_split_data.h"
-# include "kernel/kernel_globals.h"
-
-# include "kernel/kernel_color.h"
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-# include "kernel/kernel_film.h"
-# include "kernel/kernel_path.h"
-# include "kernel/kernel_path_branched.h"
-# include "kernel/kernel_bake.h"
-# else
-# include "kernel/split/kernel_split_common.h"
-
-# include "kernel/split/kernel_data_init.h"
-# include "kernel/split/kernel_path_init.h"
-# include "kernel/split/kernel_scene_intersect.h"
-# include "kernel/split/kernel_lamp_emission.h"
-# include "kernel/split/kernel_do_volume.h"
-# include "kernel/split/kernel_queue_enqueue.h"
-# include "kernel/split/kernel_indirect_background.h"
-# include "kernel/split/kernel_shader_setup.h"
-# include "kernel/split/kernel_shader_sort.h"
-# include "kernel/split/kernel_shader_eval.h"
-# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-# include "kernel/split/kernel_subsurface_scatter.h"
-# include "kernel/split/kernel_direct_lighting.h"
-# include "kernel/split/kernel_shadow_blocked_ao.h"
-# include "kernel/split/kernel_shadow_blocked_dl.h"
-# include "kernel/split/kernel_enqueue_inactive.h"
-# include "kernel/split/kernel_next_iteration_setup.h"
-# include "kernel/split/kernel_indirect_subsurface.h"
-# include "kernel/split/kernel_buffer_update.h"
-# include "kernel/split/kernel_adaptive_stopping.h"
-# include "kernel/split/kernel_adaptive_filter_x.h"
-# include "kernel/split/kernel_adaptive_filter_y.h"
-# include "kernel/split/kernel_adaptive_adjust_samples.h"
-# endif /* __SPLIT_KERNEL__ */
-#else
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-
-# ifdef __SPLIT_KERNEL__
-# include "kernel/split/kernel_data_init.h"
-# endif /* __SPLIT_KERNEL__ */
-#endif /* KERNEL_STUB */
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __SPLIT_KERNEL__
-
-/* Path Tracing */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, path_trace);
-# else
-# ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- kernel_branched_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
- else
-# endif
- {
- kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
-# endif /* KERNEL_STUB */
-}
-
-/* Film */
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
-# else
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
-# else
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-/* Bake */
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, bake);
-# else
-# ifdef __BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-# endif
-# endif /* KERNEL_STUB */
-}
-
-/* Shader Evaluate */
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, shader);
-# else
- if (type == SHADER_EVAL_DISPLACE) {
- kernel_displace_evaluate(kg, input, output, i);
- }
- else {
- kernel_background_evaluate(kg, input, output, i);
- }
-# endif /* KERNEL_STUB */
-}
-
-#else /* __SPLIT_KERNEL__ */
-
-/* Split Kernel Path Tracing */
-
-# ifdef KERNEL_STUB
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-# else
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- kernel_##name(kg); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- ccl_local type locals; \
- kernel_##name(kg, &locals); \
- }
-# endif /* KERNEL_STUB */
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao,
- BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-#endif /* __SPLIT_KERNEL__ */
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
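When a translation unit is built without its optimization flag (KERNEL_STUB), every entry point above collapses into a loud failure through STUB_ASSERT. The trick is simply stringizing the kernel and architecture names into the assert message; a self-contained sketch:

#include <cassert>

/* A string literal is never null, so assert(!(...)) always fires if the stub
 * is ever reached, and the message names the kernel and architecture. */
#define STUB_ASSERT(arch, name) \
  assert(!(#name " kernel stub for architecture " #arch " was called!"))

static void stub_example()
{
  STUB_ASSERT(cpu_avx2, path_trace); /* aborts with a descriptive message */
}

int main()
{
  /* stub_example() is deliberately not called; calling it would abort. */
  (void)stub_example;
  return 0;
}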
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
deleted file mode 100644
index 989f5e5aaa8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-#define __SPLIT_KERNEL__
-
-/* When building kernel for native machine detect kernel features from the flags
- * set by compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
deleted file mode 100644
index 40e485d27c0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
deleted file mode 100644
index 8c44238470e..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
deleted file mode 100644
index 7a3f218d5fc..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
deleted file mode 100644
index 1cab59e0ea0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
deleted file mode 100644
index 637126d9d4c..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without for other CPU's. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
deleted file mode 100644
index 6c9642d1f03..00000000000
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel_config.h"
-
-#include "kernel/kernel_compat_cuda.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_copy_input(float *buffer,
- CCL_FILTER_TILE_INFO,
- int4 prefilter_rect,
- int buffer_pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int itile = ytile * 3 + xtile;
- float *const in = ((float *)ccl_get_tile_buffer(itile)) +
- (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
- buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
- for (int i = 0; i < buffer_pass_stride; ++i)
- buffer[i] = in[i];
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- if (num_inputs > 0) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3;
- out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f);
- out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f);
- out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f);
- }
- if (num_inputs > 1) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- if (num_inputs > 2) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
- float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
- out[0] = in[0] * num_samples;
- out[1] = in[1] * num_samples;
- out[2] = in[2] * num_samples;
- }
-}
-
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- tile_info,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- float *mean,
- float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- float *from,
- float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_detect_outliers(float *image,
- float *variance,
- float *depth,
- float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
- CCL_FILTER_TILE_INFO,
- float *transform, int *rank,
- int4 filter_area, int4 rect,
- int radius, float pca_threshold,
- int pass_stride, int frame_stride,
- bool use_time)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int *l_rank = rank + y*filter_area.z + x;
- float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *out_image,
- float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int t,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict buffer,
- float const* __restrict__ transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- bool use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
-
-#endif
-
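
The filter wrappers above do no work of their own: they decode the launch into pixel coordinates (either via get_nlm_coords or directly from the thread/block indices), bounds-check, and offset the per-pixel storage before calling the shared filter routines. A minimal sketch of that pattern, with an illustrative payload type and body (neither is Cycles API), looks like this:

    /* Sketch of the wrapper pattern above: decode thread -> pixel, bounds-check,
     * then hand per-pixel storage to the shared routine. `Payload` and the body
     * are illustrative stand-ins, not Cycles code. */
    struct Payload {
      int rank;
      float xtwx;
    };

    extern "C" __global__ void kernel_cuda_filter_example(Payload *storage, int w, int h)
    {
      int x = blockDim.x * blockIdx.x + threadIdx.x; /* pixel column */
      int y = blockDim.y * blockIdx.y + threadIdx.y; /* pixel row */
      if (x < w && y < h) {                          /* guard partial blocks at the border */
        Payload *p = storage + y * w + x;            /* same storage_ofs = y*w + x as finalize */
        p->rank = 0;                                 /* stand-in for the real per-pixel work */
        p->xtwx = 0.0f;
      }
    }
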
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
deleted file mode 100644
index cf62b6e781e..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/cuda/kernel_cuda_image.h"
-#include "kernel/kernel_film.h"
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-#include "kernel/kernel_bake.h"
-#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-
-#ifdef __BRANCHED_PATH__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-#endif
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(&kg, buffer, sample);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->h) {
- int y = tile->y + ccl_global_id(0);
- kernel_do_adaptive_filter_x(&kg, y, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->w) {
- int x = tile->x + ccl_global_id(0);
- kernel_do_adaptive_filter_y(&kg, x, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
-{
- if(kernel_data.film.pass_adaptive_aux_buffer) {
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if(sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
- }
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_displace(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_displace_evaluate(&kg, input, output, x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_background(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_background_evaluate(&kg, input, output, x);
- }
-}
-
-#ifdef __BAKING__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
-
- if(work_index < total_work_size) {
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- KernelGlobals kg;
- kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-}
-#endif
-
-#endif
-
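
The megakernel entry points above address the render tile through a flat 1D work index. kernel_cuda_adaptive_stopping shows the decoding used when one work item corresponds to one pixel; the sketch below isolates that arithmetic (the struct keeps only the WorkTile fields the decoding touches, so it is an illustrative reduction, not the real Cycles type):

    struct MiniTile {
      int x, y, w;        /* tile origin and width */
      int offset, stride; /* render-buffer addressing */
      int pass_stride;    /* floats per pixel across all passes */
    };

    /* Map a flat work index to a pixel and return the offset of that pixel's first
     * pass float, mirroring the index math in kernel_cuda_adaptive_stopping above. */
    __device__ inline int pixel_buffer_offset(const MiniTile *tile, int work_index)
    {
      int x = tile->x + work_index % tile->w;
      int y = tile->y + work_index / tile->w;
      int index = tile->offset + x + y * tile->stride;
      return index * tile->pass_stride;
    }

kernel_cuda_adaptive_scale_samples then treats a negative stored sample count as the early-stop marker: it flips the sign back and rescales the accumulated passes by sample / count.
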
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
deleted file mode 100644
index 2e47ce2de6c..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* device data taken from CUDA occupancy calculator */
-
-/* 3.0 and 3.5 */
-#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.x, 6.x */
-#elif __CUDA_ARCH__ <= 699
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
- * registers */
-# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# else
-# define CUDA_KERNEL_MAX_REGISTERS 48
-# endif
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 7.x, 8.x */
-#elif __CUDA_ARCH__ <= 899
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
-
-/* unknown architecture */
-#else
-# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* For the split kernel, using all registers seems fastest for now, but this
- * is unlikely to be optimal once we resolve other bottlenecks. */
-
-#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
-
-/* Compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread. */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
- __launch_bounds__(threads_block_width *threads_block_width, \
- CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (threads_block_width * threads_block_width * thread_num_registers))
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-# error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
- CUDA_MULTIPROCESSOR_MAX_BLOCKS
-# error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
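
The CUDA_LAUNCH_BOUNDS macro above converts the per-architecture register budget into __launch_bounds__ arguments. Worked through for the sm_30/sm_35 numbers in this file (16-wide square blocks, 63 registers per kernel thread, 65536 registers per multiprocessor), CUDA_LAUNCH_BOUNDS(16, 63) asks the compiler for 256-thread blocks with at least 65536 / (256 * 63) = 4 resident blocks per multiprocessor:

    /* Expansion of CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
     * under the sm_30/sm_35 settings above; `example_kernel` is only a placeholder name. */
    extern "C" __global__ void
    __launch_bounds__(16 * 16 /* = 256 threads per block */,
                      65536 / (16 * 16 * 63) /* = 4 min blocks per multiprocessor */)
    example_kernel(float *out)
    {
      out[blockDim.x * blockIdx.x + threadIdx.x] = 0.0f;
    }
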
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
deleted file mode 100644
index 95ad7599cf1..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA split kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#define __SPLIT_KERNEL__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-#include "kernel/split/kernel_path_init.h"
-#include "kernel/split/kernel_scene_intersect.h"
-#include "kernel/split/kernel_lamp_emission.h"
-#include "kernel/split/kernel_do_volume.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-#include "kernel/split/kernel_indirect_background.h"
-#include "kernel/split/kernel_shader_setup.h"
-#include "kernel/split/kernel_shader_sort.h"
-#include "kernel/split/kernel_shader_eval.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-#include "kernel/split/kernel_direct_lighting.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
-#include "kernel/split/kernel_buffer_update.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#include "kernel/kernel_film.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
-{
- *size = split_data_buffer_size(NULL, num_threads);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace_data_init(
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer)
-{
- kernel_data_init(NULL,
- NULL,
- split_data_buffer,
- num_elements,
- ray_state,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- kernel_##name(NULL); \
- }
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- ccl_local type locals; \
- kernel_##name(NULL, &locals); \
- }
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-#endif
-
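
The two DEFINE_SPLIT_KERNEL_FUNCTION macros above stamp out one thin __global__ entry per split-kernel stage; the only difference is whether the stage needs a block of shared-memory locals. Expanded by hand for two of the stages listed (this is just the textual macro expansion, so the NULL kernel-globals argument and the ccl_local qualifier come straight from the macro bodies):

    /* DEFINE_SPLIT_KERNEL_FUNCTION(path_init) becomes: */
    extern "C" __global__ void
    CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS)
    kernel_cuda_path_init()
    {
      kernel_path_init(NULL);
    }

    /* DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) becomes: */
    extern "C" __global__ void
    CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS)
    kernel_cuda_queue_enqueue()
    {
      ccl_local QueueEnqueueLocals locals; /* per-workgroup scratch for this stage */
      kernel_queue_enqueue(NULL, &locals);
    }
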
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
deleted file mode 100644
index 996bc27f71b..00000000000
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL kernel entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-__kernel void kernel_ocl_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- CCL_FILTER_TILE_INFO_ARG,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- CCL_FILTER_TILE_INFO_ARG,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 prefilter_rect,
- int r)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- ccl_global float *transform,
- ccl_global int *rank,
- int4 filter_area,
- int4 rect,
- int pass_stride,
- int frame_stride,
- char use_time,
- int radius,
- float pca_threshold)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- ccl_global int *l_rank = rank + y*filter_area.z + x;
- ccl_global float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- CCL_FILTER_TILE_INFO_ARG,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
- const ccl_global float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- char use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
deleted file mode 100644
index ebdb99d4730..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#define KERNEL_NAME adaptive_adjust_samples
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
deleted file mode 100644
index 76d82d4184e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-
-#define KERNEL_NAME adaptive_filter_x
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
deleted file mode 100644
index 1e6d15ba0f2..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-
-#define KERNEL_NAME adaptive_filter_y
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
deleted file mode 100644
index 51de0059667..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-
-#define KERNEL_NAME adaptive_stopping
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_background.cl
deleted file mode 100644
index 0e600676e82..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background.cl
+++ /dev/null
@@ -1,35 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_background(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_background_evaluate(kg, input, output, x);
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl b/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
deleted file mode 100644
index 7b81e387467..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_bake(
- ccl_constant KernelData *data,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- int sx, int sy, int sw, int sh, int offset, int stride, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh) {
-#ifndef __NO_BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-#endif
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_base.cl b/intern/cycles/kernel/kernels/opencl/kernel_base.cl
deleted file mode 100644
index 1c2d89e8a92..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_base.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL base kernels entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-
-#include "kernel/kernel_film.h"
-
-
-__kernel void kernel_ocl_convert_to_byte(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_convert_to_half_float(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)
-{
- size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
- if(i < size / sizeof(float4)) {
- buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- else if(i == size / sizeof(float4)) {
- ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
-
- for(i = 0; i < size % sizeof(float4); i++) {
- *(b++) = 0;
- }
- }
-}
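
kernel_ocl_zero_buffer above clears the buffer in float4-sized chunks, one work item each, and the single work item whose flat index equals size / sizeof(float4) takes care of the trailing size % sizeof(float4) bytes. A host-side C sketch of the same chunk-plus-tail split (purely illustrative, not part of Cycles):

    #include <stddef.h>
    #include <string.h>

    /* Clear `size` bytes starting at `buffer + offset`: whole 16-byte (float4) chunks
     * first, then the remaining tail bytes, mirroring kernel_ocl_zero_buffer above. */
    static void zero_buffer_like(unsigned char *buffer, size_t size, size_t offset)
    {
      const size_t chunk = 4 * sizeof(float); /* sizeof(float4) */
      size_t whole = size / chunk;
      memset(buffer + offset, 0, whole * chunk);
      memset(buffer + offset + whole * chunk, 0, size % chunk);
    }
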
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
deleted file mode 100644
index 7125348a49f..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-
-__kernel void kernel_ocl_path_trace_data_init(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- KERNEL_BUFFER_PARAMS,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
- ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
- unsigned int num_samples, /* Total number of samples per pixel */
- ccl_global float *buffer)
-{
- kernel_data_init((KernelGlobals*)kg,
- data,
- split_data_buffer,
- num_elements,
- ray_state,
- KERNEL_BUFFER_ARGS,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl b/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
deleted file mode 100644
index 76cc36971f5..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_displace(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_displace_evaluate(kg, input, output, x);
- }
-}
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
deleted file mode 100644
index 8b1332bf013..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-
-#define KERNEL_NAME next_iteration_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
deleted file mode 100644
index bb6b8a40e8e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright 2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_NANOVDB
-/* Data type to replace `double` used in the NanoVDB headers. Cycles doesn't need doubles, and it
- * is safer and more portable to never use the double datatype on the GPU.
- * Use a special structure, so that the following is true:
- * - No unnoticed implicit casts or mathematical operations are used on a scalar 64-bit type
- *   (which rules out tricks like using `uint64_t` as a drop-in replacement for double).
- * - Padding rules match those of `double` exactly
- *   (which rules out an array of `uint8_t`). */
-typedef struct ccl_vdb_double_t {
- uint64_t i;
-} ccl_vdb_double_t;
-
-# define double ccl_vdb_double_t
-# include "nanovdb/CNanoVDB.h"
-# undef double
-#endif
-
-/* For OpenCL we do manual lookup and interpolation. */
-
-ccl_device_inline ccl_global TextureInfo *kernel_tex_info(KernelGlobals *kg, uint id)
-{
- const uint tex_offset = id
-#define KERNEL_TEX(type, name) +1
-#include "kernel/kernel_textures.h"
- ;
-
- return &((ccl_global TextureInfo *)kg->buffers[0])[tex_offset];
-}
-
-#define tex_fetch(type, info, index) \
- ((ccl_global type *)(kg->buffers[info->cl_buffer] + info->data))[(index)]
-
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
-{
- x %= width;
- if (x < 0)
- x += width;
- return x;
-}
-
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
-{
- return clamp(x, 0, width - 1);
-}
-
-ccl_device_inline float4 svm_image_texture_read(
- KernelGlobals *kg, const ccl_global TextureInfo *info, void *acc, int x, int y, int z)
-{
- const int data_offset = x + info->width * y + info->width * info->height * z;
- const int texture_type = info->data_type;
-
- /* Float4 */
- if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
- return tex_fetch(float4, info, data_offset);
- }
- /* Byte4 */
- else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
- uchar4 r = tex_fetch(uchar4, info, data_offset);
- float f = 1.0f / 255.0f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Ushort4 */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
- ushort4 r = tex_fetch(ushort4, info, data_offset);
- float f = 1.0f / 65535.f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Float */
- else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
- float f = tex_fetch(float, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- /* UShort */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
- ushort r = tex_fetch(ushort, info, data_offset);
- float f = r * (1.0f / 65535.0f);
- return make_float4(f, f, f, 1.0f);
- }
-#ifdef WITH_NANOVDB
- /* NanoVDB Float */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- float f = cnanovdb_readaccessor_getValueF((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f, f, f, 1.0f);
- }
- /* NanoVDB Float3 */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- cnanovdb_Vec3F f = cnanovdb_readaccessor_getValueF3((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f.mVec[0], f.mVec[1], f.mVec[2], 1.0f);
- }
-#endif
-#ifdef __KERNEL_CL_KHR_FP16__
- /* Half and Half4 are optional in OpenCL */
- else if (texture_type == IMAGE_DATA_TYPE_HALF) {
- float f = tex_fetch(half, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- else if (texture_type == IMAGE_DATA_TYPE_HALF4) {
- half4 r = tex_fetch(half4, info, data_offset);
- return make_float4(r.x, r.y, r.z, r.w);
- }
-#endif
- /* Byte */
- else {
- uchar r = tex_fetch(uchar, info, data_offset);
- float f = r * (1.0f / 255.0f);
- return make_float4(f, f, f, 1.0f);
- }
-}
-
-ccl_device_inline float4
-svm_image_texture_read_2d(KernelGlobals *kg, int id, void *acc, int x, int y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, 0);
-}
-
-ccl_device_inline float4
-svm_image_texture_read_3d(KernelGlobals *kg, int id, void *acc, int x, int y, int z)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- z = svm_image_texture_wrap_periodic(z, info->depth);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- z = svm_image_texture_wrap_clamp(z, info->depth);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, z);
-}
-
-ccl_device_inline float svm_image_texture_frac(float x, int *ix)
-{
- int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
- *ix = i;
- return x - (float)i;
-}
-
-#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
- { \
- u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
- u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
- u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
- u[3] = (1.0f / 6.0f) * t * t * t; \
- } \
- (void)0
-
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- if (info->interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy;
- svm_image_texture_frac(x * info->width, &ix);
- svm_image_texture_frac(y * info->height, &iy);
-
- return svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- }
- else if (info->interpolation == INTERPOLATION_LINEAR) {
- /* Bilinear interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float4 r;
- r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- r += (1.0f - ty) * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy);
- r += ty * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy + 1);
- r += ty * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy + 1);
- return r;
- }
- else {
- /* Bicubic interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float u[4], v[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y];
- r += weight * svm_image_texture_read_2d(kg, id, NULL, ix + x - 1, iy + y - 1);
- }
- }
- return r;
- }
-}
-
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->use_transform_3d) {
- Transform tfm = info->transform_3d;
- P = transform_point(&tfm, P);
- }
-
- float x = P.x;
- float y = P.y;
- float z = P.z;
-
- uint interpolation = (interp == INTERPOLATION_NONE) ? info->interpolation : interp;
-
-#ifdef WITH_NANOVDB
- cnanovdb_readaccessor acc;
- if (info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
- info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- ccl_global cnanovdb_griddata *grid =
- (ccl_global cnanovdb_griddata *)(kg->buffers[info->cl_buffer] + info->data);
- cnanovdb_readaccessor_init(&acc, cnanovdb_treedata_rootF(cnanovdb_griddata_tree(grid)));
- }
- else {
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- x *= info->width;
- y *= info->height;
- z *= info->depth;
- }
-# define NANOVDB_ACCESS_POINTER &acc
-#else
-# define NANOVDB_ACCESS_POINTER NULL
-#endif
-
- if (interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy, iz;
- svm_image_texture_frac(x, &ix);
- svm_image_texture_frac(y, &iy);
- svm_image_texture_frac(z, &iz);
-
- return svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- }
- else if (interpolation == INTERPOLATION_LINEAR) {
- /* Trilinear interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float4 r;
- r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- r += (1.0f - tz) * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz);
- r += (1.0f - tz) * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz);
- r += (1.0f - tz) * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz);
-
- r += tz * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz + 1);
- r += tz * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz + 1);
- r += tz * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz + 1);
- r += tz * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz + 1);
- return r;
- }
- else {
- /* Tricubic interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float u[4], v[4], w[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
- SET_CUBIC_SPLINE_WEIGHTS(w, tz);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int z = 0; z < 4; z++) {
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y] * w[z];
- r += weight * svm_image_texture_read_3d(
- kg, id, NANOVDB_ACCESS_POINTER, ix + x - 1, iy + y - 1, iz + z - 1);
- }
- }
- }
- return r;
- }
-#undef NANOVDB_ACCESS_POINTER
-}
-
-#undef SET_CUBIC_SPLINE_WEIGHTS
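
The SET_CUBIC_SPLINE_WEIGHTS macro above evaluates the four uniform cubic B-spline basis weights used by the bicubic and tricubic paths; for any t in [0, 1] the four weights sum to 1 (the t^3, t^2 and t coefficients cancel and the constants add up to 1/6 + 2/3 + 1/6). A small host-side check of that partition-of-unity property, written as plain C:

    #include <assert.h>
    #include <math.h>

    /* Same polynomial weights as SET_CUBIC_SPLINE_WEIGHTS above. */
    static void cubic_spline_weights(float t, float u[4])
    {
      u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f);
      u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f);
      u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f);
      u[3] = (1.0f / 6.0f) * t * t * t;
    }

    int main(void)
    {
      for (int i = 0; i <= 8; i++) {
        float u[4];
        cubic_spline_weights(i / 8.0f, u);
        assert(fabsf(u[0] + u[1] + u[2] + u[3] - 1.0f) < 1e-6f);
      }
      return 0;
    }
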
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
deleted file mode 100644
index 68ee6f1d536..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-
-#define KERNEL_NAME queue_enqueue
-#define LOCALS_TYPE QueueEnqueueLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
deleted file mode 100644
index 10d09377ba9..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_scene_intersect.h"
-
-#define KERNEL_NAME scene_intersect
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
deleted file mode 100644
index 40eaa561863..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_eval.h"
-
-#define KERNEL_NAME shader_eval
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
deleted file mode 100644
index 8c36100f762..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_setup.h"
-
-#define KERNEL_NAME shader_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
deleted file mode 100644
index bcacaa4a054..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_sort.h"
-
-__attribute__((reqd_work_group_size(64, 1, 1)))
-#define KERNEL_NAME shader_sort
-#define LOCALS_TYPE ShaderSortLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
deleted file mode 100644
index 8de250a375c..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-
-#define KERNEL_NAME shadow_blocked_ao
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
deleted file mode 100644
index 29da77022ed..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-
-#define KERNEL_NAME shadow_blocked_dl
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
deleted file mode 100644
index c3b7b09460a..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h" // PRECOMPILED
-#include "kernel/split/kernel_split_common.h" // PRECOMPILED
-
-#include "kernel/kernels/opencl/kernel_data_init.cl"
-#include "kernel/kernels/opencl/kernel_path_init.cl"
-#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
-#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
-#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
-#include "kernel/kernels/opencl/kernel_shader_setup.cl"
-#include "kernel/kernels/opencl/kernel_shader_sort.cl"
-#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
-#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
-#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
-#include "kernel/kernels/opencl/kernel_buffer_update.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
deleted file mode 100644
index e123b4cd6ec..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define KERNEL_NAME_JOIN(a, b) a##_##b
-#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
-
-__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace,
- KERNEL_NAME)(ccl_global char *kg_global,
- ccl_constant KernelData *data,
-
- ccl_global void *split_data_buffer,
- ccl_global char *ray_state,
-
- KERNEL_BUFFER_PARAMS,
-
- ccl_global int *queue_index,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pools,
- ccl_global float *buffer)
-{
-#ifdef LOCALS_TYPE
- ccl_local LOCALS_TYPE locals;
-#endif
-
- KernelGlobals *kg = (KernelGlobals *)kg_global;
-
- if (ccl_local_id(0) + ccl_local_id(1) == 0) {
- kg->data = data;
-
- kernel_split_params.queue_index = queue_index;
- kernel_split_params.use_queues_flag = use_queues_flag;
- kernel_split_params.work_pools = work_pools;
- kernel_split_params.tile.buffer = buffer;
-
- split_data_init(kg,
- &kernel_split_state,
- ccl_global_size(0) * ccl_global_size(1),
- split_data_buffer,
- ray_state);
- }
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
-
- KERNEL_NAME_EVAL(kernel, KERNEL_NAME)
- (kg
-#ifdef LOCALS_TYPE
- ,
- &locals
-#endif
- );
-}
-
-#undef KERNEL_NAME_JOIN
-#undef KERNEL_NAME_EVAL
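
The header removed above was the glue that turned each split kernel into an OpenCL entry point: every .cl file defines KERNEL_NAME (and optionally LOCALS_TYPE) and then includes this header, which token-pastes the name into the generated __kernel function. A minimal, self-contained C++ sketch of that token-pasting pattern follows; the kernel_demo/DEFINE_SPLIT_KERNEL names are illustrative only and are not Cycles code.

/* split_function_sketch.cpp -- the token-pasting pattern behind the removed
 * kernel_split_function.h, reduced to plain C++ (illustrative names only). */
#include <cstdio>

#define KERNEL_NAME_JOIN(a, b) a##_##b
#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)

/* Shared "template": expands to one function per KERNEL_NAME definition. */
#define DEFINE_SPLIT_KERNEL(body) \
  void KERNEL_NAME_EVAL(kernel_demo, KERNEL_NAME)(void) { body; }

#define KERNEL_NAME scene_intersect
DEFINE_SPLIT_KERNEL(std::puts("running scene_intersect"))
#undef KERNEL_NAME

#define KERNEL_NAME shader_eval
DEFINE_SPLIT_KERNEL(std::puts("running shader_eval"))
#undef KERNEL_NAME

int main(void)
{
  kernel_demo_scene_intersect(); /* generated by the first expansion */
  kernel_demo_shader_eval();     /* generated by the second expansion */
  return 0;
}

Each deleted .cl file above (scene_intersect, shader_eval, shader_setup, ...) is one instance of this expansion, with the real kernel body pulled in from the corresponding split header.
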
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
deleted file mode 100644
index 2b3be38df84..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-
-#define KERNEL_NAME subsurface_scatter
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index 3f9de5ab33d..8e497986dcc 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
// clang-format on
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index 76a2e41abfa..a2f9d3f759a 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index b78dc8a3a67..812c3b6e71b 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index d656723bac2..80dfbee879e 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/kernel_types.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index c5ca8616fbd..5d968ed85e0 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,7 +32,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
@@ -50,45 +50,30 @@ CCL_NAMESPACE_BEGIN
using namespace OSL;
-static ustring u_cubic("cubic");
-static ustring u_gaussian("gaussian");
-static ustring u_burley("burley");
-static ustring u_principled("principled");
+static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
static ustring u_random_walk("random_walk");
-static ustring u_principled_random_walk("principled_random_walk");
class CBSSRDFClosure : public CClosurePrimitive {
public:
Bssrdf params;
+ float ior;
ustring method;
CBSSRDFClosure()
{
- params.texture_blur = 0.0f;
- params.sharpness = 0.0f;
- params.roughness = 0.0f;
+ params.roughness = FLT_MAX;
+ params.anisotropy = 1.0f;
+ ior = 1.4f;
}
void setup(ShaderData *sd, int path_flag, float3 weight)
{
- if (method == u_cubic) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID);
- }
- else if (method == u_gaussian) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID);
- }
- else if (method == u_burley) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
- }
- else if (method == u_principled) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+ if (method == u_random_walk_fixed_radius) {
+ alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
}
else if (method == u_random_walk) {
alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID);
}
- else if (method == u_principled_random_walk) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
- }
}
void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type)
@@ -106,11 +91,10 @@ class CBSSRDFClosure : public CClosurePrimitive {
/* create one closure per color channel */
bssrdf->radius = params.radius;
bssrdf->albedo = params.albedo;
- bssrdf->texture_blur = params.texture_blur;
- bssrdf->sharpness = params.sharpness;
bssrdf->N = params.N;
bssrdf->roughness = params.roughness;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->anisotropy = clamp(params.anisotropy, 0.0f, 0.9f);
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, clamp(ior, 1.01f, 3.8f));
}
}
};
@@ -122,9 +106,9 @@ ClosureParam *closure_bssrdf_params()
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"),
CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, ior, "ior"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.anisotropy, "anisotropy"),
CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"),
CLOSURE_FINISH_PARAM(CBSSRDFClosure)};
return params;
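
The rewritten closure above drops the per-falloff branches, keeps two methods (random_walk and random_walk_fixed_radius), and adds "ior" and "anisotropy" inputs that are clamped before bssrdf_setup() is called. A tiny standalone sketch of just that clamping step is shown below; the ranges are copied from the hunk above, and the clampf helper is defined locally so the snippet compiles on its own rather than reusing the Cycles util header.

/* bssrdf_param_sketch.cpp -- the value ranges CBSSRDFClosure::alloc() enforces
 * above, shown in isolation (same arithmetic, not Cycles code). */
#include <algorithm>
#include <cstdio>

static float clampf(float x, float lo, float hi)
{
  return std::max(lo, std::min(x, hi));
}

int main(void)
{
  /* Defaults from the new constructor above. */
  float anisotropy = 1.0f;
  float ior = 1.4f;

  /* Same ranges as the diff: anisotropy -> [0.0, 0.9], IOR -> [1.01, 3.8]. */
  std::printf("anisotropy %.2f -> %.2f\n", anisotropy, clampf(anisotropy, 0.0f, 0.9f));
  std::printf("ior        %.2f -> %.2f\n", ior, clampf(ior, 1.01f, 3.8f));
  return 0;
}
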
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 7ee467a46dd..e814fcca246 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -40,10 +40,10 @@
#include "util/util_param.h"
// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_types.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_random.h"
@@ -500,7 +500,7 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
{
/* caustic options */
if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
(!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 2b7c21d0bc4..396f42080e4 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -40,22 +40,22 @@
#include "util/util_string.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_projection.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/image.h"
+
#include "kernel/kernel_differential.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_camera.h"
-#include "kernel/kernels/cpu/kernel_cpu_image.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+
#include "kernel/geom/geom.h"
#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_color.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
#include "kernel/kernel_projection.h"
-#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
// clang-format on
@@ -147,7 +147,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -155,18 +155,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
Transform tfm;
if (time == sd->time)
- tfm = sd->ob_tfm;
+ tfm = object_get_transform(kg, sd);
else
tfm = object_fetch_transform_motion_test(kg, object, time, NULL);
#else
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+ const Transform tfm = object_get_transform(kg, sd);
#endif
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -184,7 +185,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -192,18 +193,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
Transform itfm;
if (time == sd->time)
- itfm = sd->ob_itfm;
+ itfm = object_get_inverse_transform(kg, sd);
else
object_fetch_transform_motion_test(kg, object, time, &itfm);
#else
- Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ const Transform itfm = object_get_inverse_transform(kg, sd);
#endif
copy_matrix(result, itfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -218,7 +220,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -250,7 +252,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -284,21 +286,18 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-#endif
+ const Transform tfm = object_get_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -315,21 +314,18 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-#endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -341,7 +337,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -368,7 +364,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
ustring to)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -747,7 +743,7 @@ static bool set_attribute_matrix(const Transform &tfm, TypeDesc type, void *val)
return false;
}
-static bool get_primitive_attribute(KernelGlobals *kg,
+static bool get_primitive_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -808,7 +804,7 @@ static bool get_primitive_attribute(KernelGlobals *kg,
}
}
-static bool get_mesh_attribute(KernelGlobals *kg,
+static bool get_mesh_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -857,8 +853,12 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr,
}
}
-bool OSLRenderServices::get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
/* todo: turn this into hash table? */
@@ -988,8 +988,12 @@ bool OSLRenderServices::get_object_standard_attribute(
return false;
}
-bool OSLRenderServices::get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
if (name == u_path_ray_length) {
/* Ray Length */
@@ -998,38 +1002,32 @@ bool OSLRenderServices::get_background_attribute(
}
else if (name == u_path_ray_depth) {
/* Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_diffuse_depth) {
/* Diffuse Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->diffuse_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.diffuse_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_glossy_depth) {
/* Glossy Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->glossy_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.glossy_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transmission_depth) {
/* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transmission_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transparent_depth) {
/* Transparent Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transparent_bounce;
- return set_attribute_int(f, type, derivatives, val);
- }
- else if (name == u_path_transmission_depth) {
- /* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transparent_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_ndc) {
@@ -1043,8 +1041,10 @@ bool OSLRenderServices::get_background_attribute(
ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P);
if (derivatives) {
- ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx) - ndc[0];
- ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy) - ndc[0];
+ ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) -
+ ndc[0];
+ ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) -
+ ndc[0];
}
}
else {
@@ -1079,7 +1079,7 @@ bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_attribute(
ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val)
{
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int prim_type = 0;
int object;
@@ -1208,17 +1208,17 @@ bool OSLRenderServices::texture(ustring filename,
OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
bool status = false;
switch (texture_type) {
case OSLTextureHandle::BEVEL: {
/* Bevel shader hack. */
if (nchannels >= 3) {
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
- float3 N = svm_bevel(kernel_globals, sd, state, radius, num_samples);
+ float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
result[0] = N.x;
result[1] = N.y;
result[2] = N.z;
@@ -1228,7 +1228,7 @@ bool OSLRenderServices::texture(ustring filename,
}
case OSLTextureHandle::AO: {
/* AO shader hack. */
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
float3 N = make_float3(dsdx, dtdx, dsdy);
@@ -1242,7 +1242,7 @@ bool OSLRenderServices::texture(ustring filename,
if ((int)options.tblur) {
flags |= NODE_AO_GLOBAL_RADIUS;
}
- result[0] = svm_ao(kernel_globals, sd, N, state, radius, num_samples, flags);
+ result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
status = true;
break;
}
@@ -1355,7 +1355,7 @@ bool OSLRenderServices::texture3d(ustring filename,
case OSLTextureHandle::SVM: {
/* Packed texture. */
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
int slot = handle->svm_slot;
float3 P_float3 = make_float3(P.x, P.y, P.z);
float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE);
@@ -1377,7 +1377,7 @@ bool OSLRenderServices::texture3d(ustring filename,
if (handle && handle->oiio_handle) {
if (texture_thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
texture_thread_info = tdata->oiio_thread_info;
}
@@ -1462,7 +1462,7 @@ bool OSLRenderServices::environment(ustring filename,
if (handle && handle->oiio_handle) {
if (thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
thread_info = tdata->oiio_thread_info;
}
@@ -1600,10 +1600,14 @@ bool OSLRenderServices::trace(TraceOpt &options,
}
/* ray differentials */
- ray.dP.dx = TO_FLOAT3(dPdx);
- ray.dP.dy = TO_FLOAT3(dPdy);
- ray.dD.dx = TO_FLOAT3(dRdx);
- ray.dD.dy = TO_FLOAT3(dRdy);
+ differential3 dP;
+ dP.dx = TO_FLOAT3(dPdx);
+ dP.dy = TO_FLOAT3(dPdy);
+ ray.dP = differential_make_compact(dP);
+ differential3 dD;
+ dD.dx = TO_FLOAT3(dRdx);
+ dD.dy = TO_FLOAT3(dRdy);
+ ray.dD = differential_make_compact(dD);
/* allocate trace data */
OSLTraceData *tracedata = (OSLTraceData *)sg->tracedata;
@@ -1613,7 +1617,7 @@ bool OSLRenderServices::trace(TraceOpt &options,
tracedata->hit = false;
tracedata->sd.osl_globals = sd->osl_globals;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
/* Can't raytrace from shaders like displacement, before BVH exists. */
if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
@@ -1646,11 +1650,11 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
}
else {
ShaderData *sd = &tracedata->sd;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (!tracedata->setup) {
/* lazy shader data setup */
- shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray);
+ shader_setup_from_ray(kg, sd, &tracedata->ray, &tracedata->isect);
tracedata->setup = true;
}
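
One recurring change in the file above is that ray differentials are no longer stored as two difference vectors per quantity: trace() now builds a differential3 locally and passes it through differential_make_compact(), which collapses dx/dy into a single scalar footprint (which is also why the NDC derivatives earlier in the file offset ray_P by a scalar ray_dP). The sketch below illustrates that idea only; the averaging used here is an assumption for illustration, and the exact reduction Cycles uses is defined in kernel_differential.h, which is not part of this diff.

/* compact_differential_sketch.cpp -- the idea behind differential_make_compact()
 * as used in the trace() hunk above: collapse the dx/dy difference vectors into
 * one scalar footprint. The 0.5*(len+len) reduction is an assumption for
 * illustration, not necessarily the exact Cycles formula. */
#include <cmath>
#include <cstdio>

struct float3 { float x, y, z; };

static float len(const float3 &v)
{
  return std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
}

/* differential3 holds one difference vector per screen axis. */
struct differential3 { float3 dx, dy; };

/* Assumed reduction: average the magnitudes of the two axis differentials. */
static float differential_make_compact_sketch(const differential3 &d)
{
  return 0.5f * (len(d.dx) + len(d.dy));
}

int main(void)
{
  differential3 dP = {{0.01f, 0.0f, 0.0f}, {0.0f, 0.02f, 0.0f}};
  std::printf("compact footprint: %f\n", differential_make_compact_sketch(dP)); /* 0.015 */
  return 0;
}
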
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 891b9172dd4..58accb46e7d 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -250,10 +250,18 @@ class OSLRenderServices : public OSL::RendererServices {
void *data) override;
#endif
- static bool get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
- static bool get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
+ static bool get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
+ static bool get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
static ustring u_distance;
static ustring u_index;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 389c854c495..880ef635c76 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -17,14 +17,16 @@
#include <OSL/oslexec.h>
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/geom/geom_object.h"
+#include "kernel/integrator/integrator_state.h"
+
#include "kernel/osl/osl_closures.h"
#include "kernel/osl/osl_globals.h"
#include "kernel/osl/osl_services.h"
@@ -39,9 +41,7 @@ CCL_NAMESPACE_BEGIN
/* Threads */
-void OSLShader::thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals)
+void OSLShader::thread_init(KernelGlobals *kg, OSLGlobals *osl_globals)
{
/* no osl used? */
if (!osl_globals->use) {
@@ -87,8 +87,11 @@ void OSLShader::thread_free(KernelGlobals *kg)
/* Globals */
-static void shaderdata_to_shaderglobals(
- KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, OSLThreadData *tdata)
+static void shaderdata_to_shaderglobals(const KernelGlobals *kg,
+ ShaderData *sd,
+ const IntegratorStateCPU *state,
+ int path_flag,
+ OSLThreadData *tdata)
{
OSL::ShaderGlobals *globals = &tdata->globals;
@@ -171,7 +174,10 @@ static void flatten_surface_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -276,7 +282,10 @@ static void flatten_background_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -331,7 +340,10 @@ static void flatten_volume_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -354,7 +366,9 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
/* Displacement */
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state)
+void OSLShader::eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -377,7 +391,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *
/* Attributes */
-int OSLShader::find_attribute(KernelGlobals *kg,
+int OSLShader::find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc)
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index a4fa24d0a90..f1f17b141eb 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -37,6 +37,7 @@ class Scene;
struct ShaderClosure;
struct ShaderData;
+struct IntegratorStateCPU;
struct differential3;
struct KernelGlobals;
@@ -49,19 +50,28 @@ class OSLShader {
static void register_closures(OSLShadingSystem *ss);
/* per thread data */
- static void thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals);
+ static void thread_init(KernelGlobals *kg, OSLGlobals *osl_globals);
static void thread_free(KernelGlobals *kg);
/* eval */
- static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state);
+ static void eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd);
/* attributes */
- static int find_attribute(KernelGlobals *kg,
+ static int find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 23949f406c7..55afb892d36 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -18,11 +18,13 @@
#include "stdcycles.h"
shader node_principled_bsdf(string distribution = "Multiscatter GGX",
- string subsurface_method = "burley",
+ string subsurface_method = "random_walk",
color BaseColor = color(0.8, 0.8, 0.8),
float Subsurface = 0.0,
vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
color SubsurfaceColor = color(0.7, 0.1, 0.1),
+ float SubsurfaceIOR = 1.4,
+ float SubsurfaceAnisotropy = 0.0,
float Metallic = 0.0,
float Specular = 0.5,
float SpecularTint = 0.0,
@@ -59,22 +61,17 @@ shader node_principled_bsdf(string distribution = "Multiscatter GGX",
if (diffuse_weight > 1e-5) {
if (Subsurface > 1e-5) {
color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
- if (subsurface_method == "burley") {
- BSDF = mixed_ss_base_color * bssrdf("principled",
- Normal,
- Subsurface * SubsurfaceRadius,
- SubsurfaceColor,
- "roughness",
- Roughness);
- }
- else {
- BSDF = mixed_ss_base_color * bssrdf("principled_random_walk",
- Normal,
- Subsurface * SubsurfaceRadius,
- mixed_ss_base_color,
- "roughness",
- Roughness);
- }
+
+ BSDF = mixed_ss_base_color * bssrdf(subsurface_method,
+ Normal,
+ Subsurface * SubsurfaceRadius,
+ mixed_ss_base_color,
+ "roughness",
+ Roughness,
+ "ior",
+ SubsurfaceIOR,
+ "anisotropy",
+ SubsurfaceAnisotropy);
}
else {
BSDF = BaseColor * principled_diffuse(Normal, Roughness);
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index b1e854150ab..f55e38c54ff 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -19,27 +19,12 @@
shader node_subsurface_scattering(color Color = 0.8,
float Scale = 1.0,
vector Radius = vector(0.1, 0.1, 0.1),
- float TextureBlur = 0.0,
- float Sharpness = 0.0,
- string falloff = "cubic",
+ float IOR = 1.4,
+ float Anisotropy = 0.0,
+ string method = "random_walk",
normal Normal = N,
output closure color BSSRDF = 0)
{
- if (falloff == "gaussian")
- BSSRDF = Color *
- bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else if (falloff == "cubic")
- BSSRDF = Color * bssrdf("cubic",
- Normal,
- Scale * Radius,
- Color,
- "texture_blur",
- TextureBlur,
- "sharpness",
- Sharpness);
- else if (falloff == "burley")
- BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else
- BSSRDF = Color *
- bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
+ BSSRDF = Color *
+ bssrdf(method, Normal, Scale * Radius, Color, "ior", IOR, "anisotropy", Anisotropy);
}
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
deleted file mode 100644
index 437a5c9581b..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
- }
- }
-}
-
-CCL_NAMESPACE_END
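
The kernel deleted above renormalized pixels that adaptive sampling stopped early: a converged pixel stores its sample-count pass negated, and the accumulated passes are then scaled so every pixel looks as if it received the full sample count. A minimal scalar sketch of that bookkeeping, reduced to one pixel and one accumulated channel, is given below (illustrative only, not the GPU code).

/* adaptive_adjust_sketch.cpp -- per-pixel renormalization performed by the
 * removed kernel_adaptive_adjust_samples(), reduced to one pixel and one
 * accumulated channel (illustrative only). */
#include <cstdio>

int main(void)
{
  const float full_sample_count = 128.0f; /* start_sample + num_samples */

  /* A converged pixel stores its sample count negated; this one stopped at 64. */
  float pass_sample_count = -64.0f;
  float accumulated = 32.0f; /* sum of 64 samples of value 0.5 */

  if (pass_sample_count < 0.0f) {
    pass_sample_count = -pass_sample_count;
    const float sample_multiplier = full_sample_count / pass_sample_count;
    if (sample_multiplier != 1.0f) {
      accumulated *= sample_multiplier; /* now comparable to 128-sample pixels */
    }
  }

  std::printf("adjusted sum: %.1f (mean %.3f over %g samples)\n",
              accumulated, accumulated / full_sample_count, full_sample_count);
  return 0;
}
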
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
deleted file mode 100644
index 93f41f7ced4..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int y = kernel_split_params.tile.y + pixel_index;
- kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
deleted file mode 100644
index eca53d079ec..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index;
- kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
- }
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
deleted file mode 100644
index c8eb1ebd705..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_stopping.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- kernel_do_adaptive_stopping(kg,
- buffer,
- kernel_split_params.tile.start_sample +
- kernel_split_params.tile.num_samples - 1);
- }
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
deleted file mode 100644
index 45f5037d321..00000000000
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-/* sets up the various state needed to do an indirect loop */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* save a copy of the state to restore later */
-# define BRANCHED_STORE(name) branched_state->name = kernel_split_state.name[ray_index];
-
- BRANCHED_STORE(path_state);
- BRANCHED_STORE(throughput);
- BRANCHED_STORE(ray);
- BRANCHED_STORE(isect);
- BRANCHED_STORE(ray_state);
-
- *kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(branched_state_sd, ray_index)->closure[i] =
- kernel_split_sd(sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_STORE
-
- /* Set loop counters to initial position. */
- branched_state->next_closure = 0;
- branched_state->next_sample = 0;
-}
-
-/* ends an indirect loop and restores the previous state */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* restore state */
-# define BRANCHED_RESTORE(name) kernel_split_state.name[ray_index] = branched_state->name;
-
- BRANCHED_RESTORE(path_state);
- BRANCHED_RESTORE(throughput);
- BRANCHED_RESTORE(ray);
- BRANCHED_RESTORE(isect);
- BRANCHED_RESTORE(ray_state);
-
- *kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, ray_index)->closure[i] =
- kernel_split_sd(branched_state_sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_RESTORE
-
- /* leave indirect loop */
- REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
-}
-
-ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg,
- int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- kernel_split_params.queue_index);
-
- if (!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
- return false;
- }
-
-# define SPLIT_DATA_ENTRY(type, name, num) \
- if (num) { \
- kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \
- }
- SPLIT_DATA_ENTRIES_BRANCHED_SHARED
-# undef SPLIT_DATA_ENTRY
-
- *kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i];
- }
-
- kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
- kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
- kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
-
- path_radiance_init(kg, inactive_L);
- path_radiance_copy_indirect(inactive_L, L);
-
- ray_state[inactive_ray] = RAY_REGENERATED;
- ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
- ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
-
- atomic_fetch_and_inc_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[ray_index].shared_sample_count);
-
- return true;
-}
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
- KernelGlobals *kg,
- int ray_index,
- float num_samples_adjust,
- ShaderData *saved_sd,
- bool reset_path_state,
- bool wait_for_shared)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = saved_sd;
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- float3 throughput = branched_state->throughput;
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
-
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (ps->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = branched_state->next_closure; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSDF(sc->type))
- continue;
- /* transparency is not handled here, but in outer loop */
- if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
- continue;
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- if (reset_path_state) {
- *ps = branched_state->path_state;
- }
-
- ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = throughput;
-
- ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, tp, ps, &L->state, bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps->rng_hash = branched_state->path_state.rng_hash;
-
- /* update state for next iteration */
- branched_state->next_closure = i;
- branched_state->next_sample = j + 1;
-
- /* start the indirect path */
- *tp *= num_samples_inv;
-
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
-
- return true;
- }
-
- branched_state->next_sample = 0;
- }
-
- branched_state->next_closure = sd->num_closure;
-
- if (wait_for_shared) {
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
- }
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
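
The indirect-loop helpers deleted above save and restore per-ray fields through a pair of one-line macros (BRANCHED_STORE / BRANCHED_RESTORE) so the field list only has to be written once in each direction. The same pattern in a self-contained C++ sketch is shown below; the RayFields/SavedState types and their members are invented for the sketch.

/* branched_store_sketch.cpp -- the macro save/restore pattern used by the
 * removed kernel_split_branched_path_indirect_loop_* helpers (field names here
 * are invented for the sketch). */
#include <cassert>

struct RayFields { float throughput; int bounce; };
struct SavedState { float throughput; int bounce; };

int main(void)
{
  RayFields live = {1.0f, 0};
  SavedState saved;

  /* Save a copy of the state to restore later. */
#define BRANCHED_STORE(name) saved.name = live.name;
  BRANCHED_STORE(throughput)
  BRANCHED_STORE(bounce)
#undef BRANCHED_STORE

  live.throughput = 0.25f; /* ...the indirect loop mutates the live state... */
  live.bounce = 3;

  /* Restore the previous state when the loop ends. */
#define BRANCHED_RESTORE(name) live.name = saved.name;
  BRANCHED_RESTORE(throughput)
  BRANCHED_RESTORE(bounce)
#undef BRANCHED_RESTORE

  assert(live.throughput == 1.0f && live.bounce == 0);
  return 0;
}
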
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
deleted file mode 100644
index b96feca582f..00000000000
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of rays that hit the background (sceneintersect
- * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
- * accumulated radiance in the output buffer. This kernel also takes care of
- * rays that have been determined to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel.
- * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
- * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- */
-ccl_device void kernel_buffer_update(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (ray_index == 0) {
- /* We will empty this queue in this kernel. */
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
- char enqueue_flag = 0;
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- bool ray_was_updated = false;
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ray_was_updated = true;
- uint sample = state->sample;
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- /* accumulate result in output buffer */
- kernel_write_result(kg, buffer, sample, L);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
-
- if (kernel_data.film.cryptomatte_passes) {
- /* Make sure no thread is writing to the buffers. */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- /* We have completed current work; So get next work */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* If work is invalid, this means no more work is available and the thread may exit */
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
-
- if (ray->t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- *throughput = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, L);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- state,
- rng_hash,
- sample,
- ray);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- enqueue_flag = 1;
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
- }
- }
- }
-
-  /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-   * these rays will be made active during the next scene-intersect kernel.
-   */
- enqueue_ray_index_local(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
deleted file mode 100644
index 2f83a10316d..00000000000
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel initializes structures needed in the path-iteration kernels.
- *
- * Note on queues:
- * All slots in the queues are initialized to the queue-empty slot, and
- * the number of elements in each queue is initialized to 0.
- */
-
-#ifndef __KERNEL_CPU__
-ccl_device void kernel_data_init(
-#else
-void KERNEL_FUNCTION_FULL_NAME(data_init)(
-#endif
- KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
-
-#ifdef __KERNEL_OPENCL__
- KERNEL_BUFFER_PARAMS,
-#endif
-
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues
- to fetch ray index */
- ccl_global unsigned int *work_pools, /* Work pool for each work group */
- unsigned int num_samples,
- ccl_global float *buffer)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, data_init);
-#else
-
-# ifdef __KERNEL_OPENCL__
- kg->data = data;
-# endif
-
- kernel_split_params.tile.x = sx;
- kernel_split_params.tile.y = sy;
- kernel_split_params.tile.w = sw;
- kernel_split_params.tile.h = sh;
-
- kernel_split_params.tile.start_sample = start_sample;
- kernel_split_params.tile.num_samples = num_samples;
-
- kernel_split_params.tile.offset = offset;
- kernel_split_params.tile.stride = stride;
-
- kernel_split_params.tile.buffer = buffer;
-
- kernel_split_params.total_work_size = sw * sh * num_samples;
-
- kernel_split_params.work_pools = work_pools;
-
- kernel_split_params.queue_index = Queue_index;
- kernel_split_params.queue_size = queuesize;
- kernel_split_params.use_queues_flag = use_queues_flag;
-
- split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
-
-# ifdef __KERNEL_OPENCL__
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-# endif
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- /* Initialize queue data and queue index. */
- if (thread_index < queuesize) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- }
-
- if (thread_index == 0) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- Queue_index[i] = 0;
- }
-
-    /* The scene-intersect kernel should not use the queues the very first
-     * time, since they would still be empty.
-     */
- *use_queues_flag = 0;
- }
-#endif /* KERNEL_STUB */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
deleted file mode 100644
index 3be2b35812f..00000000000
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the direct lighting logic.
- * However, the "shadow ray cast" part of direct lighting is handled
- * in the next kernel.
- *
- * This kernel determines the rays for which a shadow_blocked() function
- * associated with direct lighting should be executed. Those rays for which
- * a shadow_blocked() function for direct lighting must be executed are
- * marked with the flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS.
- *
- * Note on queues:
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
- * and processes only rays of state RAY_ACTIVE. If a ray needs to execute
- * the corresponding shadow_blocked part after direct lighting, the ray is
- * marked with the RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called:
- * - The state of the QUEUE_ACTIVE_AND_REGENERATED_RAYS and
- *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queues is the same before and after
- *   this kernel call.
- * - Before this kernel call, the QUEUE_SHADOW_RAY_CAST_DL_RAYS queue is
- *   empty; after the call it is filled with the rays for which a
- *   shadow_blocked function must be executed.
- */
-ccl_device void kernel_direct_lighting(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- char enqueue_flag = 0;
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- /* direct lighting */
-#ifdef __EMISSION__
- bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL));
-
-# ifdef __BRANCHED_PATH__
- if (flag && kernel_data.integrator.branched) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __BRANCHED_PATH__ */
-
-# ifdef __SHADOW_TRICKS__
- if (flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __SHADOW_TRICKS__ */
-
- if (flag) {
- /* Sample illumination from lights to find path contribution. */
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_state_rng_light_termination(kg, state);
-
- LightSample ls;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- Ray light_ray;
- light_ray.time = sd->time;
-
- BsdfEval L_light;
- bool is_lamp;
- if (direct_emission(kg,
- sd,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &ls,
- state,
- &light_ray,
- &L_light,
- &is_lamp,
- terminate)) {
- /* Write intermediate data to global memory to access from
- * the next kernel.
- */
- kernel_split_state.light_ray[ray_index] = light_ray;
- kernel_split_state.bsdf_eval[ray_index] = L_light;
- kernel_split_state.is_lamp[ray_index] = is_lamp;
- /* Mark ray state for next shadow kernel. */
- enqueue_flag = 1;
- }
- }
- }
-#endif /* __EMISSION__ */
- }
-
-#ifdef __EMISSION__
- /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-
-#ifdef __BRANCHED_PATH__
-  /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays. This is the last kernel before
-   * next_iteration_setup that uses local atomics, so we do this here.
-   */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_LIGHT_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
deleted file mode 100644
index 1775e870f07..00000000000
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
-
-ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- Ray volume_ray = branched_state->ray;
- volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ?
- branched_state->isect.t :
- FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack);
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
- *ps = branched_state->path_state;
-
- ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
- *pray = branched_state->ray;
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = branched_state->throughput * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(ps, j, num_samples);
-
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, ps, sd, &volume_ray, L, tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
-
- /* indirect light bounce */
- if (!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
- continue;
- }
-
- /* start the indirect path */
- branched_state->next_closure = 0;
- branched_state->next_sample = j + 1;
-
-      /* Attempting to share too many samples is slow for volumes, as it causes us
-       * to loop here more and make many calls to kernel_volume_integrate, which
-       * evaluates shaders. The many expensive shader evaluations cause the workload
-       * to become unbalanced and many threads to become idle in this kernel.
-       * Limiting the number of shared samples here helps quite a lot.
-       */
- if (branched_state->shared_sample_count < 2) {
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
- }
-
- return true;
- }
-# endif
- }
-
- branched_state->next_sample = num_samples;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- /* todo: avoid this calculation using decoupled ray marching */
- float3 throughput = kernel_split_state.throughput[ray_index];
- kernel_volume_shadow(
- kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
- kernel_split_state.throughput[ray_index] = throughput;
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __VOLUME__ */
-
-ccl_device void kernel_do_volume(KernelGlobals *kg)
-{
-#ifdef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-# ifdef __BRANCHED_PATH__
- kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
-# endif /* __BRANCHED_PATH__ */
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (*kernel_split_params.use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- bool hit = !IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
- /* volume attenuation, emission, scatter */
- if (state->volume_stack[0].shader != SHADER_NONE) {
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif /* __BRANCHED_PATH__ */
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_path_end(kg, ray_index);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
- }
- }
-
-# ifdef __BRANCHED_PATH__
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_VOLUME_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __VOLUME__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
deleted file mode 100644
index 745313f89f1..00000000000
--- a/intern/cycles/kernel/split/kernel_enqueue_inactive.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
-#ifdef __BRANCHED_PATH__
- /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- char enqueue_flag = 0;
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
- enqueue_flag = 1;
- }
-
- enqueue_ray_index_local(ray_index,
- QUEUE_INACTIVE_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
deleted file mode 100644
index 61722840b0b..00000000000
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the logic to process "material of type holdout",
- * indirect primitive emission, BSDF blurring, probabilistic path termination
- * and AO.
- *
- * This kernel determines the rays for which a shadow_blocked() function
- * associated with AO should be executed. Those rays for which a
- * shadow_blocked() function for AO must be executed are marked with the flag
- * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS.
- *
- * The ray state of rays terminated in this kernel is changed to RAY_UPDATE_BUFFER.
- *
- * Note on Queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
- * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
- * been changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
- * flag RAY_SHADOW_RAY_CAST_AO
- */
-
-ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
- KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- locals->queue_atomics_bg = 0;
- locals->queue_atomics_ao = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
-#ifdef __AO__
- char enqueue_flag = 0;
-#endif
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global PathState *state = 0x0;
- float3 throughput;
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- throughput = kernel_split_state.throughput[ray_index];
- state = &kernel_split_state.path_state[ray_index];
-
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, buffer)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-      /* Path termination. This is a strange place to put the termination; it's
-       * mainly due to the mixed-in MIS that we use. It gives too many unneeded
-       * shader evaluations; we only need emission if we are going to terminate.
-       */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- kernel_split_path_end(kg, ray_index);
- }
- else if (probability < 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability) {
- kernel_split_path_end(kg, ray_index);
- }
- else {
- kernel_split_state.throughput[ray_index] = throughput / probability;
- }
- }
-
-#ifdef __DENOISING_FEATURES__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- kernel_update_denoising_features(kg, sd, state, L);
- }
-#endif
- }
-
-#ifdef __AO__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- enqueue_flag = 1;
- }
- }
-#endif /* __AO__ */
- }
-
-#ifdef __AO__
- /* Enqueue to-shadow-ray-cast rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- &locals->queue_atomics_ao,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
deleted file mode 100644
index 6d500650cc0..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_background(KernelGlobals *kg)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int ray_index;
-
- if (kernel_data.integrator.ao_bounces != INT_MAX) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- if (path_state_ao_bounce(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
- }
- }
-
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- kernel_path_background(kg, state, ray, throughput, sd, buffer, L);
- kernel_split_path_end(kg, ray_index);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
deleted file mode 100644
index 3f48f8d6f56..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index;
- get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
-    /* Trace indirect subsurface rays by restarting the loop. This uses less
-     * stack memory than invoking kernel_path_indirect.
-     */
- if (ss_indirect->num_rays) {
- kernel_path_subsurface_setup_indirect(kg, ss_indirect, state, ray, L, throughput);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
deleted file mode 100644
index 7ecb099208d..00000000000
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
- * We will empty the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- */
-ccl_device void kernel_lamp_emission(KernelGlobals *kg)
-{
-#ifndef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- }
-#endif
- /* Fetch use_queues_flag. */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
-#ifndef __VOLUME__
- 1
-#else
- 0
-#endif
- );
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- float3 throughput = kernel_split_state.throughput[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
deleted file mode 100644
index 320f6a414bf..00000000000
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of setting up the ray for the next iteration of
- * path-iteration and accumulating radiance corresponding to AO and
- * direct lighting.
- *
- * The ray state of rays terminated in this kernel is changed
- * to RAY_UPDATE_BUFFER.
- *
- * Note on queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach the RAY_UPDATE_BUFFER state. These rays are enqueued into the
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. They will still be present
- * in the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray state
- * has been changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
- */
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
-}
-
-ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index)
-{
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
- /* continue in case of transparency */
- *throughput *= shader_bsdf_transparency(kg, sd);
-
- if (is_zero(*throughput)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
-
- /* Update Path State */
- path_state_next(kg, state, LABEL_TRANSPARENT);
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
- }
-# endif
-
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->t -= sd->ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD.dx = -sd->dI.dx;
- ray->dD.dy = -sd->dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif /* __VOLUME__ */
-}
-#endif /* __BRANCHED_PATH__ */
-
-ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
-    /* If we are here, it means that the scene-intersect kernel has already
-     * been executed at least once. From now on, the scene-intersect kernel
-     * may operate on queues to fetch the ray index.
-     */
- *kernel_split_params.use_queues_flag = 1;
-
-    /* Reset the queue indices of the QUEUE_SHADOW_RAY_CAST_AO_RAYS and
-     * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues, which were emptied during the
-     * previous kernel.
-     */
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __VOLUME__
- /* Reactivate only volume rays here, most surface work was skipped. */
- if (IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
- }
-#endif
-
- bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
- if (active) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- /* Compute direct lighting and next bounce. */
- if (!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
- kernel_split_path_end(kg, ray_index);
- }
-#ifdef __BRANCHED_PATH__
- }
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- else {
- kernel_split_branched_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-#endif /* __BRANCHED_PATH__ */
- }
-
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#ifdef __BRANCHED_PATH__
- /* iter loop */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
- }
-
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_LIGHT_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-
-# ifdef __VOLUME__
- /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_VOLUME_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-# endif /* __VOLUME__ */
-
-# ifdef __SUBSURFACE__
- /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-# endif /* __SUBSURFACE__ */
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
deleted file mode 100644
index c686f46a0cd..00000000000
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel initializes structures needed in the path-iteration kernels.
- * This is the first kernel in the ray-tracing logic.
- *
- * The ray state of rays outside the tile boundary will be marked RAY_INACTIVE.
- */
-ccl_device void kernel_path_init(KernelGlobals *kg)
-{
- int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
-  /* This is the first assignment to ray_state,
-   * so we don't use the ASSIGN_RAY_STATE macro.
-   */
- kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
-
- /* Get work. */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* No more work, mark ray as inactive */
- kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
-
- return;
- }
-
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &kernel_split_state.ray[ray_index]);
-
- if (kernel_split_state.ray[ray_index].t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, &kernel_split_state.path_radiance[ray_index]);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &kernel_split_state.path_state[ray_index],
- rng_hash,
- sample,
- &kernel_split_state.ray[ray_index]);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- }
- else {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
deleted file mode 100644
index 2db87f7a671..00000000000
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel enqueues rays of different ray state into their
- * appropriate queues:
- *
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel are enqueued in
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in
- *    path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of the queues during other calls to this kernel:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
- * and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
-ccl_device void kernel_queue_enqueue(KernelGlobals *kg, ccl_local_param QueueEnqueueLocals *locals)
-{
- /* We have only 2 cases (Hit/Not-Hit) */
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (lidx == 0) {
- locals->queue_atomics[0] = 0;
- locals->queue_atomics[1] = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int queue_number = -1;
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
- queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- }
- else if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
- queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- }
-
- unsigned int my_lqidx;
- if (queue_number != -1) {
- my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (lidx == 0) {
- locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(
- QUEUE_ACTIVE_AND_REGENERATED_RAYS, locals->queue_atomics, kernel_split_params.queue_index);
- locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- locals->queue_atomics,
- kernel_split_params.queue_index);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- unsigned int my_gqidx;
- if (queue_number != -1) {
- my_gqidx = get_global_queue_index(
- queue_number, kernel_split_params.queue_size, my_lqidx, locals->queue_atomics);
- kernel_split_state.queue_data[my_gqidx] = ray_index;
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
deleted file mode 100644
index 9ac95aafd2f..00000000000
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the scene_intersect function.
- *
- * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
- * This kernel processes rays of ray state RAY_ACTIVE.
- * This kernel determines the rays that have hit the background and changes
- * their ray state to RAY_HIT_BACKGROUND.
- */
-ccl_device void kernel_scene_intersect(KernelGlobals *kg)
-{
- /* Fetch use_queues_flag */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- /* All regenerated rays become active here */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
-#ifdef __BRANCHED_PATH__
- if (kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
- kernel_split_path_end(kg, ray_index);
- }
- else
-#endif /* __BRANCHED_PATH__ */
- {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- }
- }
-
- if (!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- return;
- }
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- Intersection isect;
- const int last_object = state->bounce > 0 ?
- intersection_get_object(kg, &kernel_split_state.isect[ray_index]) :
- OBJECT_NONE;
- bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L, last_object);
- kernel_split_state.isect[ray_index] = isect;
-
- if (!hit) {
-    /* Change the state of rays that hit the background;
-     * these rays undergo special processing in the
-     * buffer-update kernel.
-     */
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
deleted file mode 100644
index c760a2b2049..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel evaluates the ShaderData structure using the values computed
- * by the previous kernels.
- */
-ccl_device void kernel_shader_eval(KernelGlobals *kg)
-{
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-  /* Sorting on the CUDA split kernel is not implemented. */
-#ifdef __KERNEL_CUDA__
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
-#else
- int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
-#endif
- if (ray_index >= queue_index) {
- return;
- }
- ray_index = get_ray_index(kg,
- ray_index,
-#ifdef __KERNEL_CUDA__
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-#else
- QUEUE_SHADER_SORTED_RAYS,
-#endif
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, buffer, state->flag);
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- shader_merge_closures(kernel_split_sd(sd, ray_index));
- }
- else
-#endif
- {
- shader_prepare_closures(kernel_split_sd(sd, ray_index), state);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
deleted file mode 100644
index 551836d1653..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_setup.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel sets up the ShaderData structure from the values computed
- * by the previous kernels.
- *
- * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
- * in the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- */
-ccl_device void kernel_shader_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (ray_index < queue_index) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
- }
- else {
- ray_index = QUEUE_EMPTY_SLOT;
- }
-
- char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 :
- 0;
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
- /* Continue on with shader evaluation. */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- Intersection isect = kernel_split_state.isect[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- shader_setup_from_ray(kg, sd, &isect, &ray);
-
-#ifdef __VOLUME__
- if (sd->flag & SD_HAS_ONLY_VOLUME) {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME);
- }
-#endif
- }
-}
-
-CCL_NAMESPACE_END
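
The removed kernel_shader_setup above relies on the split-kernel enqueue pattern: each thread computes an enqueue flag, the flags are compacted through a workgroup-local atomic, and a single global atomic reserves the output range in the queue. A minimal single-threaded C++ sketch of the same idea is given below; the function and variable names are illustrative and not part of the Cycles API.

#include <atomic>
#include <cstddef>
#include <vector>

/* Serial stand-in for what enqueue_ray_index_local does per workgroup:
 * gather flagged ray indices locally, reserve a contiguous range in the
 * global queue with one atomic add, then copy the local indices into it.
 * queue_data is assumed to be large enough for all enqueued indices. */
static void enqueue_flagged_rays(const std::vector<int> &ray_indices,
                                 const std::vector<char> &enqueue_flags,
                                 std::vector<int> &queue_data,
                                 std::atomic<int> &queue_index)
{
  std::vector<int> local; /* plays the role of local memory plus the local atomic */
  for (size_t i = 0; i < ray_indices.size(); i++) {
    if (enqueue_flags[i]) {
      local.push_back(ray_indices[i]);
    }
  }
  const int start = queue_index.fetch_add(static_cast<int>(local.size()));
  for (size_t i = 0; i < local.size(); i++) {
    queue_data[start + i] = local[i];
  }
}
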
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
deleted file mode 100644
index 95d33a42014..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local_param ShaderSortLocals *locals)
-{
-#ifndef __KERNEL_CUDA__
- int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (tid == 0) {
- kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
- }
-
- uint offset = (tid / SHADER_SORT_LOCAL_SIZE) * SHADER_SORT_BLOCK_SIZE;
- if (offset >= qsize) {
- return;
- }
-
- int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
- uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
- ccl_local uint *local_value = &locals->local_value[0];
- ccl_local ushort *local_index = &locals->local_index[0];
-
- /* copy to local memory */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint add = input + idx;
- uint value = (~0);
- if (idx < qsize) {
- int ray_index = kernel_split_state.queue_data[add];
- bool valid = (ray_index != QUEUE_EMPTY_SLOT) &&
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- if (valid) {
- value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK;
- }
- }
- local_value[i + lid] = value;
- local_index[i + lid] = i + lid;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* skip sorting for cpu split kernel */
-# ifdef __KERNEL_OPENCL__
-
- /* bitonic sort */
- for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
- for (uint inc = length; inc > 0; inc >>= 1) {
- for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
- uint i = lid + ii;
- bool direction = ((i & (length << 1)) != 0);
- uint j = i ^ inc;
- ushort ioff = local_index[i];
- ushort joff = local_index[j];
- uint iKey = local_value[ioff];
- uint jKey = local_value[joff];
- bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
- bool swap = smaller ^ (j < i) ^ direction;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- local_index[i] = (swap) ? joff : ioff;
- local_index[j] = (swap) ? ioff : joff;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- }
- }
- }
-# endif /* __KERNEL_OPENCL__ */
-
- /* copy to destination */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint lidx = local_index[i + lid];
- uint outi = output + idx;
- uint ini = input + offset + lidx;
- uint value = local_value[lidx];
- if (idx < qsize) {
- kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT :
- kernel_split_state.queue_data[ini];
- }
- }
-#endif /* __KERNEL_CUDA__ */
-}
-
-CCL_NAMESPACE_END
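
The deleted kernel_shader_sort reorders the active-ray queue by shader index so that neighbouring threads evaluate the same shader, using an in-place bitonic network over a power-of-two block in local memory. Below is a minimal single-threaded C++ sketch of that key sort; it is illustrative only and assumes the element count is a power of two, with empty slots carrying the key ~0u so they sink to the end as in the kernel.

#include <cstdint>
#include <utility>
#include <vector>

/* Reorder `indices` so that keys[indices[0..n)] is ascending. */
static void bitonic_sort_by_key(const std::vector<uint32_t> &keys, std::vector<uint32_t> &indices)
{
  const size_t n = indices.size(); /* must be a power of two */
  for (size_t length = 1; length < n; length <<= 1) {
    for (size_t inc = length; inc > 0; inc >>= 1) {
      for (size_t i = 0; i < n; i++) {
        const size_t j = i ^ inc;
        if (j <= i) {
          continue; /* each compare-exchange pair is handled once */
        }
        const bool descending = (i & (length << 1)) != 0;
        const bool out_of_order = descending ? (keys[indices[i]] < keys[indices[j]]) :
                                               (keys[indices[i]] > keys[indices[j]]);
        if (out_of_order) {
          std::swap(indices[i], indices[j]);
        }
      }
    }
  }
}
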
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
deleted file mode 100644
index 5d772fc597b..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for AO. */
-ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
-{
- unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < ao_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
-#ifdef __BRANCHED_PATH__
- }
- else {
- kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
- }
-#endif
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
deleted file mode 100644
index 5e46d300bca..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for direct visible light. */
-ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
-{
- unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < dl_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
-#ifdef __BRANCHED_PATH__
- /* TODO(mai): move this somewhere else? */
- if (thread_index == 0) {
- /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
- kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
- }
-#endif /* __BRANCHED_PATH__ */
-
- if (ray_index == QUEUE_EMPTY_SLOT)
- return;
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.light_ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- float3 throughput = kernel_split_state.throughput[ray_index];
-
- BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- bool is_lamp = kernel_split_state.is_lamp[ray_index];
-
-#if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
- bool use_branched = false;
- int all = 0;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- use_branched = true;
- all = 1;
- }
-# if defined(__BRANCHED_PATH__)
- else if (kernel_data.integrator.branched) {
- use_branched = true;
-
- if (IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- all = (kernel_data.integrator.sample_all_lights_indirect);
- }
- else {
- all = (kernel_data.integrator.sample_all_lights_direct);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
- if (use_branched) {
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
- }
- else
-#endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
- {
- /* trace shadow ray */
- float3 shadow;
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
deleted file mode 100644
index 5114f2b03e5..00000000000
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_H__
-#define __KERNEL_SPLIT_H__
-
-// clang-format off
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-
-#include "kernel/split/kernel_split_data.h"
-
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "kernel/kernels/opencl/kernel_opencl_image.h"
-#endif
-#ifdef __KERNEL_CUDA__
-# include "kernel/kernels/cuda/kernel_cuda_image.h"
-#endif
-#ifdef __KERNEL_CPU__
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-#endif
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_path.h"
-#ifdef __BRANCHED_PATH__
-# include "kernel/kernel_path_branched.h"
-#endif
-
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
-
-#ifdef __BRANCHED_PATH__
-# include "kernel/split/kernel_branched.h"
-#endif
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __BRANCHED_PATH__
-# ifdef __SUBSURFACE__
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
- if (ss_indirect->num_rays) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
- else
-# endif /* __SUBSURFACE__ */
- if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
- int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
-
- path_radiance_sum_indirect(L);
- path_radiance_accum_sample(orig_ray_L, L);
-
- atomic_fetch_and_dec_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
-#else
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
deleted file mode 100644
index decc537b39b..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_H__
-#define __KERNEL_SPLIT_DATA_H__
-
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "kernel/kernel_globals.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
-{
- (void)kg; /* Unused on CPU. */
-
- uint64_t size = 0;
-#define SPLIT_DATA_ENTRY(type, name, num) +align_up(num_elements *num * sizeof(type), 16)
- size = size SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- return size;
-}
-
-ccl_device_inline void split_data_init(KernelGlobals *kg,
- ccl_global SplitData *split_data,
- size_t num_elements,
- ccl_global void *data,
- ccl_global char *ray_state)
-{
- (void)kg; /* Unused on CPU. */
-
- ccl_global char *p = (ccl_global char *)data;
-
-#define SPLIT_DATA_ENTRY(type, name, num) \
- split_data->name = (type *)p; \
- p += align_up(num_elements * num * sizeof(type), 16);
- SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- split_data->_branched_state_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- split_data->_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- split_data->ray_state = ray_state;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_H__ */
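
split_data_buffer_size and split_data_init above implement one pattern: add up 16-byte-aligned, per-field array sizes to get a single flat allocation, then walk the same layout a second time to hand out the field pointers. A short C++ sketch of that structure-of-arrays carving follows, using two made-up fields in place of the SPLIT_DATA_ENTRY list.

#include <cstddef>

static inline size_t align_up_16(size_t size)
{
  return (size + 15) & ~static_cast<size_t>(15);
}

/* Two example fields standing in for the real SPLIT_DATA_ENTRY list. */
struct SplitArrays {
  float *throughput;
  int *is_lamp;
};

static size_t split_buffer_size(size_t num_elements)
{
  size_t size = 0;
  size += align_up_16(num_elements * sizeof(float)); /* throughput */
  size += align_up_16(num_elements * sizeof(int));   /* is_lamp */
  return size;
}

static void split_buffer_init(SplitArrays *arrays, char *data, size_t num_elements)
{
  char *p = data; /* data points at split_buffer_size(num_elements) bytes */
  arrays->throughput = reinterpret_cast<float *>(p);
  p += align_up_16(num_elements * sizeof(float));
  arrays->is_lamp = reinterpret_cast<int *>(p);
  p += align_up_16(num_elements * sizeof(int));
}
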
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
deleted file mode 100644
index 06bdce9947d..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
-#define __KERNEL_SPLIT_DATA_TYPES_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* Parameters used by the split kernels; we use a single struct to avoid passing these to each
- * kernel */
-
-typedef struct SplitParams {
- WorkTile tile;
- uint total_work_size;
-
- ccl_global unsigned int *work_pools;
-
- ccl_global int *queue_index;
- int queue_size;
- ccl_global char *use_queues_flag;
-
- /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
- int dummy_sd_flag;
-} SplitParams;
-
-/* Global memory variables [porting]; this memory is used for
- * co-operation between different kernels; Data written by one
- * kernel will be available to another kernel via this global
- * memory.
- */
-
-/* SPLIT_DATA_ENTRY(type, name, num) */
-
-#ifdef __BRANCHED_PATH__
-
-typedef ccl_global struct SplitBranchedState {
- /* various state that must be kept and restored after an indirect loop */
- PathState path_state;
- float3 throughput;
- Ray ray;
-
- Intersection isect;
-
- char ray_state;
-
- /* indirect loop state */
- int next_closure;
- int next_sample;
-
-# ifdef __SUBSURFACE__
- int ss_next_closure;
- int ss_next_sample;
- int next_hit;
- int num_hits;
-
- uint lcg_state;
- LocalIntersection ss_isect;
-# endif /* __SUBSURFACE__ */
-
- int shared_sample_count; /* number of branched samples shared with other threads */
- int original_ray; /* index of original ray when sharing branched samples */
- bool waiting_on_shared_samples;
-} SplitBranchedState;
-
-# define SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1) \
- SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0)
-#else
-# define SPLIT_DATA_BRANCHED_ENTRIES
-#endif /* __BRANCHED_PATH__ */
-
-#ifdef __SUBSURFACE__
-# define SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
-#else
-# define SPLIT_DATA_SUBSURFACE_ENTRIES
-#endif /* __SUBSURFACE__ */
-
-#ifdef __VOLUME__
-# define SPLIT_DATA_VOLUME_ENTRIES SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
-#else
-# define SPLIT_DATA_VOLUME_ENTRIES
-#endif /* __VOLUME__ */
-
-#define SPLIT_DATA_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY( \
- ccl_global int, queue_data, (NUM_QUEUES * 2)) /* TODO(mai): this is too large? */ \
- SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* Entries to be copied to inactive rays when sharing branched samples
- * (TODO: which are actually needed?) */
-#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* struct that holds pointers to data in the shared state buffer */
-typedef struct SplitData {
-#define SPLIT_DATA_ENTRY(type, name, num) type *name;
- SPLIT_DATA_ENTRIES
-#undef SPLIT_DATA_ENTRY
-
- /* this is actually in a separate buffer from the rest of the split state data (so it can be read
- * back from the host easily) but is still used the same way as the other data so we have it here in
- * this struct as well
- */
- ccl_global char *ray_state;
-} SplitData;
-
-#ifndef __KERNEL_CUDA__
-# define kernel_split_state (kg->split_data)
-# define kernel_split_params (kg->split_param_data)
-#else
-__device__ SplitData __split_data;
-# define kernel_split_state (__split_data)
-__device__ SplitParams __split_param_data;
-# define kernel_split_params (__split_param_data)
-#endif /* __KERNEL_CUDA__ */
-
-#define kernel_split_sd(sd, ray_index) \
- ((ShaderData *)(((ccl_global char *)kernel_split_state._##sd) + \
- (sizeof(ShaderData) + \
- sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1)) * \
- (ray_index)))
-
-/* Local storage for queue_enqueue kernel. */
-typedef struct QueueEnqueueLocals {
- uint queue_atomics[2];
-} QueueEnqueueLocals;
-
-/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
-typedef struct BackgroundAOLocals {
- uint queue_atomics_bg;
- uint queue_atomics_ao;
-} BackgroundAOLocals;
-
-typedef struct ShaderSortLocals {
- uint local_value[SHADER_SORT_BLOCK_SIZE];
- ushort local_index[SHADER_SORT_BLOCK_SIZE];
-} ShaderSortLocals;
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
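
The kernel_split_sd macro above indexes an array whose element size is only known at runtime, because every ShaderData is stored with (max_closures - 1) extra ShaderClosure slots appended to it. A hedged C++ sketch of that variable-stride indexing follows, with heavily simplified stand-in structs.

#include <cstddef>

/* Simplified stand-ins; the real ShaderData and ShaderClosure are much larger. */
struct ShaderClosure {
  float weight[4];
};
struct ShaderData {
  int num_closure;
  ShaderClosure closure[1]; /* first closure; the rest follow in the same allocation */
};

static inline ShaderData *shader_data_at(char *base, int max_closures, int ray_index)
{
  const size_t stride = sizeof(ShaderData) + sizeof(ShaderClosure) * (max_closures - 1);
  return reinterpret_cast<ShaderData *>(base + stride * static_cast<size_t>(ray_index));
}
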
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
deleted file mode 100644
index ba06ae3bc53..00000000000
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
-
-ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- branched_state->ss_next_closure = 0;
- branched_state->ss_next_sample = 0;
-
- branched_state->num_hits = 0;
- branched_state->next_hit = 0;
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(
- KernelGlobals *kg, int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- for (int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* set up random number generator */
- if (branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
- branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
- 0x68bc21eb);
- }
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for (int j = branched_state->ss_next_sample; j < num_samples; j++) {
- ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
- *hit_state = branched_state->path_state;
- hit_state->rng_hash = bssrdf_rng_hash;
- path_state_branch(hit_state, j, num_samples);
-
- ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect;
- float bssrdf_u, bssrdf_v;
- path_branched_rng_2D(
- kg, bssrdf_rng_hash, hit_state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- /* intersection is expensive, so avoid doing it multiple times for the same input */
- if (branched_state->next_hit == 0 && branched_state->next_closure == 0 &&
- branched_state->next_sample == 0) {
- uint lcg_state = branched_state->lcg_state;
- LocalIntersection ss_isect_private;
-
- branched_state->num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect_private, sd, hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- branched_state->lcg_state = lcg_state;
- *ss_isect = ss_isect_private;
- }
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = branched_state->ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
- ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index);
- *bssrdf_sd = *sd; /* note: copy happens each iteration of inner loop, this is
- * important as the indirect path will write into bssrdf_sd */
-
- LocalIntersection ss_isect_private = *ss_isect;
- subsurface_scatter_multi_setup(
- kg, &ss_isect_private, hit, bssrdf_sd, hit_state, bssrdf_type, bssrdf_roughness);
- *ss_isect = ss_isect_private;
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- if (branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg,
- bssrdf_sd,
- emission_sd,
- hit_state,
- branched_state->throughput,
- num_samples_inv,
- L,
- all);
- }
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, num_samples_inv, bssrdf_sd, false, false)) {
- branched_state->ss_next_closure = i;
- branched_state->ss_next_sample = j;
- branched_state->next_hit = hit;
-
- return true;
- }
-
- branched_state->next_closure = 0;
- }
-
- branched_state->next_hit = 0;
- }
-
- branched_state->ss_next_sample = 0;
- }
-
- branched_state->ss_next_closure = sd->num_closure;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
-
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- if (sd->flag & SD_BSSRDF) {
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, throughput, ss_indirect)) {
- kernel_split_path_end(kg, ray_index);
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif
- }
- }
-
-# ifdef __BRANCHED_PATH__
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
- }
-
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 000da1fa615..4aee1ef11b3 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -48,16 +48,18 @@ ccl_device_inline float3 stack_load_float3(float *stack, uint a)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- return make_float3(stack[a + 0], stack[a + 1], stack[a + 2]);
+ float *stack_a = stack + a;
+ return make_float3(stack_a[0], stack_a[1], stack_a[2]);
}
ccl_device_inline void stack_store_float3(float *stack, uint a, float3 f)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- stack[a + 0] = f.x;
- stack[a + 1] = f.y;
- stack[a + 2] = f.z;
+ float *stack_a = stack + a;
+ stack_a[0] = f.x;
+ stack_a[1] = f.y;
+ stack_a[2] = f.z;
}
ccl_device_inline float stack_load_float(float *stack, uint a)
@@ -105,14 +107,14 @@ ccl_device_inline bool stack_valid(uint a)
/* Reading Nodes */
-ccl_device_inline uint4 read_node(KernelGlobals *kg, int *offset)
+ccl_device_inline uint4 read_node(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
(*offset)++;
return node;
}
-ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
+ccl_device_inline float4 read_node_float(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
float4 f = make_float4(__uint_as_float(node.x),
@@ -123,7 +125,7 @@ ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
return f;
}
-ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset)
+ccl_device_inline float4 fetch_node_float(const KernelGlobals *kg, int offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, offset);
return make_float4(__uint_as_float(node.x),
@@ -217,26 +219,11 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
/* Main Interpreter Loop */
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void svm_eval_nodes(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
-{
- optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag);
-}
-extern "C" __device__ void __direct_callable__svm_eval_nodes(
-#else
-ccl_device_noinline void svm_eval_nodes(
-#endif
- KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
+template<uint node_feature_mask, ShaderType type>
+ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ ccl_global float *render_buffer,
+ int path_flag)
{
float stack[SVM_STACK_SIZE];
int offset = sd->shader & SHADER_MASK;
@@ -247,7 +234,6 @@ ccl_device_noinline void svm_eval_nodes(
switch (node.x) {
case NODE_END:
return;
-#if NODES_GROUP(NODE_GROUP_LEVEL_0)
case NODE_SHADER_JUMP: {
if (type == SHADER_TYPE_SURFACE)
offset = node.y;
@@ -260,13 +246,18 @@ ccl_device_noinline void svm_eval_nodes(
break;
}
case NODE_CLOSURE_BSDF:
- svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset);
+ offset = svm_node_closure_bsdf<node_feature_mask, type>(
+ kg, sd, stack, node, path_flag, offset);
break;
case NODE_CLOSURE_EMISSION:
- svm_node_closure_emission(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_emission(sd, stack, node);
+ }
break;
case NODE_CLOSURE_BACKGROUND:
- svm_node_closure_background(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_background(sd, stack, node);
+ }
break;
case NODE_CLOSURE_SET_WEIGHT:
svm_node_closure_set_weight(sd, node.y, node.z, node.w);
@@ -275,7 +266,9 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_closure_weight(sd, stack, node.y);
break;
case NODE_EMISSION_WEIGHT:
- svm_node_emission_weight(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_emission_weight(kg, sd, stack, node);
+ }
break;
case NODE_MIX_CLOSURE:
svm_node_mix_closure(sd, stack, node);
@@ -295,86 +288,108 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_TEX_COORD:
- svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
+ offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
break;
case NODE_VALUE_F:
svm_node_value_f(kg, sd, stack, node.y, node.z);
break;
case NODE_VALUE_V:
- svm_node_value_v(kg, sd, stack, node.y, &offset);
+ offset = svm_node_value_v(kg, sd, stack, node.y, offset);
break;
case NODE_ATTR:
- svm_node_attr(kg, sd, stack, node);
+ svm_node_attr<node_feature_mask>(kg, sd, stack, node);
break;
case NODE_VERTEX_COLOR:
svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_GEOMETRY_BUMP_DX:
- svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_GEOMETRY_BUMP_DY:
- svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_SET_DISPLACEMENT:
- svm_node_set_displacement(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_displacement(kg, sd, stack, node.y);
+ }
break;
case NODE_DISPLACEMENT:
- svm_node_displacement(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_displacement(kg, sd, stack, node);
+ }
break;
case NODE_VECTOR_DISPLACEMENT:
- svm_node_vector_displacement(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_vector_displacement(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_TEX_IMAGE:
- svm_node_tex_image(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_image(kg, sd, stack, node, offset);
break;
case NODE_TEX_IMAGE_BOX:
svm_node_tex_image_box(kg, sd, stack, node);
break;
case NODE_TEX_NOISE:
- svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_SET_BUMP:
- svm_node_set_bump(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_bump(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DX:
- svm_node_attr_bump_dx(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dx(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DY:
- svm_node_attr_bump_dy(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dy(kg, sd, stack, node);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DX:
- svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DY:
- svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_TEX_COORD_BUMP_DX:
- svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_TEX_COORD_BUMP_DY:
- svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_CLOSURE_SET_NORMAL:
- svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ }
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
case NODE_ENTER_BUMP_EVAL:
- svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ }
break;
case NODE_LEAVE_BUMP_EVAL:
- svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_HSV:
- svm_node_hsv(kg, sd, stack, node, &offset);
+ svm_node_hsv(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_1)
case NODE_CLOSURE_HOLDOUT:
svm_node_closure_holdout(sd, stack, node);
break;
@@ -384,22 +399,24 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_LAYER_WEIGHT:
svm_node_layer_weight(sd, stack, node);
break;
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_CLOSURE_VOLUME:
- svm_node_closure_volume(kg, sd, stack, node, type);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ svm_node_closure_volume<type>(kg, sd, stack, node);
+ }
break;
case NODE_PRINCIPLED_VOLUME:
- svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_MATH:
- svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_math(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_VECTOR_MATH:
- svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_RGB_RAMP:
- svm_node_rgb_ramp(kg, sd, stack, node, &offset);
+ offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
break;
case NODE_GAMMA:
svm_node_gamma(sd, stack, node.y, node.z, node.w);
@@ -408,7 +425,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_brightness(sd, stack, node.y, node.z, node.w);
break;
case NODE_LIGHT_PATH:
- svm_node_light_path(sd, state, stack, node.y, node.z, path_flag);
+ svm_node_light_path(INTEGRATOR_STATE_PASS, sd, stack, node.y, node.z, path_flag);
break;
case NODE_OBJECT_INFO:
svm_node_object_info(kg, sd, stack, node.y, node.z);
@@ -416,22 +433,22 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_PARTICLE_INFO:
svm_node_particle_info(kg, sd, stack, node.y, node.z);
break;
-# if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR)
+#if defined(__HAIR__)
case NODE_HAIR_INFO:
- svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(HAIR)) {
+ svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_2)
case NODE_TEXTURE_MAPPING:
- svm_node_texture_mapping(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_MAPPING:
- svm_node_mapping(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_MIN_MAX:
- svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_CAMERA:
svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
@@ -440,47 +457,46 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_tex_environment(kg, sd, stack, node);
break;
case NODE_TEX_SKY:
- svm_node_tex_sky(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_sky(kg, sd, stack, node, offset);
break;
case NODE_TEX_GRADIENT:
svm_node_tex_gradient(sd, stack, node);
break;
case NODE_TEX_VORONOI:
- svm_node_tex_voronoi(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_voronoi<node_feature_mask>(
+ kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_MUSGRAVE:
- svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_WAVE:
- svm_node_tex_wave(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_wave(kg, sd, stack, node, offset);
break;
case NODE_TEX_MAGIC:
- svm_node_tex_magic(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_magic(kg, sd, stack, node, offset);
break;
case NODE_TEX_CHECKER:
svm_node_tex_checker(kg, sd, stack, node);
break;
case NODE_TEX_BRICK:
- svm_node_tex_brick(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_brick(kg, sd, stack, node, offset);
break;
case NODE_TEX_WHITE_NOISE:
- svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_NORMAL:
- svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_LIGHT_FALLOFF:
svm_node_light_falloff(sd, stack, node);
break;
case NODE_IES:
- svm_node_ies(kg, sd, stack, node, &offset);
+ svm_node_ies(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_3)
case NODE_RGB_CURVES:
case NODE_VECTOR_CURVES:
- svm_node_curves(kg, sd, stack, node, &offset);
+ offset = svm_node_curves(kg, sd, stack, node, offset);
break;
case NODE_TANGENT:
svm_node_tangent(kg, sd, stack, node);
@@ -492,7 +508,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_invert(sd, stack, node.y, node.z, node.w);
break;
case NODE_MIX:
- svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_SEPARATE_VECTOR:
svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
@@ -501,10 +517,10 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
break;
case NODE_SEPARATE_HSV:
- svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_COMBINE_HSV:
- svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_VECTOR_ROTATE:
svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
@@ -522,39 +538,36 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_blackbody(kg, sd, stack, node.y, node.z);
break;
case NODE_MAP_RANGE:
- svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_CLAMP:
- svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# ifdef __SHADER_RAYTRACE__
+#ifdef __SHADER_RAYTRACE__
case NODE_BEVEL:
- svm_node_bevel(kg, sd, state, stack, node);
+ svm_node_bevel<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
case NODE_AMBIENT_OCCLUSION:
- svm_node_ao(kg, sd, state, stack, node);
+ svm_node_ao<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
-# endif /* __SHADER_RAYTRACE__ */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_4)
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_TEX_VOXEL:
- svm_node_tex_voxel(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_AOV_START:
- if (!svm_node_aov_check(state, buffer)) {
+ if (!svm_node_aov_check(path_flag, render_buffer)) {
return;
}
break;
case NODE_AOV_COLOR:
- svm_node_aov_color(kg, sd, stack, node, buffer);
+ svm_node_aov_color(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
case NODE_AOV_VALUE:
- svm_node_aov_value(kg, sd, stack, node, buffer);
+ svm_node_aov_value(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */
default:
kernel_assert(!"Unknown node type was passed to the SVM machine");
return;
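
The svm.h changes above replace the old NODES_GROUP/NODES_FEATURE preprocessor gating with a template parameter: svm_eval_nodes is specialized per kernel on a node feature bit-mask, and the KERNEL_NODES_FEATURE checks become compile-time constant branches that the compiler strips from specializations that do not need them. The following C++ sketch illustrates the mechanism with made-up feature bits and node handling; it is not the actual Cycles macro set.

#include <cstdint>

/* Illustrative feature bits; the real mask lives in the Cycles kernel headers. */
enum : uint32_t {
  FEATURE_NODE_BUMP = 1u << 0,
  FEATURE_NODE_VOLUME = 1u << 1,
};

template<uint32_t node_feature_mask> static float eval_node(uint32_t node_type, float input)
{
  switch (node_type) {
    case 0: /* bump-style node */
      if (node_feature_mask & FEATURE_NODE_BUMP) {
        return input * 2.0f; /* stand-in for the real node evaluation */
      }
      return input; /* dead code in kernels compiled without BUMP support */
    case 1: /* volume-style node */
      if (node_feature_mask & FEATURE_NODE_VOLUME) {
        return input + 1.0f;
      }
      return input;
    default:
      return input;
  }
}

/* A surface-only specialization drops all volume handling at compile time: */
static float eval_surface_only(uint32_t node_type, float input)
{
  return eval_node<FEATURE_NODE_BUMP>(node_type, input);
}
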
diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h
index 4cb986b897a..34ac2cb8fbf 100644
--- a/intern/cycles/kernel/svm/svm_ao.h
+++ b/intern/cycles/kernel/svm/svm_ao.h
@@ -14,20 +14,25 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
-ccl_device_noinline float svm_ao(KernelGlobals *kg,
- ShaderData *sd,
- float3 N,
- ccl_addr_space PathState *state,
- float max_dist,
- int num_samples,
- int flags)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float __direct_callable__svm_node_ao(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float svm_ao(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float3 N,
+ float max_dist,
+ int num_samples,
+ int flags)
{
if (flags & NODE_AO_GLOBAL_RADIUS) {
- max_dist = kernel_data.background.ao_distance;
+ max_dist = kernel_data.integrator.ao_bounces_distance;
}
/* Early out if no sampling needed. */
@@ -47,11 +52,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
float3 T, B;
make_orthonormals(N, &T, &B);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
int unoccluded = 0;
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
float2 d = concentric_sample_disk(disk_u, disk_v);
float3 D = make_float3(d.x, d.y, safe_sqrtf(1.0f - dot(d, d)));
@@ -62,8 +70,8 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
ray.D = D.x * T + D.y * B + D.z * N;
ray.t = max_dist;
ray.time = sd->time;
- ray.dP = sd->dP;
- ray.dD = differential3_zero();
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
if (flags & NODE_AO_ONLY_LOCAL) {
if (!scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0)) {
@@ -81,8 +89,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
return ((float)unoccluded) / num_samples;
}
-ccl_device void svm_node_ao(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint flags, dist_offset, normal_offset, out_ao_offset;
svm_unpack_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset);
@@ -92,7 +106,16 @@ ccl_device void svm_node_ao(
float dist = stack_load_float_default(stack, dist_offset, node.w);
float3 normal = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N;
- float ao = svm_ao(kg, sd, normal, state, dist, samples, flags);
+
+ float ao = 1.0f;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ ao = optixDirectCall<float>(0, INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# else
+ ao = svm_ao(INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# endif
+ }
if (stack_valid(out_ao_offset)) {
stack_store_float(stack, out_ao_offset, ao);
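
The updated svm_ao above keeps the estimator itself simple: draw cosine-weighted directions around the shading normal by sampling a disk and lifting it onto the hemisphere, trace a short ray per sample, and return the unoccluded fraction. A self-contained C++ sketch of that estimator is given below; it uses a plain polar disk mapping rather than the concentric mapping Cycles uses, and the callback names are illustrative.

#include <cmath>
#include <functional>

struct Vec3 {
  float x, y, z;
};

/* Cosine-weighted hemisphere direction in a local (T, B, N) frame. */
static Vec3 cosine_hemisphere_sample(float u, float v)
{
  const float r = std::sqrt(u);
  const float phi = 2.0f * 3.14159265358979f * v;
  const float x = r * std::cos(phi);
  const float y = r * std::sin(phi);
  const float z2 = 1.0f - x * x - y * y;
  return {x, y, std::sqrt(z2 > 0.0f ? z2 : 0.0f)};
}

static float ambient_occlusion(int num_samples,
                               const std::function<void(int, float *, float *)> &rng_2d,
                               const std::function<bool(const Vec3 &)> &occluded)
{
  int unoccluded = 0;
  for (int s = 0; s < num_samples; s++) {
    float u, v;
    rng_2d(s, &u, &v);
    if (!occluded(cosine_hemisphere_sample(u, v))) {
      unoccluded++;
    }
  }
  return static_cast<float>(unoccluded) / static_cast<float>(num_samples);
}
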
diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h
index 899e466d099..26dec9717b3 100644
--- a/intern/cycles/kernel/svm/svm_aov.h
+++ b/intern/cycles/kernel/svm/svm_aov.h
@@ -14,36 +14,50 @@
* limitations under the License.
*/
+#include "kernel/kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool svm_node_aov_check(ccl_addr_space PathState *state,
- ccl_global float *buffer)
+ccl_device_inline bool svm_node_aov_check(const int path_flag, ccl_global float *render_buffer)
{
- int path_flag = state->flag;
-
bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE));
- return ((buffer != NULL) && is_primary);
+ return ((render_buffer != NULL) && is_primary);
}
-ccl_device void svm_node_aov_color(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_color(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float3 val = stack_load_float3(stack, node.y);
- if (buffer) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_aov_color + 4 * node.z,
- make_float4(val.x, val.y, val.z, 1.0f));
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_color + node.z);
+ kernel_write_pass_float3(buffer, make_float3(val.x, val.y, val.z));
}
}
-ccl_device void svm_node_aov_value(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_value(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float val = stack_load_float(stack, node.y);
- if (buffer) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_aov_value + node.z, val);
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_value + node.z);
+ kernel_write_pass_float(buffer, val);
}
}
CCL_NAMESPACE_END
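
The rewritten AOV nodes above no longer receive a per-tile buffer pointer; they locate the output pixel directly in the full render buffer by multiplying the integrator's render pixel index by the film pass stride and adding the AOV pass offset. A short C++ sketch of that addressing, with illustrative parameter names, is below.

#include <cstdint>
#include <vector>

/* render_buffer holds pass_stride floats per pixel; an AOV value accumulates
 * at pixel_index * pass_stride + pass_offset, in the spirit of
 * kernel_write_pass_float. */
static void write_aov_value(std::vector<float> &render_buffer,
                            uint32_t pixel_index,
                            uint32_t pass_stride,
                            uint32_t pass_offset,
                            float value)
{
  const uint64_t offset = static_cast<uint64_t>(pixel_index) * pass_stride + pass_offset;
  render_buffer[offset] += value;
}
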
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 62740824ad1..5f94b20af73 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -18,8 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Attribute Node */
-ccl_device AttributeDescriptor svm_node_attr_init(
- KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeOutputType *type, uint *out_offset)
+ccl_device AttributeDescriptor svm_node_attr_init(const KernelGlobals *kg,
+ ShaderData *sd,
+ uint4 node,
+ NodeAttributeOutputType *type,
+ uint *out_offset)
{
*out_offset = node.z;
*type = (NodeAttributeOutputType)node.w;
@@ -44,31 +47,37 @@ ccl_device AttributeDescriptor svm_node_attr_init(
return desc;
}
-ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+template<uint node_feature_mask>
+ccl_device_noinline void svm_node_attr(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
#ifdef __VOLUME__
- /* Volumes
- * NOTE: moving this into its own node type might help improve performance. */
- if (primitive_is_volume_attribute(sd, desc)) {
- const float4 value = volume_attribute_float4(kg, sd, desc);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ /* Volumes
+ * NOTE: moving this into its own node type might help improve performance. */
+ if (primitive_is_volume_attribute(sd, desc)) {
+ const float4 value = volume_attribute_float4(kg, sd, desc);
- if (type == NODE_ATTR_OUTPUT_FLOAT) {
- const float f = volume_attribute_value_to_float(value);
- stack_store_float(stack, out_offset, f);
- }
- else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
- const float3 f = volume_attribute_value_to_float3(value);
- stack_store_float3(stack, out_offset, f);
+ if (type == NODE_ATTR_OUTPUT_FLOAT) {
+ const float f = volume_attribute_value_to_float(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
+ const float3 f = volume_attribute_value_to_float3(value);
+ stack_store_float3(stack, out_offset, f);
+ }
+ else {
+ const float f = volume_attribute_value_to_alpha(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ return;
}
- else {
- const float f = volume_attribute_value_to_alpha(value);
- stack_store_float(stack, out_offset, f);
- }
- return;
}
#endif
@@ -139,7 +148,10 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
}
}
-ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
@@ -232,7 +244,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *
}
}
-ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index bf5957ec9e4..aab089d19ea 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -14,21 +14,95 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
+/* Planar Cubic BSSRDF falloff, reused for bevel.
+ *
+ * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
+ * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
+ * far as I can tell has no closed form solution. So we get an iterative solution
+ * instead with newton-raphson. */
+
+ccl_device float svm_bevel_cubic_eval(const float radius, float r)
+{
+ const float Rm = radius;
+
+ if (r >= Rm)
+ return 0.0f;
+
+ /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */
+ const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
+ const float f = Rm - r;
+ const float num = f * f * f;
+
+ return (10.0f * num) / (Rm5 * M_PI_F);
+}
+
+ccl_device float svm_bevel_cubic_pdf(const float radius, float r)
+{
+ return svm_bevel_cubic_eval(radius, r);
+}
+
+/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
+ccl_device_forceinline float svm_bevel_cubic_quintic_root_find(float xi)
+{
+ /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
+ * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
+ * should not be too bad */
+ const float tolerance = 1e-6f;
+ const int max_iteration_count = 10;
+ float x = 0.25f;
+ int i;
+
+ for (i = 0; i < max_iteration_count; i++) {
+ float x2 = x * x;
+ float x3 = x2 * x;
+ float nx = (1.0f - x);
+
+ float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
+ float f_ = 20.0f * (x * nx) * (nx * nx);
+
+ if (fabsf(f) < tolerance || f_ == 0.0f)
+ break;
+
+ x = saturate(x - f / f_);
+ }
+
+ return x;
+}
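For clarity, the quintic being solved is the CDF of the same pdf: integrating from $0$ to $r$ and substituting $x = r/R_m$ gives

\[
\mathrm{CDF}(x) = \frac{20}{R_m^5}\int_0^{x R_m} s\,(R_m - s)^3\,\mathrm{d}s
  = 10x^2 - 20x^3 + 15x^4 - 4x^5,
\qquad
\mathrm{CDF}'(x) = 20\,x\,(1 - x)^3 ,
\]

which is exactly the residual f and the factored derivative f_ used in the Newton iteration above.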
+
+ccl_device void svm_bevel_cubic_sample(const float radius, float xi, float *r, float *h)
+{
+ float Rm = radius;
+ float r_ = svm_bevel_cubic_quintic_root_find(xi);
+
+ r_ *= Rm;
+ *r = r_;
+
+ /* h^2 + r^2 = Rm^2 */
+ *h = safe_sqrtf(Rm * Rm - r_ * r_);
+}
+
/* Bevel shader averaging normals from nearby surfaces.
*
* Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013
* http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
*/
-ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float radius,
- int num_samples)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float3 __direct_callable__svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float3 svm_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float radius,
+ int num_samples)
{
/* Early out if no sampling needed. */
if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) {
@@ -41,21 +115,27 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
}
/* Don't bevel for blurry indirect rays. */
- if (state->min_ray_pdf < 8.0f) {
+ if (INTEGRATOR_STATE(path, min_ray_pdf) < 8.0f) {
return sd->N;
}
/* Setup for multi intersection. */
LocalIntersection isect;
- uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e);
+ uint lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0x64c6a40e);
/* Sample normals from surrounding points on surface. */
float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
/* Pick random axis in local frame and point on disk. */
float3 disk_N, disk_T, disk_B;
@@ -97,7 +177,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float disk_height;
/* Perhaps find something better than Cubic BSSRDF, but happens to work well. */
- bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height);
+ svm_bevel_cubic_sample(radius, disk_r, &disk_r, &disk_height);
float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
@@ -106,8 +186,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
ray->P = sd->P + disk_N * disk_height + disk_P;
ray->D = -disk_N;
ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
+ ray->dP = differential_zero_compact();
+ ray->dD = differential_zero_compact();
ray->time = sd->time;
/* Intersect with the same object. if multiple intersections are found it
@@ -120,14 +200,16 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
/* Quickly retrieve P and Ng without setting up ShaderData. */
float3 hit_P;
if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &isect.hits[hit], ray);
+ hit_P = triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim);
}
# ifdef __OBJECT_MOTION__
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
float3 verts[3];
motion_triangle_vertices(
kg, sd->object, kernel_tex_fetch(__prim_index, isect.hits[hit].prim), sd->time, verts);
- hit_P = motion_triangle_refine_local(kg, sd, &isect.hits[hit], ray, verts);
+ hit_P = motion_triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim, verts);
}
# endif /* __OBJECT_MOTION__ */
@@ -183,8 +265,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float r = len(hit_P - sd->P);
/* Compute weight. */
- float pdf = bssrdf_cubic_pdf(radius, 0.0f, r);
- float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r);
+ float pdf = svm_bevel_cubic_pdf(radius, r);
+ float disk_pdf = svm_bevel_cubic_pdf(radius, disk_r);
w *= pdf / disk_pdf;
@@ -198,19 +280,34 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N;
}
-ccl_device void svm_node_bevel(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint num_samples, radius_offset, normal_offset, out_offset;
svm_unpack_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset);
float radius = stack_load_float(stack, radius_offset);
- float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples);
- if (stack_valid(normal_offset)) {
- /* Preserve input normal. */
- float3 ref_N = stack_load_float3(stack, normal_offset);
- bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ float3 bevel_N = sd->N;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ bevel_N = optixDirectCall<float3>(1, INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# else
+ bevel_N = svm_bevel(INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# endif
+
+ if (stack_valid(normal_offset)) {
+ /* Preserve input normal. */
+ float3 ref_N = stack_load_float3(stack, normal_offset);
+ bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ }
}
stack_store_float3(stack, out_offset, bevel_N);
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index adfc50d961e..96b3703b954 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -34,8 +34,11 @@ CCL_NAMESPACE_BEGIN
/* Blackbody Node */
-ccl_device void svm_node_blackbody(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
+ccl_device_noinline void svm_node_blackbody(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint temperature_offset,
+ uint col_offset)
{
/* Input */
float temperature = stack_load_float(stack, temperature_offset);
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 6984afa30a5..dca1b220dd5 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -72,12 +72,12 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p,
return make_float2(tint, mortar);
}
-ccl_device void svm_node_tex_brick(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_brick(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
- uint4 node4 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
+ uint4 node4 = read_node(kg, &offset);
/* Input and Output Sockets */
uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset;
@@ -133,6 +133,7 @@ ccl_device void svm_node_tex_brick(
stack_store_float3(stack, color_offset, color1 * (1.0f - f) + mortar * f);
if (stack_valid(fac_offset))
stack_store_float(stack, fac_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
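A pattern that repeats across this patch: nodes that read extra uint4 words from the node stream now take the read offset by value and return the advanced offset, instead of mutating an int * shared with the caller. A simplified caller-side sketch (the loop shape and starting offset are illustrative assumptions, not quoted from the patch):

int offset = shader_node_stream_start; /* assumed starting offset for illustration */
for (;;) {
  uint4 node = read_node(kg, &offset);
  if (node.x == NODE_END) {
    break;
  }
  if (node.x == NODE_TEX_BRICK) {
    /* The node consumes its extra data words and hands back the new offset. */
    offset = svm_node_tex_brick(kg, sd, stack, node, offset);
  }
  /* ... dispatch for other node types ... */
}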
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 9554b5946fb..2ed812acd71 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_brightness(
+ccl_device_noinline void svm_node_brightness(
ShaderData *sd, float *stack, uint in_color, uint out_color, uint node)
{
uint bright_offset, contrast_offset;
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index c9d430a2bba..8672839dbab 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Bump Eval Nodes */
-ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_enter_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* save state */
stack_store_float3(stack, offset + 0, sd->P);
@@ -45,10 +45,10 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
}
}
-ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_leave_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* restore state */
sd->P = stack_load_float3(stack, offset + 0);
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 21a17acf5f1..40c0edcdad0 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_camera(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint out_vector,
- uint out_zdepth,
- uint out_distance)
+ccl_device_noinline void svm_node_camera(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint out_vector,
+ uint out_zdepth,
+ uint out_distance)
{
float distance;
float zdepth;
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index d54cb73df91..a9919c9ddc9 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -32,7 +32,10 @@ ccl_device float svm_checker(float3 p)
return ((xi % 2 == yi % 2) == (zi % 2)) ? 1.0f : 0.0f;
}
-ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_checker(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint co_offset, color1_offset, color2_offset, scale_offset;
uint color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_clamp.h b/intern/cycles/kernel/svm/svm_clamp.h
index a85fd82754e..656bd31c085 100644
--- a/intern/cycles/kernel/svm/svm_clamp.h
+++ b/intern/cycles/kernel/svm/svm_clamp.h
@@ -18,18 +18,18 @@ CCL_NAMESPACE_BEGIN
/* Clamp Node */
-ccl_device void svm_node_clamp(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline int svm_node_clamp(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint result_stack_offset,
+ int offset)
{
uint min_stack_offset, max_stack_offset, type;
svm_unpack_node_uchar3(parameters_stack_offsets, &min_stack_offset, &max_stack_offset, &type);
- uint4 defaults = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float min = stack_load_float_default(stack, min_stack_offset, defaults.x);
@@ -41,6 +41,7 @@ ccl_device void svm_node_clamp(KernelGlobals *kg,
else {
stack_store_float(stack, result_stack_offset, clamp(value, min, max));
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index bbe8d72edf0..e2f6dde4ace 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -57,13 +57,9 @@ ccl_device void svm_node_glass_setup(
}
}
-ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<uint node_feature_mask, ShaderType shader_type>
+ccl_device_noinline int svm_node_closure_bsdf(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
uint type, param1_offset, param2_offset;
@@ -73,19 +69,19 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
1.0f);
/* note we read this extra node before weight check, so offset is added */
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
/* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */
- if (mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) {
+ if ((!KERNEL_NODES_FEATURE(BSDF) || shader_type != SHADER_TYPE_SURFACE) || mix_weight == 0.0f) {
if (type == CLOSURE_BSDF_PRINCIPLED_ID) {
/* Read all principled BSDF extra data to get the right offset. */
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
}
- return;
+ return offset;
}
float3 N = stack_valid(data_node.x) ? stack_load_float3(stack, data_node.x) : sd->N;
@@ -102,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset,
eta_offset, transmission_offset, anisotropic_rotation_offset,
transmission_roughness_offset;
- uint4 data_node2 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
float3 T = stack_load_float3(stack, data_node.y);
svm_unpack_node_uchar4(data_node.z,
@@ -158,7 +154,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
float specular_weight = (1.0f - final_transmission);
// get the base color
- uint4 data_base_color = read_node(kg, offset);
+ uint4 data_base_color = read_node(kg, &offset);
float3 base_color = stack_valid(data_base_color.x) ?
stack_load_float3(stack, data_base_color.x) :
make_float3(__uint_as_float(data_base_color.y),
@@ -166,16 +162,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
__uint_as_float(data_base_color.w));
// get the additional clearcoat normal and subsurface scattering radius
- uint4 data_cn_ssr = read_node(kg, offset);
+ uint4 data_cn_ssr = read_node(kg, &offset);
float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ?
stack_load_float3(stack, data_cn_ssr.x) :
sd->N;
float3 subsurface_radius = stack_valid(data_cn_ssr.y) ?
stack_load_float3(stack, data_cn_ssr.y) :
make_float3(1.0f, 1.0f, 1.0f);
+ float subsurface_ior = stack_valid(data_cn_ssr.z) ? stack_load_float(stack, data_cn_ssr.z) :
+ 1.4f;
+ float subsurface_anisotropy = stack_valid(data_cn_ssr.w) ?
+ stack_load_float(stack, data_cn_ssr.w) :
+ 0.0f;
// get the subsurface color
- uint4 data_subsurface_color = read_node(kg, offset);
+ uint4 data_subsurface_color = read_node(kg, &offset);
float3 subsurface_color = stack_valid(data_subsurface_color.x) ?
stack_load_float3(stack, data_subsurface_color.x) :
make_float3(__uint_as_float(data_subsurface_color.y),
@@ -222,16 +223,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
if (bssrdf) {
bssrdf->radius = subsurface_radius * subsurface;
- bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID) ?
- subsurface_color :
- mixed_ss_base_color;
- bssrdf->texture_blur = 0.0f;
- bssrdf->sharpness = 0.0f;
+ bssrdf->albedo = mixed_ss_base_color;
bssrdf->N = N;
bssrdf->roughness = roughness;
+ /* Clamps protecting against bad/extreme and non physical values. */
+ subsurface_ior = clamp(subsurface_ior, 1.01f, 3.8f);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
/* setup bsdf */
- sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method);
+ sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method, subsurface_ior);
}
}
}
@@ -733,9 +734,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
}
#ifdef __HAIR__
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID: {
- uint4 data_node2 = read_node(kg, offset);
- uint4 data_node3 = read_node(kg, offset);
- uint4 data_node4 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
+ uint4 data_node3 = read_node(kg, &offset);
+ uint4 data_node4 = read_node(kg, &offset);
float3 weight = sd->svm_closure_weight * mix_weight;
@@ -878,10 +879,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
#endif /* __HAIR__ */
#ifdef __SUBSURFACE__
- case CLOSURE_BSSRDF_CUBIC_ID:
- case CLOSURE_BSSRDF_GAUSSIAN_ID:
- case CLOSURE_BSSRDF_BURLEY_ID:
- case CLOSURE_BSSRDF_RANDOM_WALK_ID: {
+ case CLOSURE_BSSRDF_RANDOM_WALK_ID:
+ case CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
@@ -894,11 +893,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
bssrdf->radius = stack_load_float3(stack, data_node.z) * param1;
bssrdf->albedo = sd->svm_closure_weight;
- bssrdf->texture_blur = param2;
- bssrdf->sharpness = stack_load_float(stack, data_node.w);
bssrdf->N = N;
- bssrdf->roughness = 0.0f;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->roughness = FLT_MAX;
+
+ const float subsurface_ior = clamp(param2, 1.01f, 3.8f);
+ const float subsurface_anisotropy = stack_load_float(stack, data_node.w);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, subsurface_ior);
}
break;
@@ -907,10 +909,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
default:
break;
}
+
+ return offset;
}
-ccl_device void svm_node_closure_volume(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type)
+template<ShaderType shader_type>
+ccl_device_noinline void svm_node_closure_volume(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __VOLUME__
/* Only sum extinction for volumes, variable is shared with surface transparency. */
@@ -961,21 +968,17 @@ ccl_device void svm_node_closure_volume(
#endif
}
-ccl_device void svm_node_principled_volume(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<ShaderType shader_type>
+ccl_device_noinline int svm_node_principled_volume(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
#ifdef __VOLUME__
- uint4 value_node = read_node(kg, offset);
- uint4 attr_node = read_node(kg, offset);
+ uint4 value_node = read_node(kg, &offset);
+ uint4 attr_node = read_node(kg, &offset);
/* Only sum extinction for volumes, variable is shared with surface transparency. */
if (shader_type != SHADER_TYPE_VOLUME) {
- return;
+ return offset;
}
uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset;
@@ -985,7 +988,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
1.0f);
if (mix_weight == 0.0f) {
- return;
+ return offset;
}
/* Compute density. */
@@ -1034,7 +1037,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
/* Compute emission. */
if (path_flag & PATH_RAY_SHADOW) {
/* Don't need emission for shadows. */
- return;
+ return offset;
}
uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset;
@@ -1074,9 +1077,10 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
}
}
#endif
+ return offset;
}
-ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1093,7 +1097,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
emission_setup(sd, weight);
}
-ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1110,7 +1114,7 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
background_setup(sd, weight);
}
-ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
@@ -1145,14 +1149,13 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint
ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
{
float3 weight = stack_load_float3(stack, weight_offset);
-
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_emission_weight(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_emission_weight(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset = node.y;
uint strength_offset = node.z;
@@ -1163,7 +1166,7 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg,
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
{
/* fetch weight from blend input, previous mix closures,
* and write to stack to be used by closure nodes later */
@@ -1186,7 +1189,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
/* (Bump) normal */
ccl_device void svm_node_set_normal(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
{
float3 normal = stack_load_float3(stack, in_direction);
sd->N = normal;
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 5df6c9fb755..37d40167ccc 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Conversion Nodes */
-ccl_device void svm_node_convert(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
+ccl_device_noinline void svm_node_convert(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
{
switch (type) {
case NODE_CONVERT_FI: {
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 250fac6bcb8..a1d952173d8 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -14,11 +14,16 @@
* limitations under the License.
*/
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Bump Node */
-ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_set_bump(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __RAY_DIFFERENTIALS__
/* get normal input */
@@ -83,7 +88,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
/* Displacement Node */
-ccl_device void svm_node_set_displacement(KernelGlobals *kg,
+ccl_device void svm_node_set_displacement(const KernelGlobals *kg,
ShaderData *sd,
float *stack,
uint fac_offset)
@@ -92,7 +97,10 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg,
sd->P += dP;
}
-ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_displacement(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint height_offset, midlevel_offset, scale_offset, normal_offset;
svm_unpack_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset);
@@ -119,10 +127,10 @@ ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *
stack_store_float3(stack, node.z, dP);
}
-ccl_device void svm_node_vector_displacement(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_vector_displacement(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
uint space = data_node.x;
uint vector_offset, midlevel_offset, scale_offset, displacement_offset;
@@ -164,6 +172,7 @@ ccl_device void svm_node_vector_displacement(
}
stack_store_float3(stack, displacement_offset, dP);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 96d602e35bf..b5ecdbe2abf 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Fresnel Node */
-ccl_device void svm_node_fresnel(
+ccl_device_noinline void svm_node_fresnel(
ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node)
{
uint normal_offset, out_offset;
@@ -37,7 +37,7 @@ ccl_device void svm_node_fresnel(
/* Layer Weight Node */
-ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
{
uint blend_offset = node.y;
uint blend_value = node.z;
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index 65eb08eb0eb..f6fafdee941 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_gamma(
+ccl_device_noinline void svm_node_gamma(
ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color)
{
float3 color = stack_load_float3(stack, in_color);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index e48e96dcfa4..10e9f291d0e 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Geometry Node */
-ccl_device_inline void svm_node_geometry(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float3 data;
@@ -51,8 +51,8 @@ ccl_device_inline void svm_node_geometry(
stack_store_float3(stack, out_offset, data);
}
-ccl_device void svm_node_geometry_bump_dx(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -75,8 +75,8 @@ ccl_device void svm_node_geometry_bump_dx(
#endif
}
-ccl_device void svm_node_geometry_bump_dy(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -101,8 +101,8 @@ ccl_device void svm_node_geometry_bump_dy(
/* Object Info */
-ccl_device void svm_node_object_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_object_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
@@ -140,8 +140,8 @@ ccl_device void svm_node_object_info(
/* Particle Info */
-ccl_device void svm_node_particle_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_particle_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
switch (type) {
case NODE_INFO_PAR_INDEX: {
@@ -199,8 +199,8 @@ ccl_device void svm_node_particle_info(
/* Hair Info */
-ccl_device void svm_node_hair_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_hair_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
float3 data3;
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index 08304bc47e8..cd15f7097e7 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -60,7 +60,7 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type)
return 0.0f;
}
-ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
{
uint type, co_offset, color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index c299cf58c7f..6f49a8385aa 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -19,8 +19,10 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_hsv(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_color_offset, fac_offset, out_color_offset;
uint hue_offset, sat_offset, val_offset;
diff --git a/intern/cycles/kernel/svm/svm_ies.h b/intern/cycles/kernel/svm/svm_ies.h
index 56c804b44d0..9c13734ecf0 100644
--- a/intern/cycles/kernel/svm/svm_ies.h
+++ b/intern/cycles/kernel/svm/svm_ies.h
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
/* IES Light */
ccl_device_inline float interpolate_ies_vertical(
- KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
+ const KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
{
/* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end
* of v (corresponding to the north pole) would result in artifacts. The proper way of dealing
@@ -39,7 +39,7 @@ ccl_device_inline float interpolate_ies_vertical(
return cubic_interp(a, b, c, d, v_frac);
}
-ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
+ccl_device_inline float kernel_ies_interp(const KernelGlobals *kg,
int slot,
float h_angle,
float v_angle)
@@ -98,8 +98,10 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
return max(cubic_interp(a, b, c, d, h_frac), 0.0f);
}
-ccl_device void svm_node_ies(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_ies(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint vector_offset, strength_offset, fac_offset, slot = node.z;
svm_unpack_node_uchar3(node.y, &strength_offset, &vector_offset, &fac_offset);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 9348ddabde5..a344f36977a 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags)
+ccl_device float4 svm_image_texture(const KernelGlobals *kg, int id, float x, float y, uint flags)
{
if (id == -1) {
return make_float4(
@@ -44,8 +44,8 @@ ccl_device_inline float3 texco_remap_square(float3 co)
return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
}
-ccl_device void svm_node_tex_image(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_image(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, out_offset, alpha_offset, flags;
@@ -71,7 +71,7 @@ ccl_device void svm_node_tex_image(
int num_nodes = (int)node.y;
if (num_nodes > 0) {
/* Remember the offset of the node following the tile nodes. */
- int next_offset = (*offset) + num_nodes;
+ int next_offset = offset + num_nodes;
/* Find the tile that the UV lies in. */
int tx = (int)tex_co.x;
@@ -83,7 +83,7 @@ ccl_device void svm_node_tex_image(
/* Find the index of the tile. */
for (int i = 0; i < num_nodes; i++) {
- uint4 tile_node = read_node(kg, offset);
+ uint4 tile_node = read_node(kg, &offset);
if (tile_node.x == tile) {
id = tile_node.y;
break;
@@ -102,7 +102,7 @@ ccl_device void svm_node_tex_image(
}
/* Skip over the remaining nodes. */
- *offset = next_offset;
+ offset = next_offset;
}
else {
id = -num_nodes;
@@ -114,9 +114,13 @@ ccl_device void svm_node_tex_image(
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, f.w);
+ return offset;
}
-ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_image_box(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
/* get object space normal */
float3 N = sd->N;
@@ -215,10 +219,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
stack_store_float(stack, alpha_offset, f.w);
}
-ccl_device void svm_node_tex_environment(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_tex_environment(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint id = node.y;
uint co_offset, out_offset, alpha_offset, flags;
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index 02024742b13..27cdaaff473 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -21,7 +21,7 @@ ccl_device float invert(float color, float factor)
return factor * (1.0f - color) + (1.0f - factor) * color;
}
-ccl_device void svm_node_invert(
+ccl_device_noinline void svm_node_invert(
ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color)
{
float factor = stack_load_float(stack, in_fac);
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 768c65918cd..49fabad1cc5 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Light Path Node */
-ccl_device void svm_node_light_path(ShaderData *sd,
- ccl_addr_space PathState *state,
- float *stack,
- uint type,
- uint out_offset,
- int path_flag)
+ccl_device_noinline void svm_node_light_path(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *sd,
+ float *stack,
+ uint type,
+ uint out_offset,
+ int path_flag)
{
float info = 0.0f;
@@ -58,21 +58,47 @@ ccl_device void svm_node_light_path(ShaderData *sd,
case NODE_LP_ray_length:
info = sd->ray_length;
break;
- case NODE_LP_ray_depth:
- info = (float)state->bounce;
+ case NODE_LP_ray_depth: {
+ /* Read bounce from a different location depending on whether this is a
+ * shadow path. It's a bit dubious to have integrator state details leak into
+ * this function, but hard to avoid currently. */
+ int bounce = (INTEGRATOR_STATE_IS_NULL) ? 0 :
+ (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(shadow_path, bounce) :
+ INTEGRATOR_STATE(path, bounce);
+
+ /* For background, light emission and shadow evaluation from a surface or
+ * volume we are effectively one bounce further. */
+ if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+ bounce++;
+ }
+
+ info = (float)bounce;
break;
+ }
+ /* TODO */
+ case NODE_LP_ray_transparent: {
+ const int bounce = (INTEGRATOR_STATE_IS_NULL) ?
+ 0 :
+ (path_flag & PATH_RAY_SHADOW) ?
+ INTEGRATOR_STATE(shadow_path, transparent_bounce) :
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ info = (float)bounce;
+ break;
+ }
+#if 0
case NODE_LP_ray_diffuse:
info = (float)state->diffuse_bounce;
break;
case NODE_LP_ray_glossy:
info = (float)state->glossy_bounce;
break;
- case NODE_LP_ray_transparent:
- info = (float)state->transparent_bounce;
- break;
+#endif
+#if 0
case NODE_LP_ray_transmission:
info = (float)state->transmission_bounce;
break;
+#endif
}
stack_store_float(stack, out_offset, info);
@@ -80,7 +106,7 @@ ccl_device void svm_node_light_path(ShaderData *sd,
/* Light Falloff Node */
-ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
{
uint strength_offset, out_offset, smooth_offset;
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index 9c160e6d8cc..8784c760860 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -87,8 +87,8 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
return make_float3(0.5f - x, 0.5f - y, 0.5f - z);
}
-ccl_device void svm_node_tex_magic(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_magic(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint depth;
uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset;
@@ -96,7 +96,7 @@ ccl_device void svm_node_tex_magic(
svm_unpack_node_uchar3(node.y, &depth, &color_offset, &fac_offset);
svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset);
- uint4 node2 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_offset);
float scale = stack_load_float_default(stack, scale_offset, node2.x);
float distortion = stack_load_float_default(stack, distortion_offset, node2.y);
@@ -107,6 +107,7 @@ ccl_device void svm_node_tex_magic(
stack_store_float(stack, fac_offset, average(color));
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, color);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_map_range.h b/intern/cycles/kernel/svm/svm_map_range.h
index 533a631c837..c8684981e31 100644
--- a/intern/cycles/kernel/svm/svm_map_range.h
+++ b/intern/cycles/kernel/svm/svm_map_range.h
@@ -24,13 +24,13 @@ ccl_device_inline float smootherstep(float edge0, float edge1, float x)
return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f);
}
-ccl_device void svm_node_map_range(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint results_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_map_range(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint results_stack_offsets,
+ int offset)
{
uint from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset;
uint type_stack_offset, steps_stack_offset, result_stack_offset;
@@ -42,8 +42,8 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
svm_unpack_node_uchar3(
results_stack_offsets, &type_stack_offset, &steps_stack_offset, &result_stack_offset);
- uint4 defaults = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float from_min = stack_load_float_default(stack, from_min_stack_offset, defaults.x);
@@ -83,6 +83,7 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
result = 0.0f;
}
stack_store_float(stack, result_stack_offset, result);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index 6e19c859e19..fcc724405f5 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -18,13 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Mapping Node */
-ccl_device void svm_node_mapping(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset;
svm_unpack_node_uchar4(inputs_stack_offsets,
@@ -44,30 +43,40 @@ ccl_device void svm_node_mapping(KernelGlobals *kg,
/* Texture Mapping */
-ccl_device void svm_node_texture_mapping(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_texture_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
float3 r = transform_point(&tfm, v);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
-ccl_device void svm_node_min_max(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_min_max(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
- float3 mn = float4_to_float3(read_node_float(kg, offset));
- float3 mx = float4_to_float3(read_node_float(kg, offset));
+ float3 mn = float4_to_float3(read_node_float(kg, &offset));
+ float3 mx = float4_to_float3(read_node_float(kg, &offset));
float3 r = min(max(mn, v), mx);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 733ea28f9e5..99e7a8f2bda 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint a_stack_offset, b_stack_offset, c_stack_offset;
svm_unpack_node_uchar3(inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &c_stack_offset);
@@ -35,13 +34,13 @@ ccl_device void svm_node_math(KernelGlobals *kg,
stack_store_float(stack, result_stack_offset, result);
}
-ccl_device void svm_node_vector_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint outputs_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_vector_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint outputs_stack_offsets,
+ int offset)
{
uint value_stack_offset, vector_stack_offset;
uint a_stack_offset, b_stack_offset, param1_stack_offset;
@@ -60,7 +59,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
/* 3 Vector Operators */
if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD ||
type == NODE_VECTOR_MATH_MULTIPLY_ADD) {
- uint4 extra_node = read_node(kg, offset);
+ uint4 extra_node = read_node(kg, &offset);
c = stack_load_float3(stack, extra_node.x);
}
@@ -70,6 +69,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
stack_store_float(stack, value_stack_offset, value);
if (stack_valid(vector_stack_offset))
stack_store_float3(stack, vector_stack_offset, vector);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index 15114bfd5e4..3e38080977f 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -18,16 +18,16 @@ CCL_NAMESPACE_BEGIN
/* Node */
-ccl_device void svm_node_mix(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint fac_offset,
- uint c1_offset,
- uint c2_offset,
- int *offset)
+ccl_device_noinline int svm_node_mix(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint fac_offset,
+ uint c1_offset,
+ uint c2_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float fac = stack_load_float(stack, fac_offset);
float3 c1 = stack_load_float3(stack, c1_offset);
@@ -35,6 +35,7 @@ ccl_device void svm_node_mix(KernelGlobals *kg,
float3 result = svm_mix((NodeMix)node1.y, fac, c1, c2);
stack_store_float3(stack, node1.z, result);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 571f62fe27f..03a8b68b3ef 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -700,13 +700,13 @@ ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_4d(
return value;
}
-ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offsets1,
- uint offsets2,
- uint offsets3,
- int *offset)
+ccl_device_noinline int svm_node_tex_musgrave(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offsets1,
+ uint offsets2,
+ uint offsets3,
+ int offset)
{
uint type, dimensions, co_stack_offset, w_stack_offset;
uint scale_stack_offset, detail_stack_offset, dimension_stack_offset, lacunarity_stack_offset;
@@ -720,8 +720,8 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
&lacunarity_stack_offset);
svm_unpack_node_uchar3(offsets3, &offset_stack_offset, &gain_stack_offset, &fac_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -844,6 +844,7 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
}
stack_store_float(stack, fac_stack_offset, fac);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 94d8bfde555..ecb4df6afdf 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -330,7 +330,7 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_2d(float x, float y)
+ccl_device_noinline_cpu float perlin_2d(float x, float y)
{
ssei XY;
ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
@@ -447,7 +447,7 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
* v7 (1, 1, 1)
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -501,7 +501,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
@@ -585,7 +585,7 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -637,7 +637,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 61fd9553802..29b262ac06e 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -140,13 +140,13 @@ ccl_device void noise_texture_4d(float4 co,
}
}
-ccl_device void svm_node_tex_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint offsets1,
- uint offsets2,
- int *offset)
+ccl_device_noinline int svm_node_tex_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint offsets1,
+ uint offsets2,
+ int offset)
{
uint vector_stack_offset, w_stack_offset, scale_stack_offset;
uint detail_stack_offset, roughness_stack_offset, distortion_stack_offset;
@@ -160,8 +160,8 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
&value_stack_offset,
&color_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 vector = stack_load_float3(stack, vector_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -212,6 +212,7 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
if (stack_valid(color_stack_offset)) {
stack_store_float3(stack, color_stack_offset, color);
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index 4cd3eab0ed2..724b5f281f9 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -16,16 +16,16 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_normal(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint in_normal_offset,
- uint out_normal_offset,
- uint out_dot_offset,
- int *offset)
+ccl_device_noinline int svm_node_normal(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint in_normal_offset,
+ uint out_normal_offset,
+ uint out_dot_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 normal = stack_load_float3(stack, in_normal_offset);
float3 direction;
@@ -39,6 +39,7 @@ ccl_device void svm_node_normal(KernelGlobals *kg,
if (stack_valid(out_dot_offset))
stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal)));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 85ccf39144b..e92df3c093c 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -21,8 +21,12 @@ CCL_NAMESPACE_BEGIN
/* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
-ccl_device_inline float4 rgb_ramp_lookup(
- KernelGlobals *kg, int offset, float f, bool interpolate, bool extrapolate, int table_size)
+ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg,
+ int offset,
+ float f,
+ bool interpolate,
+ bool extrapolate,
+ int table_size)
{
if ((f < 0.0f || f > 1.0f) && extrapolate) {
float4 t0, dy;
@@ -53,34 +57,35 @@ ccl_device_inline float4 rgb_ramp_lookup(
return a;
}
-ccl_device void svm_node_rgb_ramp(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_rgb_ramp(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, alpha_offset;
uint interpolate = node.z;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &alpha_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
- float4 color = rgb_ramp_lookup(kg, *offset, fac, interpolate, false, table_size);
+ float4 color = rgb_ramp_lookup(kg, offset, fac, interpolate, false, table_size);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, float4_to_float3(color));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, color.w);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
-ccl_device void svm_node_curves(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_curves(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, out_offset;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &out_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
float3 color = stack_load_float3(stack, color_offset);
@@ -89,14 +94,15 @@ ccl_device void svm_node_curves(
const float range_x = max_x - min_x;
const float3 relpos = (color - make_float3(min_x, min_x, min_x)) / range_x;
- float r = rgb_ramp_lookup(kg, *offset, relpos.x, true, true, table_size).x;
- float g = rgb_ramp_lookup(kg, *offset, relpos.y, true, true, table_size).y;
- float b = rgb_ramp_lookup(kg, *offset, relpos.z, true, true, table_size).z;
+ float r = rgb_ramp_lookup(kg, offset, relpos.x, true, true, table_size).x;
+ float g = rgb_ramp_lookup(kg, offset, relpos.y, true, true, table_size).y;
+ float b = rgb_ramp_lookup(kg, offset, relpos.z, true, true, table_size).z;
color = (1.0f - fac) * color + fac * make_float3(r, g, b);
stack_store_float3(stack, out_offset, color);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index f501252062e..8d52845ea3d 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -16,15 +16,15 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint hue_in,
- uint saturation_in,
- uint value_in,
- int *offset)
+ccl_device_noinline int svm_node_combine_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint hue_in,
+ uint saturation_in,
+ uint value_in,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint color_out = node1.y;
float hue = stack_load_float(stack, hue_in);
@@ -36,17 +36,18 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
if (stack_valid(color_out))
stack_store_float3(stack, color_out, color);
+ return offset;
}
-ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint color_in,
- uint hue_out,
- uint saturation_out,
- int *offset)
+ccl_device_noinline int svm_node_separate_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint color_in,
+ uint hue_out,
+ uint saturation_out,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint value_out = node1.y;
float3 color = stack_load_float3(stack, color_in);
@@ -60,6 +61,7 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
stack_store_float(stack, saturation_out, color.y);
if (stack_valid(value_out))
stack_store_float(stack, value_out, color.z);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index b908732f026..b77c4311e72 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -37,7 +37,7 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma)
(1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma);
}
-ccl_device float3 sky_radiance_preetham(KernelGlobals *kg,
+ccl_device float3 sky_radiance_preetham(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -90,7 +90,7 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float
configuration[6] * mieM + configuration[7] * zenith);
}
-ccl_device float3 sky_radiance_hosek(KernelGlobals *kg,
+ccl_device float3 sky_radiance_hosek(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -127,7 +127,7 @@ ccl_device float3 geographical_to_direction(float lat, float lon)
return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat));
}
-ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
+ccl_device float3 sky_radiance_nishita(const KernelGlobals *kg,
float3 dir,
float *nishita_data,
uint texture_id)
@@ -209,8 +209,8 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
return xyz_to_rgb(kg, xyz);
}
-ccl_device void svm_node_tex_sky(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_sky(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
/* Load data */
uint dir_offset = node.y;
@@ -226,49 +226,49 @@ ccl_device void svm_node_tex_sky(
float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
float config_x[9], config_y[9], config_z[9];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
sunphi = data.x;
suntheta = data.y;
radiance_x = data.z;
radiance_y = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
radiance_z = data.x;
config_x[0] = data.y;
config_x[1] = data.z;
config_x[2] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[3] = data.x;
config_x[4] = data.y;
config_x[5] = data.z;
config_x[6] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[7] = data.x;
config_x[8] = data.y;
config_y[0] = data.z;
config_y[1] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[2] = data.x;
config_y[3] = data.y;
config_y[4] = data.z;
config_y[5] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[6] = data.x;
config_y[7] = data.y;
config_y[8] = data.z;
config_z[0] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[1] = data.x;
config_z[2] = data.y;
config_z[3] = data.z;
config_z[4] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[5] = data.x;
config_z[6] = data.y;
config_z[7] = data.z;
@@ -305,19 +305,19 @@ ccl_device void svm_node_tex_sky(
/* Define variables */
float nishita_data[10];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
nishita_data[0] = data.x;
nishita_data[1] = data.y;
nishita_data[2] = data.z;
nishita_data[3] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[4] = data.x;
nishita_data[5] = data.y;
nishita_data[6] = data.z;
nishita_data[7] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[8] = data.x;
nishita_data[9] = data.y;
uint texture_id = __float_as_uint(data.z);
@@ -327,6 +327,7 @@ ccl_device void svm_node_tex_sky(
}
stack_store_float3(stack, out_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 46600551cc4..a35253080da 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -14,12 +14,16 @@
* limitations under the License.
*/
+#include "kernel/geom/geom.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Texture Coordinate Node */
-ccl_device void svm_node_tex_coord(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
float3 data;
uint type = node.y;
@@ -35,9 +39,9 @@ ccl_device void svm_node_tex_coord(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -92,10 +96,11 @@ ccl_device void svm_node_tex_coord(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
}
-ccl_device void svm_node_tex_coord_bump_dx(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -112,9 +117,9 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -136,7 +141,7 @@ ccl_device void svm_node_tex_coord_bump_dx(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
data.z = 0.0f;
@@ -169,13 +174,14 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_tex_coord_bump_dy(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -192,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -216,7 +222,7 @@ ccl_device void svm_node_tex_coord_bump_dy(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
data.z = 0.0f;
@@ -249,12 +255,16 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_normal_map(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset, strength_offset, normal_offset, space;
svm_unpack_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space);
@@ -346,7 +356,10 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
stack_store_float3(stack, normal_offset, N);
}
-ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tangent(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint tangent_offset, direction_type, axis;
svm_unpack_node_uchar3(node.y, &tangent_offset, &direction_type, &axis);
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 062afcfa5ac..c053be96c51 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -30,37 +30,6 @@ CCL_NAMESPACE_BEGIN
/* Nodes */
-/* Known frequencies of used nodes, used for selective nodes compilation
- * in the kernel. Currently only affects split OpenCL kernel.
- *
- * Keep as defines so it's easy to check which nodes are to be compiled
- * from preprocessor.
- *
- * Lower the number of group more often the node is used.
- */
-#define NODE_GROUP_LEVEL_0 0
-#define NODE_GROUP_LEVEL_1 1
-#define NODE_GROUP_LEVEL_2 2
-#define NODE_GROUP_LEVEL_3 3
-#define NODE_GROUP_LEVEL_4 4
-#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4
-
-#define NODE_FEATURE_VOLUME (1 << 0)
-#define NODE_FEATURE_HAIR (1 << 1)
-#define NODE_FEATURE_BUMP (1 << 2)
-#define NODE_FEATURE_BUMP_STATE (1 << 3)
-#define NODE_FEATURE_VORONOI_EXTRA (1 << 4)
-/* TODO(sergey): Consider using something like ((uint)(-1)).
- * Need to check carefully operand types around usage of this
- * define first.
- */
-#define NODE_FEATURE_ALL \
- (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE | \
- NODE_FEATURE_VORONOI_EXTRA)
-
-#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
-#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
-
typedef enum ShaderNodeType {
NODE_END = 0,
NODE_SHADER_JUMP,
@@ -572,12 +541,8 @@ typedef enum ClosureType {
CLOSURE_BSDF_TRANSPARENT_ID,
/* BSSRDF */
- CLOSURE_BSSRDF_CUBIC_ID,
- CLOSURE_BSSRDF_GAUSSIAN_ID,
- CLOSURE_BSSRDF_PRINCIPLED_ID,
- CLOSURE_BSSRDF_BURLEY_ID,
CLOSURE_BSSRDF_RANDOM_WALK_ID,
- CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID,
+ CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID,
/* Other */
CLOSURE_HOLDOUT_ID,
@@ -620,11 +585,9 @@ typedef enum ClosureType {
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
-#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
-#define CLOSURE_IS_DISK_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
+ (type >= CLOSURE_BSSRDF_RANDOM_WALK_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_VOLUME(type) \
(type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
#define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h
index 5b76f2c8832..d0478660094 100644
--- a/intern/cycles/kernel/svm/svm_value.h
+++ b/intern/cycles/kernel/svm/svm_value.h
@@ -19,20 +19,21 @@ CCL_NAMESPACE_BEGIN
/* Value Nodes */
ccl_device void svm_node_value_f(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
{
stack_store_float(stack, out_offset, __uint_as_float(ivalue));
}
-ccl_device void svm_node_value_v(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset)
+ccl_device int svm_node_value_v(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 p = make_float3(
__uint_as_float(node1.y), __uint_as_float(node1.z), __uint_as_float(node1.w));
stack_store_float3(stack, out_offset, p);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h
index 50045752484..55e1bce0158 100644
--- a/intern/cycles/kernel/svm/svm_vector_rotate.h
+++ b/intern/cycles/kernel/svm/svm_vector_rotate.h
@@ -18,11 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Vector Rotate */
-ccl_device void svm_node_vector_rotate(ShaderData *sd,
- float *stack,
- uint input_stack_offsets,
- uint axis_stack_offsets,
- uint result_stack_offset)
+ccl_device_noinline void svm_node_vector_rotate(ShaderData *sd,
+ float *stack,
+ uint input_stack_offsets,
+ uint axis_stack_offsets,
+ uint result_stack_offset)
{
uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset,
angle_stack_offset, invert;
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 1e95492cf1b..8aedb7e0f54 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Vector Transform */
-ccl_device void svm_node_vector_transform(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_vector_transform(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint itype, ifrom, ito;
uint vector_in, vector_out;
diff --git a/intern/cycles/kernel/svm/svm_vertex_color.h b/intern/cycles/kernel/svm/svm_vertex_color.h
index 0aa45835522..986ea244f3a 100644
--- a/intern/cycles/kernel/svm/svm_vertex_color.h
+++ b/intern/cycles/kernel/svm/svm_vertex_color.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_vertex_color(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -35,18 +35,12 @@ ccl_device void svm_node_vertex_color(KernelGlobals *kg,
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dx(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -62,18 +56,12 @@ ccl_device_noinline
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dy(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index d0e7db35fab..b1d2eff7f37 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -902,16 +902,17 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float
*outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f;
}
-ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint feature,
- uint metric,
- int *offset)
+template<uint node_feature_mask>
+ccl_device_noinline int svm_node_tex_voronoi(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint feature,
+ uint metric,
+ int offset)
{
- uint4 stack_offsets = read_node(kg, offset);
- uint4 defaults = read_node(kg, offset);
+ uint4 stack_offsets = read_node(kg, &offset);
+ uint4 defaults = read_node(kg, &offset);
uint coord_stack_offset, w_stack_offset, scale_stack_offset, smoothness_stack_offset;
uint exponent_stack_offset, randomness_stack_offset, distance_out_stack_offset,
@@ -997,18 +998,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out_2d);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_2d(coord_2d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_2d);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_2d(coord_2d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_2d);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_2d(coord_2d,
exponent,
@@ -1042,18 +1043,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_3d(coord,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_3d(coord,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_3d(coord,
exponent,
@@ -1076,54 +1077,54 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
break;
}
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case 4: {
- float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
- float4 position_out_4d;
- switch (voronoi_feature) {
- case NODE_VORONOI_F1:
- voronoi_f1_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_4d(coord_4d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_F2:
- voronoi_f2_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_DISTANCE_TO_EDGE:
- voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
- break;
- case NODE_VORONOI_N_SPHERE_RADIUS:
- voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
- break;
- default:
- kernel_assert(0);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
+ float4 position_out_4d;
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_4d(coord_4d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_F2:
+ voronoi_f2_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ position_out_4d = safe_divide_float4_float(position_out_4d, scale);
+ position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
+ w_out = position_out_4d.w;
}
- position_out_4d = safe_divide_float4_float(position_out_4d, scale);
- position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
- w_out = position_out_4d.w;
break;
}
-#endif
default:
kernel_assert(0);
}
@@ -1138,6 +1139,7 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
stack_store_float(stack, w_out_stack_offset, w_out);
if (stack_valid(radius_out_stack_offset))
stack_store_float(stack, radius_out_stack_offset, radius_out);
+ return offset;
}
CCL_NAMESPACE_END
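
The Voronoi node replaces the preprocessor-level `NODES_FEATURE()` guards with a `node_feature_mask` template parameter tested through `KERNEL_NODES_FEATURE()`. The sketch below shows the general shape of such a compile-time guard; the macro and flag definitions use EXAMPLE_ names because they are assumptions written for illustration, not the actual kernel headers.

/* Hypothetical stand-ins for the real feature-mask definitions. */
#define EXAMPLE_FEATURE_NODE_VORONOI_EXTRA (1U << 4)
#define EXAMPLE_NODES_FEATURE(feature) \
  ((node_feature_mask & (EXAMPLE_FEATURE_NODE_##feature)) != 0)

/* Because node_feature_mask is a compile-time template argument, the branch
 * below folds to a constant and the unused path is eliminated from kernels
 * built without the feature, much like the removed #if guards did. */
template<uint node_feature_mask> ccl_device int example_voronoi_dimensions()
{
  if (EXAMPLE_NODES_FEATURE(VORONOI_EXTRA)) {
    return 4; /* 4D and Smooth F1 variants compiled in */
  }
  return 3;
}
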
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 4bc14f82382..78b75405356 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -19,8 +19,8 @@ CCL_NAMESPACE_BEGIN
/* TODO(sergey): Think of making it more generic volume-type attribute
* sampler.
*/
-ccl_device void svm_node_tex_voxel(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_voxel(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, density_out_offset, color_out_offset, space;
svm_unpack_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
@@ -33,9 +33,9 @@ ccl_device void svm_node_tex_voxel(
else {
kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
co = transform_point(&tfm, co);
}
@@ -47,6 +47,7 @@ ccl_device void svm_node_tex_voxel(
stack_store_float(stack, density_out_offset, r.w);
if (stack_valid(color_out_offset))
stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index c4763475b47..00f980c16df 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -82,11 +82,11 @@ ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
}
}
-ccl_device void svm_node_tex_wave(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_wave(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
/* RNA properties */
uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset;
@@ -125,6 +125,7 @@ ccl_device void svm_node_tex_wave(
stack_store_float(stack, fac_offset, f);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, make_float3(f, f, f));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index d6144802559..fba8aa63d31 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -69,8 +69,8 @@ ccl_static_constant float cie_colour_match[81][3] = {
{0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
{0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}};
-ccl_device void svm_node_wavelength(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
+ccl_device_noinline void svm_node_wavelength(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
{
float lambda_nm = stack_load_float(stack, wavelength);
float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80
diff --git a/intern/cycles/kernel/svm/svm_white_noise.h b/intern/cycles/kernel/svm/svm_white_noise.h
index b30d85acaec..0306d2e7b9c 100644
--- a/intern/cycles/kernel/svm/svm_white_noise.h
+++ b/intern/cycles/kernel/svm/svm_white_noise.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_tex_white_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint inputs_stack_offsets,
- uint ouptuts_stack_offsets,
- int *offset)
+ccl_device_noinline void svm_node_tex_white_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint inputs_stack_offsets,
+ uint ouptuts_stack_offsets)
{
uint vector_stack_offset, w_stack_offset, value_stack_offset, color_stack_offset;
svm_unpack_node_uchar2(inputs_stack_offsets, &vector_stack_offset, &w_stack_offset);
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 49158bd86d5..7ec913789d2 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -35,7 +35,7 @@ CCL_NAMESPACE_BEGIN
/* Wireframe Node */
ccl_device_inline float wireframe(
- KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
+ const KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
{
#ifdef __HAIR__
if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
@@ -88,7 +88,10 @@ ccl_device_inline float wireframe(
return 0.0f;
}
-ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_wireframe(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_size = node.y;
uint out_fac = node.z;
@@ -100,18 +103,7 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
int pixel_size = (int)use_pixel_size;
/* Calculate wireframe */
-#ifdef __SPLIT_KERNEL__
- /* TODO(sergey): This is because sd is actually a global space,
- * which makes it difficult to re-use same wireframe() function.
- *
- * With OpenCL 2.0 it's possible to avoid this change, but for until
- * then we'll be living with such an exception.
- */
- float3 P = sd->P;
- float f = wireframe(kg, sd, size, pixel_size, &P);
-#else
float f = wireframe(kg, sd, size, pixel_size, &sd->P);
-#endif
/* TODO(sergey): Think of faster way to calculate derivatives. */
if (bump_offset == NODE_BUMP_OFFSET_DX) {
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index feead27c5ca..6edb5261b32 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -32,10 +32,10 @@ set(SRC
camera.cpp
colorspace.cpp
constant_fold.cpp
- coverage.cpp
denoising.cpp
film.cpp
geometry.cpp
+ gpu_display.cpp
graph.cpp
hair.cpp
image.cpp
@@ -54,6 +54,7 @@ set(SRC
object.cpp
osl.cpp
particles.cpp
+ pass.cpp
curves.cpp
scene.cpp
session.cpp
@@ -76,10 +77,10 @@ set(SRC_HEADERS
camera.h
colorspace.h
constant_fold.h
- coverage.h
denoising.h
film.h
geometry.h
+ gpu_display.h
graph.h
hair.h
image.h
@@ -95,6 +96,7 @@ set(SRC_HEADERS
object.h
osl.h
particles.h
+ pass.h
procedural.h
curves.h
scene.h
@@ -111,6 +113,7 @@ set(SRC_HEADERS
set(LIB
cycles_bvh
cycles_device
+ cycles_integrator
cycles_subd
cycles_util
)
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index b925e755434..ae6290ac27b 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -34,11 +34,7 @@ NODE_DEFINE(Background)
{
NodeType *type = NodeType::add("background", create);
- SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f);
- SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX);
-
SOCKET_BOOLEAN(use_shader, "Use Shader", true);
- SOCKET_BOOLEAN(use_ao, "Use AO", false);
SOCKET_UINT(visibility, "Visibility", PATH_RAY_ALL_VISIBILITY);
SOCKET_BOOLEAN(transparent, "Transparent", false);
@@ -80,10 +76,6 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
/* set shader index and transparent option */
KernelBackground *kbackground = &dscene->data.background;
- kbackground->ao_factor = (use_ao) ? ao_factor : 0.0f;
- kbackground->ao_bounces_factor = ao_factor;
- kbackground->ao_distance = ao_distance;
-
kbackground->transparent = transparent;
kbackground->surface_shader = scene->shader_manager->get_shader_id(bg_shader);
@@ -138,10 +130,6 @@ void Background::tag_update(Scene *scene)
* and to avoid doing unnecessary updates anywhere else. */
tag_use_shader_modified();
}
-
- if (ao_factor_is_modified() || use_ao_is_modified()) {
- scene->integrator->tag_update(scene, Integrator::BACKGROUND_AO_MODIFIED);
- }
}
Shader *Background::get_shader(const Scene *scene)
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index e89ffbc2445..2f7ef0f7737 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -32,11 +32,7 @@ class Background : public Node {
public:
NODE_DECLARE
- NODE_SOCKET_API(float, ao_factor)
- NODE_SOCKET_API(float, ao_distance)
-
NODE_SOCKET_API(bool, use_shader)
- NODE_SOCKET_API(bool, use_ao)
NODE_SOCKET_API(uint, visibility)
NODE_SOCKET_API(Shader *, shader)
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 317a3937cab..54e496caed6 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -26,58 +26,8 @@
CCL_NAMESPACE_BEGIN
-static int aa_samples(Scene *scene, Object *object, ShaderEvalType type)
-{
- if (type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) {
- return 1;
- }
- else if (type == SHADER_EVAL_NORMAL) {
- /* Only antialias normal if mesh has bump mapping. */
- if (object->get_geometry()) {
- foreach (Node *node, object->get_geometry()->get_used_shaders()) {
- Shader *shader = static_cast<Shader *>(node);
- if (shader->has_bump) {
- return scene->integrator->get_aa_samples();
- }
- }
- }
-
- return 1;
- }
- else {
- return scene->integrator->get_aa_samples();
- }
-}
-
-/* Keep it synced with kernel_bake.h logic */
-static int shader_type_to_pass_filter(ShaderEvalType type, int pass_filter)
-{
- const int component_flags = pass_filter &
- (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_COLOR);
-
- switch (type) {
- case SHADER_EVAL_AO:
- return BAKE_FILTER_AO;
- case SHADER_EVAL_SHADOW:
- return BAKE_FILTER_DIRECT;
- case SHADER_EVAL_DIFFUSE:
- return BAKE_FILTER_DIFFUSE | component_flags;
- case SHADER_EVAL_GLOSSY:
- return BAKE_FILTER_GLOSSY | component_flags;
- case SHADER_EVAL_TRANSMISSION:
- return BAKE_FILTER_TRANSMISSION | component_flags;
- case SHADER_EVAL_COMBINED:
- return pass_filter;
- default:
- return 0;
- }
-}
-
BakeManager::BakeManager()
{
- type = SHADER_EVAL_BAKE;
- pass_filter = 0;
-
need_update_ = true;
}
@@ -85,32 +35,14 @@ BakeManager::~BakeManager()
{
}
-bool BakeManager::get_baking()
+bool BakeManager::get_baking() const
{
return !object_name.empty();
}
-void BakeManager::set(Scene *scene,
- const std::string &object_name_,
- ShaderEvalType type_,
- int pass_filter_)
+void BakeManager::set(Scene *scene, const std::string &object_name_)
{
object_name = object_name_;
- type = type_;
- pass_filter = shader_type_to_pass_filter(type_, pass_filter_);
-
- Pass::add(PASS_BAKE_PRIMITIVE, scene->passes);
- Pass::add(PASS_BAKE_DIFFERENTIAL, scene->passes);
-
- if (type == SHADER_EVAL_UV) {
- /* force UV to be available */
- Pass::add(PASS_UV, scene->passes);
- }
-
- /* force use_light_pass to be true if we bake more than just colors */
- if (pass_filter & ~BAKE_FILTER_COLOR) {
- Pass::add(PASS_LIGHT, scene->passes);
- }
/* create device and update scene */
scene->film->tag_modified();
@@ -127,29 +59,29 @@ void BakeManager::device_update(Device * /*device*/,
if (!need_update())
return;
- scoped_callback_timer timer([scene](double time) {
- if (scene->update_stats) {
- scene->update_stats->bake.times.add_entry({"device_update", time});
- }
- });
-
- KernelIntegrator *kintegrator = &dscene->data.integrator;
KernelBake *kbake = &dscene->data.bake;
+ memset(kbake, 0, sizeof(*kbake));
- kbake->type = type;
- kbake->pass_filter = pass_filter;
-
- int object_index = 0;
- foreach (Object *object, scene->objects) {
- const Geometry *geom = object->get_geometry();
- if (object->name == object_name && geom->geometry_type == Geometry::MESH) {
- kbake->object_index = object_index;
- kbake->tri_offset = geom->prim_offset;
- kintegrator->aa_samples = aa_samples(scene, object, type);
- break;
- }
+ if (!object_name.empty()) {
+ scoped_callback_timer timer([scene](double time) {
+ if (scene->update_stats) {
+ scene->update_stats->bake.times.add_entry({"device_update", time});
+ }
+ });
+
+ kbake->use = true;
- object_index++;
+ int object_index = 0;
+ foreach (Object *object, scene->objects) {
+ const Geometry *geom = object->get_geometry();
+ if (object->name == object_name && geom->geometry_type == Geometry::MESH) {
+ kbake->object_index = object_index;
+ kbake->tri_offset = geom->prim_offset;
+ break;
+ }
+
+ object_index++;
+ }
}
need_update_ = false;
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 655b9b1cf7e..39e504490c2 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -30,8 +30,8 @@ class BakeManager {
BakeManager();
~BakeManager();
- void set(Scene *scene, const std::string &object_name, ShaderEvalType type, int pass_filter);
- bool get_baking();
+ void set(Scene *scene, const std::string &object_name);
+ bool get_baking() const;
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
void device_free(Device *device, DeviceScene *dscene);
@@ -42,8 +42,6 @@ class BakeManager {
private:
bool need_update_;
- ShaderEvalType type;
- int pass_filter;
std::string object_name;
};
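
With the bake-specific shader type and pass filter gone, baking is requested purely by object name and pass setup goes through the regular film/pass machinery. A hypothetical usage sketch follows, assuming the `Scene::bake_manager` member keeps its existing name; it is not part of this patch.

static void example_setup_bake(Scene *scene, const std::string &baked_object_name)
{
  /* Hypothetical usage of the simplified baking API: only the object name is
   * needed; pass filters and shader eval types are no longer stored here. */
  scene->bake_manager->set(scene, baked_object_name);

  if (scene->bake_manager->get_baking()) {
    /* Scene update and rendering then proceed as for a regular render. */
  }
}
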
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fcfad58995e..1cdae3af7f5 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -28,537 +28,334 @@
CCL_NAMESPACE_BEGIN
-/* Buffer Params */
+/* --------------------------------------------------------------------
+ * Convert part information to an index of `BufferParams::pass_offset_`.
+ */
-BufferParams::BufferParams()
+static int pass_type_mode_to_index(PassType pass_type, PassMode mode)
{
- width = 0;
- height = 0;
-
- full_x = 0;
- full_y = 0;
- full_width = 0;
- full_height = 0;
+ int index = static_cast<int>(pass_type) * 2;
- denoising_data_pass = false;
- denoising_clean_pass = false;
- denoising_prefiltered_pass = false;
+ if (mode == PassMode::DENOISED) {
+ ++index;
+ }
- Pass::add(PASS_COMBINED, passes);
+ return index;
}
-void BufferParams::get_offset_stride(int &offset, int &stride)
+static int pass_to_index(const BufferPass &pass)
{
- offset = -(full_x + full_y * width);
- stride = width;
+ return pass_type_mode_to_index(pass.type, pass.mode);
}
-bool BufferParams::modified(const BufferParams &params)
-{
- return !(full_x == params.full_x && full_y == params.full_y && width == params.width &&
- height == params.height && full_width == params.full_width &&
- full_height == params.full_height && Pass::equals(passes, params.passes) &&
- denoising_data_pass == params.denoising_data_pass &&
- denoising_clean_pass == params.denoising_clean_pass &&
- denoising_prefiltered_pass == params.denoising_prefiltered_pass);
-}
+/* --------------------------------------------------------------------
+ * Buffer pass.
+ */
-int BufferParams::get_passes_size()
+NODE_DEFINE(BufferPass)
{
- int size = 0;
+ NodeType *type = NodeType::add("buffer_pass", create);
- for (size_t i = 0; i < passes.size(); i++)
- size += passes[i].components;
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
+ const NodeEnum *pass_mode_enum = Pass::get_mode_enum();
- if (denoising_data_pass) {
- size += DENOISING_PASS_SIZE_BASE;
- if (denoising_clean_pass)
- size += DENOISING_PASS_SIZE_CLEAN;
- if (denoising_prefiltered_pass)
- size += DENOISING_PASS_SIZE_PREFILTERED;
- }
+ SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
+ SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED));
+ SOCKET_STRING(name, "Name", ustring());
+ SOCKET_BOOLEAN(include_albedo, "Include Albedo", false);
- return align_up(size, 4);
-}
+ SOCKET_INT(offset, "Offset", -1);
-int BufferParams::get_denoising_offset()
-{
- int offset = 0;
-
- for (size_t i = 0; i < passes.size(); i++)
- offset += passes[i].components;
-
- return offset;
+ return type;
}
-int BufferParams::get_denoising_prefiltered_offset()
+BufferPass::BufferPass() : Node(get_node_type())
{
- assert(denoising_prefiltered_pass);
-
- int offset = get_denoising_offset();
-
- offset += DENOISING_PASS_SIZE_BASE;
- if (denoising_clean_pass) {
- offset += DENOISING_PASS_SIZE_CLEAN;
- }
-
- return offset;
}
-/* Render Buffer Task */
-
-RenderTile::RenderTile()
+BufferPass::BufferPass(const Pass *scene_pass)
+ : Node(get_node_type()),
+ type(scene_pass->get_type()),
+ mode(scene_pass->get_mode()),
+ name(scene_pass->get_name()),
+ include_albedo(scene_pass->get_include_albedo())
{
- x = 0;
- y = 0;
- w = 0;
- h = 0;
-
- sample = 0;
- start_sample = 0;
- num_samples = 0;
- resolution = 0;
-
- offset = 0;
- stride = 0;
-
- buffer = 0;
-
- buffers = NULL;
- stealing_state = NO_STEALING;
}
-/* Render Buffers */
-
-RenderBuffers::RenderBuffers(Device *device)
- : buffer(device, "RenderBuffers", MEM_READ_WRITE),
- map_neighbor_copied(false),
- render_time(0.0f)
+PassInfo BufferPass::get_info() const
{
+ return Pass::get_info(type, include_albedo);
}
-RenderBuffers::~RenderBuffers()
-{
- buffer.free();
-}
+/* --------------------------------------------------------------------
+ * Buffer Params.
+ */
-void RenderBuffers::reset(BufferParams &params_)
+NODE_DEFINE(BufferParams)
{
- params = params_;
-
- /* re-allocate buffer */
- buffer.alloc(params.width * params.get_passes_size(), params.height);
- buffer.zero_to_device();
+ NodeType *type = NodeType::add("buffer_params", create);
+
+ SOCKET_INT(width, "Width", 0);
+ SOCKET_INT(height, "Height", 0);
+
+ SOCKET_INT(full_x, "Full X", 0);
+ SOCKET_INT(full_y, "Full Y", 0);
+ SOCKET_INT(full_width, "Full Width", 0);
+ SOCKET_INT(full_height, "Full Height", 0);
+
+ SOCKET_STRING(layer, "Layer", ustring());
+ SOCKET_STRING(view, "View", ustring());
+ SOCKET_FLOAT(exposure, "Exposure", 1.0f);
+ SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false);
+ SOCKET_BOOLEAN(use_transparent_background, "Transparent Background", false);
+
+ /* Notes:
+ * - Skip passes since they do not follow the typical container socket definition.
+ * Might look into covering those as a socket in the future.
+ *
+ * - Skip offset, stride, and pass stride since those can be derived from the passes and
+ * the rest of the sockets. */
+
+ return type;
}
-void RenderBuffers::zero()
+BufferParams::BufferParams() : Node(get_node_type())
{
- buffer.zero_to_device();
+ reset_pass_offset();
}
-bool RenderBuffers::copy_from_device()
+void BufferParams::update_passes()
{
- if (!buffer.device_pointer)
- return false;
-
- buffer.copy_from_device(0, params.width * params.get_passes_size(), params.height);
-
- return true;
-}
-
-bool RenderBuffers::get_denoising_pass_rect(
- int type, float exposure, int sample, int components, float *pixels)
-{
- if (buffer.data() == NULL) {
- return false;
- }
-
- float scale = 1.0f;
- float alpha_scale = 1.0f / sample;
- if (type == DENOISING_PASS_PREFILTERED_COLOR || type == DENOISING_PASS_CLEAN ||
- type == DENOISING_PASS_PREFILTERED_INTENSITY) {
- scale *= exposure;
- }
- else if (type == DENOISING_PASS_PREFILTERED_VARIANCE) {
- scale *= exposure * exposure * (sample - 1);
- }
+ update_offset_stride();
+ reset_pass_offset();
+
+ pass_stride = 0;
+ for (const BufferPass &pass : passes) {
+ if (pass.offset != PASS_UNUSED) {
+ const int index = pass_to_index(pass);
+ if (pass_offset_[index] == PASS_UNUSED) {
+ pass_offset_[index] = pass_stride;
+ }
- int offset;
- if (type == DENOISING_PASS_CLEAN) {
- /* The clean pass isn't changed by prefiltering, so we use the original one there. */
- offset = type + params.get_denoising_offset();
- scale /= sample;
- }
- else if (params.denoising_prefiltered_pass) {
- offset = type + params.get_denoising_prefiltered_offset();
- }
- else {
- switch (type) {
- case DENOISING_PASS_PREFILTERED_DEPTH:
- offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH;
- break;
- case DENOISING_PASS_PREFILTERED_NORMAL:
- offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL;
- break;
- case DENOISING_PASS_PREFILTERED_ALBEDO:
- offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO;
- break;
- case DENOISING_PASS_PREFILTERED_COLOR:
- /* If we're not saving the prefiltering result, return the original noisy pass. */
- offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
- break;
- default:
- return false;
+ pass_stride += pass.get_info().num_components;
}
- scale /= sample;
}
+}
- int pass_stride = params.get_passes_size();
- int size = params.width * params.height;
+void BufferParams::update_passes(const vector<Pass *> &scene_passes)
+{
+ passes.clear();
- float *in = buffer.data() + offset;
+ pass_stride = 0;
+ for (const Pass *scene_pass : scene_passes) {
+ BufferPass buffer_pass(scene_pass);
- if (components == 1) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- pixels[0] = in[0] * scale;
+ if (scene_pass->is_written()) {
+ buffer_pass.offset = pass_stride;
+ pass_stride += scene_pass->get_info().num_components;
}
- }
- else if (components == 3) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- pixels[0] = in[0] * scale;
- pixels[1] = in[1] * scale;
- pixels[2] = in[2] * scale;
- }
- }
- else if (components == 4) {
- /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
- assert(params.passes[0].type == PASS_COMBINED);
- float *in_combined = buffer.data();
-
- for (int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
- float3 val = make_float3(in[0], in[1], in[2]);
- if (type == DENOISING_PASS_PREFILTERED_COLOR && params.denoising_prefiltered_pass) {
- /* Remove highlight compression from the image. */
- val = color_highlight_uncompress(val);
- }
- pixels[0] = val.x * scale;
- pixels[1] = val.y * scale;
- pixels[2] = val.z * scale;
- pixels[3] = saturate(in_combined[3] * alpha_scale);
+ else {
+ buffer_pass.offset = PASS_UNUSED;
}
- }
- else {
- return false;
+
+ passes.emplace_back(std::move(buffer_pass));
}
- return true;
+ update_passes();
}
-bool RenderBuffers::get_pass_rect(
- const string &name, float exposure, int sample, int components, float *pixels)
+void BufferParams::reset_pass_offset()
{
- if (buffer.data() == NULL) {
- return false;
+ for (int i = 0; i < kNumPassOffsets; ++i) {
+ pass_offset_[i] = PASS_UNUSED;
}
+}
- float *sample_count = NULL;
- if (name == "Combined") {
- int sample_offset = 0;
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
- if (pass.type != PASS_SAMPLE_COUNT) {
- sample_offset += pass.components;
- continue;
- }
- else {
- sample_count = buffer.data() + sample_offset;
- break;
- }
- }
+int BufferParams::get_pass_offset(PassType pass_type, PassMode mode) const
+{
+ if (pass_type == PASS_NONE || pass_type == PASS_UNUSED) {
+ return PASS_UNUSED;
}
- int pass_offset = 0;
-
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
+ const int index = pass_type_mode_to_index(pass_type, mode);
+ return pass_offset_[index];
+}
- /* Pass is identified by both type and name, multiple of the same type
- * may exist with a different name. */
- if (pass.name != name) {
- pass_offset += pass.components;
- continue;
+const BufferPass *BufferParams::find_pass(string_view name) const
+{
+ for (const BufferPass &pass : passes) {
+ if (pass.name == name) {
+ return &pass;
}
+ }
- PassType type = pass.type;
-
- float *in = buffer.data() + pass_offset;
- int pass_stride = params.get_passes_size();
-
- float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f;
- float scale_exposure = (pass.exposure) ? scale * exposure : scale;
-
- int size = params.width * params.height;
+ return nullptr;
+}
- if (components == 1 && type == PASS_RENDER_TIME) {
- /* Render time is not stored by kernel, but measured per tile. */
- float val = (float)(1000.0 * render_time / (params.width * params.height * sample));
- for (int i = 0; i < size; i++, pixels++) {
- pixels[0] = val;
- }
- }
- else if (components == 1) {
- assert(pass.components == components);
-
- /* Scalar */
- if (type == PASS_DEPTH) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
- }
- }
- else if (type == PASS_MIST) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = saturate(f * scale_exposure);
- }
- }
- else {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = f * scale_exposure;
- }
- }
- }
- else if (components == 3) {
- assert(pass.components == 4);
-
- /* RGBA */
- if (type == PASS_SHADOW) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- }
- }
- else if (pass.divide_type != PASS_NONE) {
- /* RGB lighting passes that need to divide out color */
- pass_offset = 0;
- for (size_t k = 0; k < params.passes.size(); k++) {
- Pass &color_pass = params.passes[k];
- if (color_pass.type == pass.divide_type)
- break;
- pass_offset += color_pass.components;
- }
-
- float *in_divide = buffer.data() + pass_offset;
-
- for (int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) {
- float3 f = make_float3(in[0], in[1], in[2]);
- float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
-
- f = safe_divide_even_color(f * exposure, f_divide);
-
- pixels[0] = f.x;
- pixels[1] = f.y;
- pixels[2] = f.z;
- }
- }
- else {
- /* RGB/vector */
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- float3 f = make_float3(in[0], in[1], in[2]);
-
- pixels[0] = f.x * scale_exposure;
- pixels[1] = f.y * scale_exposure;
- pixels[2] = f.z * scale_exposure;
- }
- }
- }
- else if (components == 4) {
- assert(pass.components == components);
-
- /* RGBA */
- if (type == PASS_SHADOW) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- pixels[3] = 1.0f;
- }
- }
- else if (type == PASS_MOTION) {
- /* need to normalize by number of samples accumulated for motion */
- pass_offset = 0;
- for (size_t k = 0; k < params.passes.size(); k++) {
- Pass &color_pass = params.passes[k];
- if (color_pass.type == PASS_MOTION_WEIGHT)
- break;
- pass_offset += color_pass.components;
- }
-
- float *in_weight = buffer.data() + pass_offset;
-
- for (int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float w = in_weight[0];
- float invw = (w > 0.0f) ? 1.0f / w : 0.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- pixels[3] = f.w * invw;
- }
- }
- else if (type == PASS_CRYPTOMATTE) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- /* x and z contain integer IDs, don't rescale them.
- y and w contain matte weights, they get scaled. */
- pixels[0] = f.x;
- pixels[1] = f.y * scale;
- pixels[2] = f.z;
- pixels[3] = f.w * scale;
- }
- }
- else {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- if (sample_count && sample_count[i * pass_stride] < 0.0f) {
- scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
- scale_exposure = (pass.exposure) ? scale * exposure : scale;
- }
-
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
-
- pixels[0] = f.x * scale_exposure;
- pixels[1] = f.y * scale_exposure;
- pixels[2] = f.z * scale_exposure;
-
- /* Clamp since alpha might be > 1.0 due to Russian roulette. */
- pixels[3] = saturate(f.w * scale);
- }
- }
+const BufferPass *BufferParams::find_pass(PassType type, PassMode mode) const
+{
+ for (const BufferPass &pass : passes) {
+ if (pass.type == type && pass.mode == mode) {
+ return &pass;
}
-
- return true;
}
- return false;
+ return nullptr;
}
-bool RenderBuffers::set_pass_rect(PassType type, int components, float *pixels, int samples)
+const BufferPass *BufferParams::get_actual_display_pass(PassType type, PassMode mode) const
{
- if (buffer.data() == NULL) {
- return false;
- }
-
- int pass_offset = 0;
+ const BufferPass *pass = find_pass(type, mode);
+ return get_actual_display_pass(pass);
+}
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
+const BufferPass *BufferParams::get_actual_display_pass(const BufferPass *pass) const
+{
+ if (!pass) {
+ return nullptr;
+ }
- if (pass.type != type) {
- pass_offset += pass.components;
- continue;
+ if (pass->type == PASS_COMBINED) {
+ const BufferPass *shadow_catcher_matte_pass = find_pass(PASS_SHADOW_CATCHER_MATTE, pass->mode);
+ if (shadow_catcher_matte_pass) {
+ pass = shadow_catcher_matte_pass;
}
+ }
- float *out = buffer.data() + pass_offset;
- int pass_stride = params.get_passes_size();
- int size = params.width * params.height;
-
- assert(pass.components == components);
+ return pass;
+}
- for (int i = 0; i < size; i++, out += pass_stride, pixels += components) {
- if (pass.filter) {
- /* Scale by the number of samples, inverse of what we do in get_pass_rect.
- * A better solution would be to remove the need for set_pass_rect entirely,
- * and change baking to bake multiple objects in a tile at once. */
- for (int j = 0; j < components; j++) {
- out[j] = pixels[j] * samples;
- }
- }
- else {
- /* For non-filtered passes just straight copy, these may contain non-float data. */
- memcpy(out, pixels, sizeof(float) * components);
- }
- }
+void BufferParams::update_offset_stride()
+{
+ offset = -(full_x + full_y * width);
+ stride = width;
+}
+bool BufferParams::modified(const BufferParams &other) const
+{
+ if (!(width == other.width && height == other.height && full_x == other.full_x &&
+ full_y == other.full_y && full_width == other.full_width &&
+ full_height == other.full_height && offset == other.offset && stride == other.stride &&
+ pass_stride == other.pass_stride && layer == other.layer && view == other.view &&
+ exposure == other.exposure &&
+ use_approximate_shadow_catcher == other.use_approximate_shadow_catcher &&
+ use_transparent_background == other.use_transparent_background)) {
return true;
}
- return false;
+ return !(passes == other.passes);
}
-/* Display Buffer */
+/* --------------------------------------------------------------------
+ * Render Buffers.
+ */
-DisplayBuffer::DisplayBuffer(Device *device, bool linear)
- : draw_width(0),
- draw_height(0),
- transparent(true), /* todo: determine from background */
- half_float(linear),
- rgba_byte(device, "display buffer byte"),
- rgba_half(device, "display buffer half")
+RenderBuffers::RenderBuffers(Device *device) : buffer(device, "RenderBuffers", MEM_READ_WRITE)
{
}
-DisplayBuffer::~DisplayBuffer()
+RenderBuffers::~RenderBuffers()
{
- rgba_byte.free();
- rgba_half.free();
+ buffer.free();
}
-void DisplayBuffer::reset(BufferParams &params_)
+void RenderBuffers::reset(const BufferParams &params_)
{
- draw_width = 0;
- draw_height = 0;
+ DCHECK(params_.pass_stride != -1);
params = params_;
- /* allocate display pixels */
- if (half_float) {
- rgba_half.alloc_to_device(params.width, params.height);
- }
- else {
- rgba_byte.alloc_to_device(params.width, params.height);
- }
+ /* re-allocate buffer */
+ buffer.alloc(params.width * params.pass_stride, params.height);
}
-void DisplayBuffer::draw_set(int width, int height)
+void RenderBuffers::zero()
{
- assert(width <= params.width && height <= params.height);
+ buffer.zero_to_device();
+}
- draw_width = width;
- draw_height = height;
+bool RenderBuffers::copy_from_device()
+{
+ DCHECK(params.pass_stride != -1);
+
+ if (!buffer.device_pointer)
+ return false;
+
+ buffer.copy_from_device(0, params.width * params.pass_stride, params.height);
+
+ return true;
}
-void DisplayBuffer::draw(Device *device, const DeviceDrawParams &draw_params)
+void RenderBuffers::copy_to_device()
{
- if (draw_width != 0 && draw_height != 0) {
- device_memory &rgba = (half_float) ? (device_memory &)rgba_half : (device_memory &)rgba_byte;
-
- device->draw_pixels(rgba,
- 0,
- draw_width,
- draw_height,
- params.width,
- params.height,
- params.full_x,
- params.full_y,
- params.full_width,
- params.full_height,
- transparent,
- draw_params);
- }
+ buffer.copy_to_device();
}
-bool DisplayBuffer::draw_ready()
+void render_buffers_host_copy_denoised(RenderBuffers *dst,
+ const BufferParams &dst_params,
+ const RenderBuffers *src,
+ const BufferParams &src_params,
+ const size_t src_offset)
{
- return (draw_width != 0 && draw_height != 0);
+ DCHECK_EQ(dst_params.width, src_params.width);
+ /* TODO(sergey): More sanity checks to avoid buffer overrun. */
+
+ /* Create a map of pass offsets to be copied.
+ * Assume offsets are different to allow copying passes between buffers with different sets of
+ * passes. */
+
+ struct {
+ int dst_offset;
+ int src_offset;
+ } pass_offsets[PASS_NUM];
+
+ int num_passes = 0;
+
+ for (int i = 0; i < PASS_NUM; ++i) {
+ const PassType pass_type = static_cast<PassType>(i);
+
+ const int dst_pass_offset = dst_params.get_pass_offset(pass_type, PassMode::DENOISED);
+ if (dst_pass_offset == PASS_UNUSED) {
+ continue;
+ }
+
+ const int src_pass_offset = src_params.get_pass_offset(pass_type, PassMode::DENOISED);
+ if (src_pass_offset == PASS_UNUSED) {
+ continue;
+ }
+
+ pass_offsets[num_passes].dst_offset = dst_pass_offset;
+ pass_offsets[num_passes].src_offset = src_pass_offset;
+ ++num_passes;
+ }
+
+ /* Copy passes. */
+ /* TODO(sergey): Make it more reusable, allowing a copy of noisy passes to be implemented. */
+
+ const int64_t dst_width = dst_params.width;
+ const int64_t dst_height = dst_params.height;
+ const int64_t dst_pass_stride = dst_params.pass_stride;
+ const int64_t dst_num_pixels = dst_width * dst_height;
+
+ const int64_t src_pass_stride = src_params.pass_stride;
+ const int64_t src_offset_in_floats = src_offset * src_pass_stride;
+
+ const float *src_pixel = src->buffer.data() + src_offset_in_floats;
+ float *dst_pixel = dst->buffer.data();
+
+ for (int i = 0; i < dst_num_pixels;
+ ++i, src_pixel += src_pass_stride, dst_pixel += dst_pass_stride) {
+ for (int pass_offset_idx = 0; pass_offset_idx < num_passes; ++pass_offset_idx) {
+ const int dst_pass_offset = pass_offsets[pass_offset_idx].dst_offset;
+ const int src_pass_offset = pass_offsets[pass_offset_idx].src_offset;
+
+ /* TODO(sergey): Support non-RGBA passes. */
+ dst_pixel[dst_pass_offset + 0] = src_pixel[src_pass_offset + 0];
+ dst_pixel[dst_pass_offset + 1] = src_pixel[src_pass_offset + 1];
+ dst_pixel[dst_pass_offset + 2] = src_pixel[src_pass_offset + 2];
+ dst_pixel[dst_pass_offset + 3] = src_pixel[src_pass_offset + 3];
+ }
+ }
}
CCL_NAMESPACE_END
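
The new `pass_offset_` table gives constant-time lookup of a pass' float offset from its type and noisy/denoised mode. A small standalone illustration of the indexing scheme follows; the enum values and component counts are made up for the example and do not match the real `PassType` enum.

/* Toy illustration of the (type, mode) -> buffer offset indexing used above.
 * Enum values and pass sizes are hypothetical. */
#include <cstdio>

enum ExamplePassType { EX_PASS_COMBINED = 0, EX_PASS_DEPTH = 1, EX_PASS_NUM = 2 };
enum class ExamplePassMode { NOISY = 0, DENOISED = 1 };
static const int EX_PASS_UNUSED = -1;

static int pass_index(ExamplePassType type, ExamplePassMode mode)
{
  /* Two slots per pass type: noisy first, denoised second. */
  return static_cast<int>(type) * 2 + (mode == ExamplePassMode::DENOISED ? 1 : 0);
}

int main()
{
  int pass_offset[EX_PASS_NUM * 2];
  for (int i = 0; i < EX_PASS_NUM * 2; i++) {
    pass_offset[i] = EX_PASS_UNUSED;
  }

  /* Suppose the buffer packs: combined (4 floats), denoised combined (4 floats), depth (1). */
  pass_offset[pass_index(EX_PASS_COMBINED, ExamplePassMode::NOISY)] = 0;
  pass_offset[pass_index(EX_PASS_COMBINED, ExamplePassMode::DENOISED)] = 4;
  pass_offset[pass_index(EX_PASS_DEPTH, ExamplePassMode::NOISY)] = 8;

  printf("denoised combined starts at float %d\n",
         pass_offset[pass_index(EX_PASS_COMBINED, ExamplePassMode::DENOISED)]);
  return 0;
}
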
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 4ffc628bb52..c048234167d 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -18,8 +18,8 @@
#define __BUFFERS_H__
#include "device/device_memory.h"
-
-#include "render/film.h"
+#include "graph/node.h"
+#include "render/pass.h"
#include "kernel/kernel_types.h"
@@ -34,170 +34,156 @@ class Device;
struct DeviceDrawParams;
struct float4;
+/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */
+class BufferPass : public Node {
+ public:
+ NODE_DECLARE
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ ustring name;
+ bool include_albedo = false;
+
+ int offset = -1;
+
+ BufferPass();
+ explicit BufferPass(const Pass *scene_pass);
+
+ BufferPass(BufferPass &&other) noexcept = default;
+ BufferPass(const BufferPass &other) = default;
+
+ BufferPass &operator=(BufferPass &&other) = default;
+ BufferPass &operator=(const BufferPass &other) = default;
+
+ ~BufferPass() = default;
+
+ PassInfo get_info() const;
+
+ inline bool operator==(const BufferPass &other) const
+ {
+ return type == other.type && mode == other.mode && name == other.name &&
+ include_albedo == other.include_albedo && offset == other.offset;
+ }
+ inline bool operator!=(const BufferPass &other) const
+ {
+ return !(*this == other);
+ }
+};
+
/* Buffer Parameters
* Size of render buffer and how it fits in the full image (border render). */
-class BufferParams {
+/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */
+class BufferParams : public Node {
public:
- /* width/height of the physical buffer */
- int width;
- int height;
-
- /* offset into and width/height of the full buffer */
- int full_x;
- int full_y;
- int full_width;
- int full_height;
-
- /* passes */
- vector<Pass> passes;
- bool denoising_data_pass;
- /* If only some light path types should be target, an additional pass is needed. */
- bool denoising_clean_pass;
- /* When we're prefiltering the passes during rendering, we need to keep both the
- * original and the prefiltered data around because neighboring tiles might still
- * need the original data. */
- bool denoising_prefiltered_pass;
-
- /* functions */
- BufferParams();
+ NODE_DECLARE
- void get_offset_stride(int &offset, int &stride);
- bool modified(const BufferParams &params);
- int get_passes_size();
- int get_denoising_offset();
- int get_denoising_prefiltered_offset();
-};
+ /* Width/height of the physical buffer. */
+ int width = 0;
+ int height = 0;
-/* Render Buffers */
+ /* Offset into and width/height of the full buffer. */
+ int full_x = 0;
+ int full_y = 0;
+ int full_width = 0;
+ int full_height = 0;
-class RenderBuffers {
- public:
- /* buffer parameters */
- BufferParams params;
+ /* Runtime fields, only valid after `update_passes()` or `update_offset_stride()`. */
+ int offset = -1, stride = -1;
- /* float buffer */
- device_vector<float> buffer;
- bool map_neighbor_copied;
- double render_time;
+ /* Runtime fields, only valid after `update_passes()`. */
+ int pass_stride = -1;
- explicit RenderBuffers(Device *device);
- ~RenderBuffers();
+ /* Properties which are used for accessing buffer pixels outside of scene graph. */
+ vector<BufferPass> passes;
+ ustring layer;
+ ustring view;
+ float exposure = 1.0f;
+ bool use_approximate_shadow_catcher = false;
+ bool use_transparent_background = false;
- void reset(BufferParams &params);
- void zero();
+ BufferParams();
- bool copy_from_device();
- bool get_pass_rect(
- const string &name, float exposure, int sample, int components, float *pixels);
- bool get_denoising_pass_rect(
- int offset, float exposure, int sample, int components, float *pixels);
- bool set_pass_rect(PassType type, int components, float *pixels, int samples);
-};
+ BufferParams(BufferParams &&other) noexcept = default;
+ BufferParams(const BufferParams &other) = default;
-/* Display Buffer
- *
- * The buffer used for drawing during render, filled by converting the render
- * buffers to byte of half float storage */
+ BufferParams &operator=(BufferParams &&other) = default;
+ BufferParams &operator=(const BufferParams &other) = default;
-class DisplayBuffer {
- public:
- /* buffer parameters */
- BufferParams params;
- /* dimensions for how much of the buffer is actually ready for display.
- * with progressive render we can be using only a subset of the buffer.
- * if these are zero, it means nothing can be drawn yet */
- int draw_width, draw_height;
- /* draw alpha channel? */
- bool transparent;
- /* use half float? */
- bool half_float;
- /* byte buffer for converted result */
- device_pixels<uchar4> rgba_byte;
- device_pixels<half4> rgba_half;
-
- DisplayBuffer(Device *device, bool linear = false);
- ~DisplayBuffer();
-
- void reset(BufferParams &params);
-
- void draw_set(int width, int height);
- void draw(Device *device, const DeviceDrawParams &draw_params);
- bool draw_ready();
-};
+ ~BufferParams() = default;
-/* Render Tile
- * Rendering task on a buffer */
+  /* Pre-calculate all fields which depend on the passes.
+   *
+   * When the scene passes are given, the buffer passes will be created from them and stored in
+   * these params, and the params are then updated for those passes.
+   * The `update_passes()` overload without parameters updates the offsets and strides which are
+   * stored outside of the passes. */
+ void update_passes();
+ void update_passes(const vector<Pass *> &scene_passes);
-class RenderTile {
- public:
- typedef enum { PATH_TRACE = (1 << 0), BAKE = (1 << 1), DENOISE = (1 << 2) } Task;
+ /* Returns PASS_UNUSED if there is no such pass in the buffer. */
+ int get_pass_offset(PassType type, PassMode mode = PassMode::NOISY) const;
- Task task;
- int x, y, w, h;
- int start_sample;
- int num_samples;
- int sample;
- int resolution;
- int offset;
- int stride;
- int tile_index;
+ /* Returns nullptr if pass with given name does not exist. */
+ const BufferPass *find_pass(string_view name) const;
+ const BufferPass *find_pass(PassType type, PassMode mode = PassMode::NOISY) const;
- device_ptr buffer;
- int device_size;
+  /* Get the display pass from its name.
+   * Performs special logic to replace the combined pass with the shadow catcher matte. */
+ const BufferPass *get_actual_display_pass(PassType type, PassMode mode = PassMode::NOISY) const;
+ const BufferPass *get_actual_display_pass(const BufferPass *pass) const;
- typedef enum { NO_STEALING = 0, CAN_BE_STOLEN = 1, WAS_STOLEN = 2 } StealingState;
- StealingState stealing_state;
+ void update_offset_stride();
- RenderBuffers *buffers;
+ bool modified(const BufferParams &other) const;
- RenderTile();
+ protected:
+ void reset_pass_offset();
- int4 bounds() const
- {
- return make_int4(x, /* xmin */
- y, /* ymin */
- x + w, /* xmax */
- y + h); /* ymax */
- }
+  /* Multiplied by 2 to be able to store noisy and denoised pass types. */
+ static constexpr int kNumPassOffsets = PASS_NUM * 2;
+
+  /* Indexed by a value derived from pass type and mode; indicates the offset of the corresponding
+   * pass in the buffer.
+   * If there are multiple passes with the same type and mode, contains the lowest offset of them. */
+ int pass_offset_[kNumPassOffsets];
};
-/* Render Tile Neighbors
- * Set of neighboring tiles used for denoising. Tile order:
- * 0 1 2
- * 3 4 5
- * 6 7 8 */
+/* Render Buffers */
-class RenderTileNeighbors {
+class RenderBuffers {
public:
- static const int SIZE = 9;
- static const int CENTER = 4;
+ /* buffer parameters */
+ BufferParams params;
- RenderTile tiles[SIZE];
- RenderTile target;
+ /* float buffer */
+ device_vector<float> buffer;
- RenderTileNeighbors(const RenderTile &center)
- {
- tiles[CENTER] = center;
- }
+ explicit RenderBuffers(Device *device);
+ ~RenderBuffers();
- int4 bounds() const
- {
- return make_int4(tiles[3].x, /* xmin */
- tiles[1].y, /* ymin */
- tiles[5].x + tiles[5].w, /* xmax */
- tiles[7].y + tiles[7].h); /* ymax */
- }
+ void reset(const BufferParams &params);
+ void zero();
- void set_bounds_from_center()
- {
- tiles[3].x = tiles[CENTER].x;
- tiles[1].y = tiles[CENTER].y;
- tiles[5].x = tiles[CENTER].x + tiles[CENTER].w;
- tiles[7].y = tiles[CENTER].y + tiles[CENTER].h;
- }
+ bool copy_from_device();
+ void copy_to_device();
};
+/* Copy denoised passes from source to destination.
+ *
+ * Buffer parameters are provided explicitly, allowing pixels to be copied between render buffers
+ * whose content corresponds to a render result at a non-unit resolution divider.
+ *
+ * `src_offset` allows the source pixel index to be offset, which is used when only a fraction of
+ * the source buffer is to be copied.
+ *
+ * The copy covers the number of pixels in the destination. */
+void render_buffers_host_copy_denoised(RenderBuffers *dst,
+ const BufferParams &dst_params,
+ const RenderBuffers *src,
+ const BufferParams &src_params,
+ const size_t src_offset = 0);
+
CCL_NAMESPACE_END
#endif /* __BUFFERS_H__ */
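
A note on the `pass_offset_` table declared above: it is keyed by both pass type and pass mode, which is why it holds PASS_NUM * 2 entries, and lookups fall back to PASS_UNUSED when a pass is absent. The standalone sketch below illustrates that bookkeeping. It is not Cycles code: the Demo* names are stand-ins, and the `type * 2 + mode` indexing is an assumed derivation based on the comments, not necessarily the exact formula used by `update_passes()`.

#include <array>
#include <cassert>

/* Illustrative stand-ins; these are not the real Cycles enums or values. */
enum DemoPassType { DEMO_PASS_COMBINED = 0, DEMO_PASS_DEPTH, DEMO_PASS_NUM };
enum class DemoPassMode { NOISY = 0, DENOISED = 1 };
constexpr int DEMO_PASS_UNUSED = -1;

struct DemoPassOffsets {
  /* One slot per (type, mode) pair, mirroring kNumPassOffsets = PASS_NUM * 2. */
  std::array<int, DEMO_PASS_NUM * 2> offsets;

  DemoPassOffsets()
  {
    offsets.fill(DEMO_PASS_UNUSED);
  }

  static int index(DemoPassType type, DemoPassMode mode)
  {
    return int(type) * 2 + int(mode);
  }

  void record(DemoPassType type, DemoPassMode mode, int offset)
  {
    int &slot = offsets[index(type, mode)];
    /* Keep the lowest offset when multiple passes share type and mode. */
    if (slot == DEMO_PASS_UNUSED || offset < slot) {
      slot = offset;
    }
  }

  int lookup(DemoPassType type, DemoPassMode mode) const
  {
    return offsets[index(type, mode)];
  }
};

int main()
{
  DemoPassOffsets table;
  table.record(DEMO_PASS_COMBINED, DemoPassMode::NOISY, 0); /* 4 components at offset 0. */
  table.record(DEMO_PASS_DEPTH, DemoPassMode::NOISY, 4);    /* 1 component at offset 4. */

  assert(table.lookup(DEMO_PASS_DEPTH, DemoPassMode::NOISY) == 4);
  assert(table.lookup(DEMO_PASS_DEPTH, DemoPassMode::DENOISED) == DEMO_PASS_UNUSED);
  return 0;
}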
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 327f166f9d8..8b69c971991 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -33,9 +33,9 @@
/* needed for calculating differentials */
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_projection.h"
#include "kernel/kernel_differential.h"
#include "kernel/kernel_montecarlo.h"
@@ -169,7 +169,6 @@ Camera::Camera() : Node(get_node_type())
width = 1024;
height = 512;
- resolution = 1;
use_perspective_motion = false;
@@ -455,7 +454,6 @@ void Camera::update(Scene *scene)
/* render size */
kcam->width = width;
kcam->height = height;
- kcam->resolution = resolution;
/* store differentials */
kcam->dx = float3_to_float4(dx);
@@ -776,9 +774,11 @@ float Camera::world_to_raster_size(float3 P)
&ray);
#endif
- differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist);
+ /* TODO: would it help to use more accurate differentials here? */
+ differential3 dP;
+ differential_transfer_compact(&dP, ray.dP, ray.D, ray.dD, ray.D, dist);
- return max(len(ray.dP.dx), len(ray.dP.dy));
+ return max(len(dP.dx), len(dP.dy));
}
return res;
@@ -789,12 +789,11 @@ bool Camera::use_motion() const
return motion.size() > 1;
}
-void Camera::set_screen_size_and_resolution(int width_, int height_, int resolution_)
+void Camera::set_screen_size(int width_, int height_)
{
- if (width_ != width || height_ != height || resolution_ != resolution) {
+ if (width_ != width || height_ != height) {
width = width_;
height = height_;
- resolution = resolution_;
tag_modified();
}
}
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 5abb4750764..cb8ecac1a7e 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -199,7 +199,6 @@ class Camera : public Node {
private:
int width;
int height;
- int resolution;
public:
/* functions */
@@ -225,7 +224,7 @@ class Camera : public Node {
int motion_step(float time) const;
bool use_motion() const;
- void set_screen_size_and_resolution(int width_, int height_, int resolution_);
+ void set_screen_size(int width_, int height_);
private:
/* Private utility functions. */
diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp
deleted file mode 100644
index 99d4daa6961..00000000000
--- a/intern/cycles/render/coverage.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright 2018 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "render/coverage.h"
-#include "render/buffers.h"
-
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_id_passes.h"
-
-#include "util/util_map.h"
-
-CCL_NAMESPACE_BEGIN
-
-static bool crypomatte_comp(const pair<float, float> &i, const pair<float, float> j)
-{
- return i.first > j.first;
-}
-
-void Coverage::finalize()
-{
- int pass_offset = 0;
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- finalize_buffer(coverage_object, pass_offset);
- pass_offset += kernel_data.film.cryptomatte_depth * 4;
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- finalize_buffer(coverage_material, pass_offset);
- pass_offset += kernel_data.film.cryptomatte_depth * 4;
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- finalize_buffer(coverage_asset, pass_offset);
- }
-}
-
-void Coverage::init_path_trace()
-{
- kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL;
-
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- coverage_object.clear();
- coverage_object.resize(tile.w * tile.h);
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- coverage_material.clear();
- coverage_material.resize(tile.w * tile.h);
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- coverage_asset.clear();
- coverage_asset.resize(tile.w * tile.h);
- }
- }
-}
-
-void Coverage::init_pixel(int x, int y)
-{
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- const int pixel_index = tile.w * (y - tile.y) + x - tile.x;
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- kg->coverage_object = &coverage_object[pixel_index];
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- kg->coverage_material = &coverage_material[pixel_index];
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- kg->coverage_asset = &coverage_asset[pixel_index];
- }
- }
-}
-
-void Coverage::finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset)
-{
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- flatten_buffer(coverage, pass_offset);
- }
- else {
- sort_buffer(pass_offset);
- }
-}
-
-void Coverage::flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset)
-{
- /* Sort the coverage map and write it to the output */
- int pixel_index = 0;
- int pass_stride = tile.buffers->params.get_passes_size();
- for (int y = 0; y < tile.h; ++y) {
- for (int x = 0; x < tile.w; ++x) {
- const CoverageMap &pixel = coverage[pixel_index];
- if (!pixel.empty()) {
- /* buffer offset */
- int index = x + y * tile.stride;
- float *buffer = (float *)tile.buffer + index * pass_stride;
-
- /* sort the cryptomatte pixel */
- vector<pair<float, float>> sorted_pixel;
- for (CoverageMap::const_iterator it = pixel.begin(); it != pixel.end(); ++it) {
- sorted_pixel.push_back(std::make_pair(it->second, it->first));
- }
- sort(sorted_pixel.begin(), sorted_pixel.end(), crypomatte_comp);
- int num_slots = 2 * (kernel_data.film.cryptomatte_depth);
- if (sorted_pixel.size() > num_slots) {
- float leftover = 0.0f;
- for (vector<pair<float, float>>::iterator it = sorted_pixel.begin() + num_slots;
- it != sorted_pixel.end();
- ++it) {
- leftover += it->first;
- }
- sorted_pixel[num_slots - 1].first += leftover;
- }
- int limit = min(num_slots, sorted_pixel.size());
- for (int i = 0; i < limit; ++i) {
- kernel_write_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset,
- 2 * (kernel_data.film.cryptomatte_depth),
- sorted_pixel[i].second,
- sorted_pixel[i].first);
- }
- }
- ++pixel_index;
- }
- }
-}
-
-void Coverage::sort_buffer(const int pass_offset)
-{
- /* Sort the coverage map and write it to the output */
- int pass_stride = tile.buffers->params.get_passes_size();
- for (int y = 0; y < tile.h; ++y) {
- for (int x = 0; x < tile.w; ++x) {
- /* buffer offset */
- int index = x + y * tile.stride;
- float *buffer = (float *)tile.buffer + index * pass_stride;
- kernel_sort_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset,
- 2 * (kernel_data.film.cryptomatte_depth));
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h
deleted file mode 100644
index 12182c614da..00000000000
--- a/intern/cycles/render/coverage.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2018 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __COVERAGE_H__
-#define __COVERAGE_H__
-
-#include "util/util_map.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct KernelGlobals;
-class RenderTile;
-
-typedef unordered_map<float, float> CoverageMap;
-
-class Coverage {
- public:
- Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_)
- {
- }
- void init_path_trace();
- void init_pixel(int x, int y);
- void finalize();
-
- private:
- vector<CoverageMap> coverage_object;
- vector<CoverageMap> coverage_material;
- vector<CoverageMap> coverage_asset;
- KernelGlobals *kg;
- RenderTile &tile;
- void finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset);
- void flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset);
- void sort_buffer(const int pass_offset);
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __COVERAGE_H__ */
diff --git a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp
index ddbe7484800..bcf8d3fa204 100644
--- a/intern/cycles/render/denoising.cpp
+++ b/intern/cycles/render/denoising.cpp
@@ -16,15 +16,17 @@
#include "render/denoising.h"
-#include "kernel/filter/filter_defines.h"
+#if 0
-#include "util/util_foreach.h"
-#include "util/util_map.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_time.h"
+# include "kernel/filter/filter_defines.h"
-#include <OpenImageIO/filesystem.h>
+# include "util/util_foreach.h"
+# include "util/util_map.h"
+# include "util/util_system.h"
+# include "util/util_task.h"
+# include "util/util_time.h"
+
+# include <OpenImageIO/filesystem.h>
CCL_NAMESPACE_BEGIN
@@ -225,7 +227,7 @@ bool DenoiseImageLayer::match_channels(int neighbor,
/* Denoise Task */
DenoiseTask::DenoiseTask(Device *device,
- Denoiser *denoiser,
+ DenoiserPipeline *denoiser,
int frame,
const vector<int> &neighbor_frames)
: denoiser(denoiser),
@@ -386,7 +388,6 @@ void DenoiseTask::create_task(DeviceTask &task)
task.denoising = denoiser->params;
task.denoising.type = DENOISER_NLM;
task.denoising.use = true;
- task.denoising.store_passes = false;
task.denoising_from_render = false;
task.denoising_frames.resize(neighbor_frames.size());
@@ -863,7 +864,7 @@ bool DenoiseImage::save_output(const string &out_filepath, string &error)
/* File pattern handling and outer loop over frames */
-Denoiser::Denoiser(DeviceInfo &device_info)
+DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info)
{
samples_override = 0;
tile_size = make_int2(64, 64);
@@ -876,18 +877,16 @@ Denoiser::Denoiser(DeviceInfo &device_info)
/* Initialize device. */
device = Device::create(device_info, stats, profiler, true);
- DeviceRequestedFeatures req;
- req.use_denoising = true;
- device->load_kernels(req);
+ device->load_kernels(KERNEL_FEATURE_DENOISING);
}
-Denoiser::~Denoiser()
+DenoiserPipeline::~DenoiserPipeline()
{
delete device;
TaskScheduler::exit();
}
-bool Denoiser::run()
+bool DenoiserPipeline::run()
{
assert(input.size() == output.size());
@@ -931,3 +930,5 @@ bool Denoiser::run()
}
CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/render/denoising.h b/intern/cycles/render/denoising.h
index c1b4d0a5596..097cc570d06 100644
--- a/intern/cycles/render/denoising.h
+++ b/intern/cycles/render/denoising.h
@@ -17,27 +17,31 @@
#ifndef __DENOISING_H__
#define __DENOISING_H__
-#include "device/device.h"
-#include "device/device_denoising.h"
+#if 0
-#include "render/buffers.h"
+/* TODO(sergey): Make it explicit and clear when something is a denoiser, its pipeline or
+ * parameters. Currently it is an annoying mixture of terms used interchangeably. */
-#include "util/util_string.h"
-#include "util/util_unique_ptr.h"
-#include "util/util_vector.h"
+# include "device/device.h"
-#include <OpenImageIO/imageio.h>
+# include "render/buffers.h"
+
+# include "util/util_string.h"
+# include "util/util_unique_ptr.h"
+# include "util/util_vector.h"
+
+# include <OpenImageIO/imageio.h>
OIIO_NAMESPACE_USING
CCL_NAMESPACE_BEGIN
-/* Denoiser */
+/* Denoiser pipeline */
-class Denoiser {
+class DenoiserPipeline {
public:
- Denoiser(DeviceInfo &device_info);
- ~Denoiser();
+ DenoiserPipeline(DeviceInfo &device_info);
+ ~DenoiserPipeline();
bool run();
@@ -155,7 +159,10 @@ class DenoiseImage {
class DenoiseTask {
public:
- DenoiseTask(Device *device, Denoiser *denoiser, int frame, const vector<int> &neighbor_frames);
+ DenoiseTask(Device *device,
+ DenoiserPipeline *denoiser,
+ int frame,
+ const vector<int> &neighbor_frames);
~DenoiseTask();
/* Task stages */
@@ -168,7 +175,7 @@ class DenoiseTask {
protected:
/* Denoiser parameters and device */
- Denoiser *denoiser;
+ DenoiserPipeline *denoiser;
Device *device;
/* Frame number to be denoised */
@@ -204,4 +211,6 @@ class DenoiseTask {
CCL_NAMESPACE_END
+#endif
+
#endif /* __DENOISING_H__ */
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 5df396394c4..8e14b338bd3 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -16,9 +16,12 @@
#include "render/film.h"
#include "device/device.h"
+#include "render/background.h"
+#include "render/bake.h"
#include "render/camera.h"
#include "render/integrator.h"
#include "render/mesh.h"
+#include "render/object.h"
#include "render/scene.h"
#include "render/stats.h"
#include "render/tables.h"
@@ -31,261 +34,6 @@
CCL_NAMESPACE_BEGIN
-/* Pass */
-
-static bool compare_pass_order(const Pass &a, const Pass &b)
-{
- if (a.components == b.components)
- return (a.type < b.type);
- return (a.components > b.components);
-}
-
-static NodeEnum *get_pass_type_enum()
-{
- static NodeEnum pass_type_enum;
- pass_type_enum.insert("combined", PASS_COMBINED);
- pass_type_enum.insert("depth", PASS_DEPTH);
- pass_type_enum.insert("normal", PASS_NORMAL);
- pass_type_enum.insert("uv", PASS_UV);
- pass_type_enum.insert("object_id", PASS_OBJECT_ID);
- pass_type_enum.insert("material_id", PASS_MATERIAL_ID);
- pass_type_enum.insert("motion", PASS_MOTION);
- pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT);
- pass_type_enum.insert("render_time", PASS_RENDER_TIME);
- pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE);
- pass_type_enum.insert("aov_color", PASS_AOV_COLOR);
- pass_type_enum.insert("aov_value", PASS_AOV_VALUE);
- pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER);
- pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT);
- pass_type_enum.insert("mist", PASS_MIST);
- pass_type_enum.insert("emission", PASS_EMISSION);
- pass_type_enum.insert("background", PASS_BACKGROUND);
- pass_type_enum.insert("ambient_occlusion", PASS_AO);
- pass_type_enum.insert("shadow", PASS_SHADOW);
- pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT);
- pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT);
- pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR);
- pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT);
- pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT);
- pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR);
- pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT);
- pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT);
- pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR);
- pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT);
- pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT);
- pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE);
- pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL);
-
- return &pass_type_enum;
-}
-
-NODE_DEFINE(Pass)
-{
- NodeType *type = NodeType::add("pass", create);
-
- NodeEnum *pass_type_enum = get_pass_type_enum();
- SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
- SOCKET_STRING(name, "Name", ustring());
-
- return type;
-}
-
-Pass::Pass() : Node(get_node_type())
-{
-}
-
-void Pass::add(PassType type, vector<Pass> &passes, const char *name)
-{
- for (size_t i = 0; i < passes.size(); i++) {
- if (passes[i].type != type) {
- continue;
- }
-
- /* An empty name is used as a placeholder to signal that any pass of
- * that type is fine (because the content always is the same).
- * This is important to support divide_type: If the pass that has a
- * divide_type is added first, a pass for divide_type with an empty
- * name will be added. Then, if a matching pass with a name is later
- * requested, the existing placeholder will be renamed to that.
- * If the divide_type is explicitly allocated with a name first and
- * then again as part of another pass, the second one will just be
- * skipped because that type already exists. */
-
- /* If no name is specified, any pass of the correct type will match. */
- if (name == NULL) {
- return;
- }
-
- /* If we already have a placeholder pass, rename that one. */
- if (passes[i].name.empty()) {
- passes[i].name = name;
- return;
- }
-
- /* If neither existing nor requested pass have placeholder name, they
- * must match. */
- if (name == passes[i].name) {
- return;
- }
- }
-
- Pass pass;
-
- pass.type = type;
- pass.filter = true;
- pass.exposure = false;
- pass.divide_type = PASS_NONE;
- if (name) {
- pass.name = name;
- }
-
- switch (type) {
- case PASS_NONE:
- pass.components = 0;
- break;
- case PASS_COMBINED:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_DEPTH:
- pass.components = 1;
- pass.filter = false;
- break;
- case PASS_MIST:
- pass.components = 1;
- break;
- case PASS_NORMAL:
- pass.components = 4;
- break;
- case PASS_UV:
- pass.components = 4;
- break;
- case PASS_MOTION:
- pass.components = 4;
- pass.divide_type = PASS_MOTION_WEIGHT;
- break;
- case PASS_MOTION_WEIGHT:
- pass.components = 1;
- break;
- case PASS_OBJECT_ID:
- case PASS_MATERIAL_ID:
- pass.components = 1;
- pass.filter = false;
- break;
-
- case PASS_EMISSION:
- case PASS_BACKGROUND:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_AO:
- pass.components = 4;
- break;
- case PASS_SHADOW:
- pass.components = 4;
- pass.exposure = false;
- break;
- case PASS_LIGHT:
- /* This isn't a real pass, used by baking to see whether
- * light data is needed or not.
- *
- * Set components to 0 so pass sort below happens in a
- * determined way.
- */
- pass.components = 0;
- break;
- case PASS_RENDER_TIME:
- /* This pass is handled entirely on the host side. */
- pass.components = 0;
- break;
-
- case PASS_DIFFUSE_COLOR:
- case PASS_GLOSSY_COLOR:
- case PASS_TRANSMISSION_COLOR:
- pass.components = 4;
- break;
- case PASS_DIFFUSE_DIRECT:
- case PASS_DIFFUSE_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_DIFFUSE_COLOR;
- break;
- case PASS_GLOSSY_DIRECT:
- case PASS_GLOSSY_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_GLOSSY_COLOR;
- break;
- case PASS_TRANSMISSION_DIRECT:
- case PASS_TRANSMISSION_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_TRANSMISSION_COLOR;
- break;
- case PASS_VOLUME_DIRECT:
- case PASS_VOLUME_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_CRYPTOMATTE:
- pass.components = 4;
- break;
- case PASS_ADAPTIVE_AUX_BUFFER:
- pass.components = 4;
- break;
- case PASS_SAMPLE_COUNT:
- pass.components = 1;
- pass.exposure = false;
- break;
- case PASS_AOV_COLOR:
- pass.components = 4;
- break;
- case PASS_AOV_VALUE:
- pass.components = 1;
- break;
- case PASS_BAKE_PRIMITIVE:
- case PASS_BAKE_DIFFERENTIAL:
- pass.components = 4;
- pass.exposure = false;
- pass.filter = false;
- break;
- default:
- assert(false);
- break;
- }
-
- passes.push_back(pass);
-
- /* Order from by components, to ensure alignment so passes with size 4
- * come first and then passes with size 1. Note this must use stable sort
- * so cryptomatte passes remain in the right order. */
- stable_sort(&passes[0], &passes[0] + passes.size(), compare_pass_order);
-
- if (pass.divide_type != PASS_NONE)
- Pass::add(pass.divide_type, passes);
-}
-
-bool Pass::equals(const vector<Pass> &A, const vector<Pass> &B)
-{
- if (A.size() != B.size())
- return false;
-
- for (int i = 0; i < A.size(); i++)
- if (A[i].type != B[i].type || A[i].name != B[i].name)
- return false;
-
- return true;
-}
-
-bool Pass::contains(const vector<Pass> &passes, PassType type)
-{
- for (size_t i = 0; i < passes.size(); i++)
- if (passes[i].type == type)
- return true;
-
- return false;
-}
-
/* Pixel Filter */
static float filter_func_box(float /*v*/, float /*width*/)
@@ -368,17 +116,11 @@ NODE_DEFINE(Film)
SOCKET_FLOAT(mist_depth, "Mist Depth", 100.0f);
SOCKET_FLOAT(mist_falloff, "Mist Falloff", 1.0f);
- SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
- SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
- SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
- SOCKET_INT(denoising_flags, "Denoising Flags", 0);
- SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
-
- SOCKET_BOOLEAN(use_light_visibility, "Use Light Visibility", false);
-
- NodeEnum *pass_type_enum = get_pass_type_enum();
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
SOCKET_ENUM(display_pass, "Display Pass", *pass_type_enum, PASS_COMBINED);
+ SOCKET_BOOLEAN(show_active_pixels, "Show Active Pixels", false);
+
static NodeEnum cryptomatte_passes_enum;
cryptomatte_passes_enum.insert("none", CRYPT_NONE);
cryptomatte_passes_enum.insert("object", CRYPT_OBJECT);
@@ -389,15 +131,13 @@ NODE_DEFINE(Film)
SOCKET_INT(cryptomatte_depth, "Cryptomatte Depth", 0);
+ SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false);
+
return type;
}
-Film::Film() : Node(get_node_type())
+Film::Film() : Node(get_node_type()), filter_table_offset_(TABLE_OFFSET_INVALID)
{
- use_light_visibility = false;
- filter_table_offset = TABLE_OFFSET_INVALID;
- cryptomatte_passes = CRYPT_NONE;
- display_pass = PASS_COMBINED;
}
Film::~Film()
@@ -406,7 +146,8 @@ Film::~Film()
void Film::add_default(Scene *scene)
{
- Pass::add(PASS_COMBINED, scene->passes);
+ Pass *pass = scene->create_node<Pass>();
+ pass->set_type(PASS_COMBINED);
}
void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
@@ -426,50 +167,77 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
/* update __data */
kfilm->exposure = exposure;
+ kfilm->pass_alpha_threshold = pass_alpha_threshold;
kfilm->pass_flag = 0;
- kfilm->display_pass_stride = -1;
- kfilm->display_pass_components = 0;
- kfilm->display_divide_pass_stride = -1;
- kfilm->use_display_exposure = false;
- kfilm->use_display_pass_alpha = (display_pass == PASS_COMBINED);
+ kfilm->use_approximate_shadow_catcher = get_use_approximate_shadow_catcher();
kfilm->light_pass_flag = 0;
kfilm->pass_stride = 0;
- kfilm->use_light_pass = use_light_visibility;
- kfilm->pass_aov_value_num = 0;
- kfilm->pass_aov_color_num = 0;
+
+ /* Mark with PASS_UNUSED to avoid mask test in the kernel. */
+ kfilm->pass_background = PASS_UNUSED;
+ kfilm->pass_emission = PASS_UNUSED;
+ kfilm->pass_ao = PASS_UNUSED;
+ kfilm->pass_diffuse_direct = PASS_UNUSED;
+ kfilm->pass_diffuse_indirect = PASS_UNUSED;
+ kfilm->pass_glossy_direct = PASS_UNUSED;
+ kfilm->pass_glossy_indirect = PASS_UNUSED;
+ kfilm->pass_transmission_direct = PASS_UNUSED;
+ kfilm->pass_transmission_indirect = PASS_UNUSED;
+ kfilm->pass_volume_direct = PASS_UNUSED;
+ kfilm->pass_volume_indirect = PASS_UNUSED;
+ kfilm->pass_volume_direct = PASS_UNUSED;
+ kfilm->pass_volume_indirect = PASS_UNUSED;
+ kfilm->pass_shadow = PASS_UNUSED;
+
+ /* Mark passes as unused so that the kernel knows the pass is inaccessible. */
+ kfilm->pass_denoising_normal = PASS_UNUSED;
+ kfilm->pass_denoising_albedo = PASS_UNUSED;
+ kfilm->pass_sample_count = PASS_UNUSED;
+ kfilm->pass_adaptive_aux_buffer = PASS_UNUSED;
+ kfilm->pass_shadow_catcher = PASS_UNUSED;
+ kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED;
+ kfilm->pass_shadow_catcher_matte = PASS_UNUSED;
bool have_cryptomatte = false;
+ bool have_aov_color = false;
+ bool have_aov_value = false;
for (size_t i = 0; i < scene->passes.size(); i++) {
- Pass &pass = scene->passes[i];
+ const Pass *pass = scene->passes[i];
- if (pass.type == PASS_NONE) {
+ if (pass->get_type() == PASS_NONE || !pass->is_written()) {
+ continue;
+ }
+
+ if (pass->get_mode() == PassMode::DENOISED) {
+      /* Generally we only store offsets of the noisy passes. The display pass is an exception
+       * since it is a read operation and not a write. */
+ kfilm->pass_stride += pass->get_info().num_components;
continue;
}
/* Can't do motion pass if no motion vectors are available. */
- if (pass.type == PASS_MOTION || pass.type == PASS_MOTION_WEIGHT) {
+ if (pass->get_type() == PASS_MOTION || pass->get_type() == PASS_MOTION_WEIGHT) {
if (scene->need_motion() != Scene::MOTION_PASS) {
- kfilm->pass_stride += pass.components;
+ kfilm->pass_stride += pass->get_info().num_components;
continue;
}
}
- int pass_flag = (1 << (pass.type % 32));
- if (pass.type <= PASS_CATEGORY_MAIN_END) {
- kfilm->pass_flag |= pass_flag;
- }
- else if (pass.type <= PASS_CATEGORY_LIGHT_END) {
- kfilm->use_light_pass = 1;
+ const int pass_flag = (1 << (pass->get_type() % 32));
+ if (pass->get_type() <= PASS_CATEGORY_LIGHT_END) {
kfilm->light_pass_flag |= pass_flag;
}
+ else if (pass->get_type() <= PASS_CATEGORY_DATA_END) {
+ kfilm->pass_flag |= pass_flag;
+ }
else {
- assert(pass.type <= PASS_CATEGORY_BAKE_END);
+ assert(pass->get_type() <= PASS_CATEGORY_BAKE_END);
}
- switch (pass.type) {
+ switch (pass->get_type()) {
case PASS_COMBINED:
kfilm->pass_combined = kfilm->pass_stride;
break;
@@ -479,6 +247,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
case PASS_NORMAL:
kfilm->pass_normal = kfilm->pass_stride;
break;
+ case PASS_POSITION:
+ kfilm->pass_position = kfilm->pass_stride;
+ break;
+ case PASS_ROUGHNESS:
+ kfilm->pass_roughness = kfilm->pass_stride;
+ break;
case PASS_UV:
kfilm->pass_uv = kfilm->pass_stride;
break;
@@ -511,9 +285,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_shadow = kfilm->pass_stride;
break;
- case PASS_LIGHT:
- break;
-
case PASS_DIFFUSE_COLOR:
kfilm->pass_diffuse_color = kfilm->pass_stride;
break;
@@ -563,78 +334,56 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride;
have_cryptomatte = true;
break;
+
+ case PASS_DENOISING_NORMAL:
+ kfilm->pass_denoising_normal = kfilm->pass_stride;
+ break;
+ case PASS_DENOISING_ALBEDO:
+ kfilm->pass_denoising_albedo = kfilm->pass_stride;
+ break;
+
+ case PASS_SHADOW_CATCHER:
+ kfilm->pass_shadow_catcher = kfilm->pass_stride;
+ break;
+ case PASS_SHADOW_CATCHER_SAMPLE_COUNT:
+ kfilm->pass_shadow_catcher_sample_count = kfilm->pass_stride;
+ break;
+ case PASS_SHADOW_CATCHER_MATTE:
+ kfilm->pass_shadow_catcher_matte = kfilm->pass_stride;
+ break;
+
case PASS_ADAPTIVE_AUX_BUFFER:
kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
break;
case PASS_SAMPLE_COUNT:
kfilm->pass_sample_count = kfilm->pass_stride;
break;
+
case PASS_AOV_COLOR:
- if (kfilm->pass_aov_color_num == 0) {
+ if (!have_aov_color) {
kfilm->pass_aov_color = kfilm->pass_stride;
+ have_aov_color = true;
}
- kfilm->pass_aov_color_num++;
break;
case PASS_AOV_VALUE:
- if (kfilm->pass_aov_value_num == 0) {
+ if (!have_aov_value) {
kfilm->pass_aov_value = kfilm->pass_stride;
+ have_aov_value = true;
}
- kfilm->pass_aov_value_num++;
break;
default:
assert(false);
break;
}
- if (pass.type == display_pass) {
- kfilm->display_pass_stride = kfilm->pass_stride;
- kfilm->display_pass_components = pass.components;
- kfilm->use_display_exposure = pass.exposure && (kfilm->exposure != 1.0f);
- }
- else if (pass.type == PASS_DIFFUSE_COLOR || pass.type == PASS_TRANSMISSION_COLOR ||
- pass.type == PASS_GLOSSY_COLOR) {
- kfilm->display_divide_pass_stride = kfilm->pass_stride;
- }
-
- kfilm->pass_stride += pass.components;
- }
-
- kfilm->pass_denoising_data = 0;
- kfilm->pass_denoising_clean = 0;
- kfilm->denoising_flags = 0;
- if (denoising_data_pass) {
- kfilm->pass_denoising_data = kfilm->pass_stride;
- kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
- kfilm->denoising_flags = denoising_flags;
- if (denoising_clean_pass) {
- kfilm->pass_denoising_clean = kfilm->pass_stride;
- kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
- kfilm->use_light_pass = 1;
- }
- if (denoising_prefiltered_pass) {
- kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED;
- }
- }
-
- kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
-
- /* When displaying the normal/uv pass in the viewport we need to disable
- * transparency.
- *
- * We also don't need to perform light accumulations. Later we want to optimize this to suppress
- * light calculations. */
- if (display_pass == PASS_NORMAL || display_pass == PASS_UV) {
- kfilm->use_light_pass = 0;
- }
- else {
- kfilm->pass_alpha_threshold = pass_alpha_threshold;
+ kfilm->pass_stride += pass->get_info().num_components;
}
/* update filter table */
vector<float> table = filter_table(filter_type, filter_width);
- scene->lookup_tables->remove_table(&filter_table_offset);
- filter_table_offset = scene->lookup_tables->add_table(dscene, table);
- kfilm->filter_table_offset = (int)filter_table_offset;
+ scene->lookup_tables->remove_table(&filter_table_offset_);
+ filter_table_offset_ = scene->lookup_tables->add_table(dscene, table);
+ kfilm->filter_table_offset = (int)filter_table_offset_;
/* mist pass parameters */
kfilm->mist_start = mist_start;
@@ -644,79 +393,298 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->cryptomatte_passes = cryptomatte_passes;
kfilm->cryptomatte_depth = cryptomatte_depth;
- pass_stride = kfilm->pass_stride;
- denoising_data_offset = kfilm->pass_denoising_data;
- denoising_clean_offset = kfilm->pass_denoising_clean;
-
clear_modified();
}
void Film::device_free(Device * /*device*/, DeviceScene * /*dscene*/, Scene *scene)
{
- scene->lookup_tables->remove_table(&filter_table_offset);
+ scene->lookup_tables->remove_table(&filter_table_offset_);
}
-void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes)
+int Film::get_aov_offset(Scene *scene, string name, bool &is_color)
{
- if (Pass::contains(scene->passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) {
- scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED);
+ int offset_color = 0, offset_value = 0;
+ foreach (const Pass *pass, scene->passes) {
+ if (pass->get_name() == name) {
+ if (pass->get_type() == PASS_AOV_VALUE) {
+ is_color = false;
+ return offset_value;
+ }
+ else if (pass->get_type() == PASS_AOV_COLOR) {
+ is_color = true;
+ return offset_color;
+ }
+ }
+
+ if (pass->get_type() == PASS_AOV_VALUE) {
+ offset_value += pass->get_info().num_components;
+ }
+ else if (pass->get_type() == PASS_AOV_COLOR) {
+ offset_color += pass->get_info().num_components;
+ }
+ }
+
+ return -1;
+}
+
+void Film::update_passes(Scene *scene, bool add_sample_count_pass)
+{
+ const Background *background = scene->background;
+ const BakeManager *bake_manager = scene->bake_manager;
+ const ObjectManager *object_manager = scene->object_manager;
+ Integrator *integrator = scene->integrator;
+
+ if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) {
+ return;
+ }
+
+ /* Remove auto generated passes and recreate them. */
+ remove_auto_passes(scene);
+
+ /* Display pass for viewport. */
+ const PassType display_pass = get_display_pass();
+ add_auto_pass(scene, display_pass);
+
+  /* The assumption is that a combined pass always exists for now; for example,
+   * adaptive sampling is always based on a combined pass. But we should
+   * try to lift this limitation in the future for faster rendering of
+   * individual passes. */
+ if (display_pass != PASS_COMBINED) {
+ add_auto_pass(scene, PASS_COMBINED);
+ }
+
+ /* Create passes needed for adaptive sampling. */
+ const AdaptiveSampling adaptive_sampling = integrator->get_adaptive_sampling();
+ if (adaptive_sampling.use) {
+ add_auto_pass(scene, PASS_SAMPLE_COUNT);
+ add_auto_pass(scene, PASS_ADAPTIVE_AUX_BUFFER);
+ }
+
+ /* Create passes needed for denoising. */
+ const bool use_denoise = integrator->get_use_denoise();
+ if (use_denoise) {
+ if (integrator->get_use_denoise_pass_normal()) {
+ add_auto_pass(scene, PASS_DENOISING_NORMAL);
+ }
+ if (integrator->get_use_denoise_pass_albedo()) {
+ add_auto_pass(scene, PASS_DENOISING_ALBEDO);
+ }
+ }
+
+ /* Create passes for shadow catcher. */
+ if (scene->has_shadow_catcher()) {
+ const bool need_background = get_use_approximate_shadow_catcher() &&
+ !background->get_transparent();
+
+ add_auto_pass(scene, PASS_SHADOW_CATCHER);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_MATTE);
+
+ if (need_background) {
+ add_auto_pass(scene, PASS_BACKGROUND);
+ }
+ }
+ else if (Pass::contains(scene->passes, PASS_SHADOW_CATCHER)) {
+ add_auto_pass(scene, PASS_SHADOW_CATCHER);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ }
+
+ const vector<Pass *> passes_immutable = scene->passes;
+ for (const Pass *pass : passes_immutable) {
+ const PassInfo info = pass->get_info();
+ /* Add utility passes needed to generate some light passes. */
+ if (info.divide_type != PASS_NONE) {
+ add_auto_pass(scene, info.divide_type);
+ }
+ if (info.direct_type != PASS_NONE) {
+ add_auto_pass(scene, info.direct_type);
+ }
+ if (info.indirect_type != PASS_NONE) {
+ add_auto_pass(scene, info.indirect_type);
+ }
+
+ /* NOTE: Enable all denoised passes when storage is requested.
+ * This way it is possible to tweak denoiser parameters later on. */
+ if (info.support_denoise && use_denoise) {
+ add_auto_pass(scene, pass->get_type(), PassMode::DENOISED);
+ }
+ }
+
+ if (bake_manager->get_baking()) {
+ add_auto_pass(scene, PASS_BAKE_PRIMITIVE, "BakePrimitive");
+ add_auto_pass(scene, PASS_BAKE_DIFFERENTIAL, "BakeDifferential");
+ }
+
+ if (add_sample_count_pass) {
+ if (!Pass::contains(scene->passes, PASS_SAMPLE_COUNT)) {
+ add_auto_pass(scene, PASS_SAMPLE_COUNT);
+ }
+ }
+
+ /* Remove duplicates and initialize internal pass info. */
+ finalize_passes(scene, use_denoise);
+ /* Flush scene updates. */
+ const bool have_uv_pass = Pass::contains(scene->passes, PASS_UV);
+ const bool have_motion_pass = Pass::contains(scene->passes, PASS_MOTION);
+ const bool have_ao_pass = Pass::contains(scene->passes, PASS_AO);
+
+ if (have_uv_pass != prev_have_uv_pass) {
+ scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED);
foreach (Shader *shader, scene->shaders)
shader->need_update_uvs = true;
}
- else if (Pass::contains(scene->passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) {
+ if (have_motion_pass != prev_have_motion_pass) {
scene->geometry_manager->tag_update(scene, GeometryManager::MOTION_PASS_NEEDED);
}
- else if (Pass::contains(scene->passes, PASS_AO) != Pass::contains(passes_, PASS_AO)) {
+ if (have_ao_pass != prev_have_ao_pass) {
scene->integrator->tag_update(scene, Integrator::AO_PASS_MODIFIED);
}
- if (update_passes) {
- scene->passes = passes_;
+ prev_have_uv_pass = have_uv_pass;
+ prev_have_motion_pass = have_motion_pass;
+ prev_have_ao_pass = have_ao_pass;
+
+ tag_modified();
+
+ /* Debug logging. */
+ if (VLOG_IS_ON(2)) {
+ VLOG(2) << "Effective scene passes:";
+ for (const Pass *pass : scene->passes) {
+ VLOG(2) << "- " << *pass;
+ }
}
}
-int Film::get_aov_offset(Scene *scene, string name, bool &is_color)
+void Film::add_auto_pass(Scene *scene, PassType type, const char *name)
{
- int num_color = 0, num_value = 0;
- foreach (const Pass &pass, scene->passes) {
- if (pass.type == PASS_AOV_COLOR) {
- num_color++;
- }
- else if (pass.type == PASS_AOV_VALUE) {
- num_value++;
+ add_auto_pass(scene, type, PassMode::NOISY, name);
+}
+
+void Film::add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name)
+{
+ Pass *pass = new Pass();
+ pass->set_type(type);
+ pass->set_mode(mode);
+ pass->set_name(ustring((name) ? name : ""));
+ pass->is_auto_ = true;
+
+ pass->set_owner(scene);
+ scene->passes.push_back(pass);
+}
+
+void Film::remove_auto_passes(Scene *scene)
+{
+ /* Remove all passes which were automatically created. */
+ vector<Pass *> new_passes;
+
+ for (Pass *pass : scene->passes) {
+ if (!pass->is_auto_) {
+ new_passes.push_back(pass);
}
else {
- continue;
- }
-
- if (pass.name == name) {
- is_color = (pass.type == PASS_AOV_COLOR);
- return (is_color ? num_color : num_value) - 1;
+ delete pass;
}
}
- return -1;
+ scene->passes = new_passes;
}
-int Film::get_pass_stride() const
+static bool compare_pass_order(const Pass *a, const Pass *b)
{
- return pass_stride;
-}
+ const int num_components_a = a->get_info().num_components;
+ const int num_components_b = b->get_info().num_components;
-int Film::get_denoising_data_offset() const
-{
- return denoising_data_offset;
+ if (num_components_a == num_components_b) {
+ return (a->get_type() < b->get_type());
+ }
+
+ return num_components_a > num_components_b;
}
-int Film::get_denoising_clean_offset() const
+void Film::finalize_passes(Scene *scene, const bool use_denoise)
{
- return denoising_clean_offset;
+ /* Remove duplicate passes. */
+ vector<Pass *> new_passes;
+
+ for (Pass *pass : scene->passes) {
+ /* Disable denoising on passes if denoising is disabled, or if the
+ * pass does not support it. */
+ pass->set_mode((use_denoise && pass->get_info().support_denoise) ? pass->get_mode() :
+ PassMode::NOISY);
+
+ /* Merge duplicate passes. */
+ bool duplicate_found = false;
+ for (Pass *new_pass : new_passes) {
+ /* If different type or denoising, don't merge. */
+ if (new_pass->get_type() != pass->get_type() || new_pass->get_mode() != pass->get_mode()) {
+ continue;
+ }
+
+ /* If both passes have a name and the names are different, don't merge.
+ * If either pass has a name, we'll use that name. */
+ if (!pass->get_name().empty() && !new_pass->get_name().empty() &&
+ pass->get_name() != new_pass->get_name()) {
+ continue;
+ }
+
+ if (!pass->get_name().empty() && new_pass->get_name().empty()) {
+ new_pass->set_name(pass->get_name());
+ }
+
+ new_pass->is_auto_ &= pass->is_auto_;
+ duplicate_found = true;
+ break;
+ }
+
+ if (!duplicate_found) {
+ new_passes.push_back(pass);
+ }
+ else {
+ delete pass;
+ }
+ }
+
+  /* Order by components and type. This is required for AOVs and cryptomatte passes,
+   * which the kernel assumes to be in order. Note this must use a stable sort so cryptomatte
+   * passes remain in the right order. */
+ stable_sort(new_passes.begin(), new_passes.end(), compare_pass_order);
+
+ scene->passes = new_passes;
}
-size_t Film::get_filter_table_offset() const
+uint Film::get_kernel_features(const Scene *scene) const
{
- return filter_table_offset;
+ uint kernel_features = 0;
+
+ for (const Pass *pass : scene->passes) {
+ if (!pass->is_written()) {
+ continue;
+ }
+
+ const PassType pass_type = pass->get_type();
+ const PassMode pass_mode = pass->get_mode();
+
+ if (pass_mode == PassMode::DENOISED || pass_type == PASS_DENOISING_NORMAL ||
+ pass_type == PASS_DENOISING_ALBEDO) {
+ kernel_features |= KERNEL_FEATURE_DENOISING;
+ }
+
+ if (pass_type != PASS_NONE && pass_type != PASS_COMBINED &&
+ pass_type <= PASS_CATEGORY_LIGHT_END) {
+ kernel_features |= KERNEL_FEATURE_LIGHT_PASSES;
+
+ if (pass_type == PASS_SHADOW) {
+ kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
+ }
+ }
+
+ if (pass_type == PASS_AO) {
+ kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
+ }
+ }
+
+ return kernel_features;
}
CCL_NAMESPACE_END
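
The duplicate handling in Film::finalize_passes() above is compact but subtle, so here is a minimal standalone sketch of just the merge rule. DemoPass is a stand-in struct rather than the real Pass node; the conditions mirror the comments in the function: passes merge only when type and mode match, two different non-empty names prevent a merge, a named pass donates its name to an unnamed duplicate, and the merged pass stays auto-generated only if every duplicate was.

#include <string>
#include <vector>

/* Stand-in for the real Pass node; only the fields needed by the merge rule. */
struct DemoPass {
  int type = 0;
  int mode = 0; /* 0 = noisy, 1 = denoised. */
  std::string name;
  bool is_auto = false;
};

/* Mirrors the duplicate handling in Film::finalize_passes(). */
std::vector<DemoPass> merge_duplicates(const std::vector<DemoPass> &passes)
{
  std::vector<DemoPass> merged;
  for (const DemoPass &pass : passes) {
    bool duplicate_found = false;
    for (DemoPass &existing : merged) {
      /* Different type or mode: keep both. */
      if (existing.type != pass.type || existing.mode != pass.mode) {
        continue;
      }
      /* Two different non-empty names: keep both. */
      if (!pass.name.empty() && !existing.name.empty() && pass.name != existing.name) {
        continue;
      }
      /* If either pass has a name, the merged pass keeps it. */
      if (!pass.name.empty() && existing.name.empty()) {
        existing.name = pass.name;
      }
      /* The merged pass is auto-generated only if all duplicates were. */
      existing.is_auto = existing.is_auto && pass.is_auto;
      duplicate_found = true;
      break;
    }
    if (!duplicate_found) {
      merged.push_back(pass);
    }
  }
  return merged;
}

int main()
{
  std::vector<DemoPass> passes = {
      {1, 0, "", true},          /* Auto-added display pass. */
      {1, 0, "Combined", false}, /* Explicitly requested pass of the same type and mode. */
  };
  std::vector<DemoPass> merged = merge_duplicates(passes);
  /* One pass remains, named "Combined", and it is no longer auto-generated. */
  return merged.size() == 1 && merged[0].name == "Combined" && !merged[0].is_auto ? 0 : 1;
}

The stable sort by component count then runs on the merged list, keeping cryptomatte passes in their original relative order.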
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 462a7275491..5d327353361 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -17,6 +17,7 @@
#ifndef __FILM_H__
#define __FILM_H__
+#include "render/pass.h"
#include "util/util_string.h"
#include "util/util_vector.h"
@@ -38,36 +39,15 @@ typedef enum FilterType {
FILTER_NUM_TYPES,
} FilterType;
-class Pass : public Node {
- public:
- NODE_DECLARE
-
- Pass();
-
- PassType type;
- int components;
- bool filter;
- bool exposure;
- PassType divide_type;
- ustring name;
-
- static void add(PassType type, vector<Pass> &passes, const char *name = NULL);
- static bool equals(const vector<Pass> &A, const vector<Pass> &B);
- static bool contains(const vector<Pass> &passes, PassType);
-};
-
class Film : public Node {
public:
NODE_DECLARE
NODE_SOCKET_API(float, exposure)
- NODE_SOCKET_API(bool, denoising_data_pass)
- NODE_SOCKET_API(bool, denoising_clean_pass)
- NODE_SOCKET_API(bool, denoising_prefiltered_pass)
- NODE_SOCKET_API(int, denoising_flags)
NODE_SOCKET_API(float, pass_alpha_threshold)
NODE_SOCKET_API(PassType, display_pass)
+ NODE_SOCKET_API(bool, show_active_pixels)
NODE_SOCKET_API(FilterType, filter_type)
NODE_SOCKET_API(float, filter_width)
@@ -76,17 +56,18 @@ class Film : public Node {
NODE_SOCKET_API(float, mist_depth)
NODE_SOCKET_API(float, mist_falloff)
- NODE_SOCKET_API(bool, use_light_visibility)
NODE_SOCKET_API(CryptomatteType, cryptomatte_passes)
NODE_SOCKET_API(int, cryptomatte_depth)
- NODE_SOCKET_API(bool, use_adaptive_sampling)
+  /* Approximate the shadow catcher pass into its matte pass, so that both artificial objects and
+ * shadows can be alpha-overed onto a backdrop. */
+ NODE_SOCKET_API(bool, use_approximate_shadow_catcher)
private:
- int pass_stride;
- int denoising_data_offset;
- int denoising_clean_offset;
- size_t filter_table_offset;
+ size_t filter_table_offset_;
+ bool prev_have_uv_pass = false;
+ bool prev_have_motion_pass = false;
+ bool prev_have_ao_pass = false;
public:
Film();
@@ -98,14 +79,20 @@ class Film : public Node {
void device_update(Device *device, DeviceScene *dscene, Scene *scene);
void device_free(Device *device, DeviceScene *dscene, Scene *scene);
- void tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes = true);
-
int get_aov_offset(Scene *scene, string name, bool &is_color);
- int get_pass_stride() const;
- int get_denoising_data_offset() const;
- int get_denoising_clean_offset() const;
- size_t get_filter_table_offset() const;
+ /* Update passes so that they contain all passes required for the configured functionality.
+ *
+   * If `add_sample_count_pass` is true then the SAMPLE_COUNT pass is guaranteed to be added. */
+ void update_passes(Scene *scene, bool add_sample_count_pass);
+
+ uint get_kernel_features(const Scene *scene) const;
+
+ private:
+ void add_auto_pass(Scene *scene, PassType type, const char *name = nullptr);
+ void add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name = nullptr);
+ void remove_auto_passes(Scene *scene);
+ void finalize_passes(Scene *scene, const bool use_denoise);
};
CCL_NAMESPACE_END
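
For the get_aov_offset() declaration above, the returned offset is not a pass index: value and color AOVs are counted separately while walking the pass list, and the first pass matching the requested name decides which running total is returned. Below is a minimal standalone sketch of that lookup, using a plain struct in place of the Pass node.

#include <string>
#include <vector>

/* Stand-in for AOV passes; 4 components for color AOVs, 1 for value AOVs. */
struct DemoAov {
  std::string name;
  bool is_color = false;
  int num_components = 1;
};

/* Mirrors Film::get_aov_offset(): returns the offset within the color or value
 * AOV storage, and reports which of the two the named AOV belongs to. */
int demo_aov_offset(const std::vector<DemoAov> &aovs, const std::string &name, bool &is_color)
{
  int offset_color = 0, offset_value = 0;
  for (const DemoAov &aov : aovs) {
    if (aov.name == name) {
      is_color = aov.is_color;
      return is_color ? offset_color : offset_value;
    }
    if (aov.is_color) {
      offset_color += aov.num_components;
    }
    else {
      offset_value += aov.num_components;
    }
  }
  return -1; /* No AOV with this name. */
}

int main()
{
  const std::vector<DemoAov> aovs = {
      {"mask", false, 1}, {"tint", true, 4}, {"heat", false, 1}};
  bool is_color = false;
  /* "heat" is the second value AOV, so its offset in the value storage is 1. */
  return demo_aov_offset(aovs, "heat", is_color) == 1 && !is_color ? 0 : 1;
}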
diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp
index 7ec1d2d9abb..6804a006fe6 100644
--- a/intern/cycles/render/geometry.cpp
+++ b/intern/cycles/render/geometry.cpp
@@ -215,6 +215,12 @@ void Geometry::compute_bvh(
msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
Object object;
+
+      /* Ensure all visibility bits are set at the geometry level BVH. The
+       * object level BVH is where the actual visibility is tested. */
+ object.set_is_shadow_catcher(true);
+ object.set_visibility(~0);
+
object.set_geometry(this);
vector<Geometry *> geometry;
@@ -315,7 +321,7 @@ void GeometryManager::update_osl_attributes(Device *device,
{
#ifdef WITH_OSL
/* for OSL, a hash map is used to lookup the attribute by name. */
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
og->object_name_map.clear();
og->attribute_map.clear();
@@ -1855,8 +1861,8 @@ void GeometryManager::device_update(Device *device,
});
Camera *dicing_camera = scene->dicing_camera;
- dicing_camera->set_screen_size_and_resolution(
- dicing_camera->get_full_width(), dicing_camera->get_full_height(), 1);
+ dicing_camera->set_screen_size(dicing_camera->get_full_width(),
+ dicing_camera->get_full_height());
dicing_camera->update(scene);
size_t i = 0;
@@ -2157,7 +2163,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc
dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE;
#ifdef WITH_OSL
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
if (og) {
og->object_name_map.clear();
diff --git a/intern/cycles/render/gpu_display.cpp b/intern/cycles/render/gpu_display.cpp
new file mode 100644
index 00000000000..a8f0cc50583
--- /dev/null
+++ b/intern/cycles/render/gpu_display.cpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/gpu_display.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void GPUDisplay::reset(const BufferParams &buffer_params)
+{
+ thread_scoped_lock lock(mutex_);
+
+ const GPUDisplayParams old_params = params_;
+
+ params_.offset = make_int2(buffer_params.full_x, buffer_params.full_y);
+ params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height);
+ params_.size = make_int2(buffer_params.width, buffer_params.height);
+
+  /* If the parameters did change, tag the texture as unusable. This avoids drawing old texture
+   * content in an updated configuration of the viewport. For example, it avoids drawing an old
+   * frame when the render border did change.
+   * If the parameters did not change, allow drawing the current state of the texture, which will
+   * not count as an up-to-date redraw. This avoids flickering during camera navigation by showing
+   * a previously rendered frame until the new one is ready. */
+ if (old_params.modified(params_)) {
+ texture_state_.is_usable = false;
+ }
+
+ texture_state_.is_outdated = true;
+}
+
+void GPUDisplay::mark_texture_updated()
+{
+ texture_state_.is_outdated = false;
+ texture_state_.is_usable = true;
+}
+
+/* --------------------------------------------------------------------
+ * Update procedure.
+ */
+
+bool GPUDisplay::update_begin(int texture_width, int texture_height)
+{
+ DCHECK(!update_state_.is_active);
+
+ if (update_state_.is_active) {
+ LOG(ERROR) << "Attempt to re-activate update process.";
+ return false;
+ }
+
+ /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
+ * The update itself is non-blocking however, for better performance and to avoid
+ * potential deadlocks due to locks held by the subclass. */
+ GPUDisplayParams params;
+ {
+ thread_scoped_lock lock(mutex_);
+ params = params_;
+ texture_state_.size = make_int2(texture_width, texture_height);
+ }
+
+ if (!do_update_begin(params, texture_width, texture_height)) {
+ LOG(ERROR) << "GPUDisplay implementation could not begin update.";
+ return false;
+ }
+
+ update_state_.is_active = true;
+
+ return true;
+}
+
+void GPUDisplay::update_end()
+{
+ DCHECK(update_state_.is_active);
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to deactivate inactive update process.";
+ return;
+ }
+
+ do_update_end();
+
+ update_state_.is_active = false;
+}
+
+int2 GPUDisplay::get_texture_size() const
+{
+ return texture_state_.size;
+}
+
+/* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ */
+
+void GPUDisplay::copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
+{
+ DCHECK(update_state_.is_active);
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+ return;
+ }
+
+ mark_texture_updated();
+ do_copy_pixels_to_texture(rgba_pixels, texture_x, texture_y, pixels_width, pixels_height);
+}
+
+/* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ */
+
+half4 *GPUDisplay::map_texture_buffer()
+{
+ DCHECK(!texture_buffer_state_.is_mapped);
+ DCHECK(update_state_.is_active);
+
+ if (texture_buffer_state_.is_mapped) {
+ LOG(ERROR) << "Attempt to re-map an already mapped texture buffer.";
+ return nullptr;
+ }
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+ return nullptr;
+ }
+
+ half4 *mapped_rgba_pixels = do_map_texture_buffer();
+
+ if (mapped_rgba_pixels) {
+ texture_buffer_state_.is_mapped = true;
+ }
+
+ return mapped_rgba_pixels;
+}
+
+void GPUDisplay::unmap_texture_buffer()
+{
+ DCHECK(texture_buffer_state_.is_mapped);
+
+ if (!texture_buffer_state_.is_mapped) {
+ LOG(ERROR) << "Attempt to unmap non-mapped texture buffer.";
+ return;
+ }
+
+ texture_buffer_state_.is_mapped = false;
+
+ mark_texture_updated();
+ do_unmap_texture_buffer();
+}
+
+/* --------------------------------------------------------------------
+ * Graphics interoperability.
+ */
+
+DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get()
+{
+ DCHECK(!texture_buffer_state_.is_mapped);
+ DCHECK(update_state_.is_active);
+
+ if (texture_buffer_state_.is_mapped) {
+ LOG(ERROR)
+ << "Attempt to use graphics interoperability mode while the texture buffer is mapped.";
+ return DeviceGraphicsInteropDestination();
+ }
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to use graphics interoperability outside of GPUDisplay update.";
+ return DeviceGraphicsInteropDestination();
+ }
+
+ /* Assume that interop will write new values to the texture. */
+ mark_texture_updated();
+
+ return do_graphics_interop_get();
+}
+
+void GPUDisplay::graphics_interop_activate()
+{
+}
+
+void GPUDisplay::graphics_interop_deactivate()
+{
+}
+
+/* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+bool GPUDisplay::draw()
+{
+ /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
+ * The drawing itself is non-blocking however, for better performance and to avoid
+ * potential deadlocks due to locks held by the subclass. */
+ GPUDisplayParams params;
+ bool is_usable;
+ bool is_outdated;
+
+ {
+ thread_scoped_lock lock(mutex_);
+ params = params_;
+ is_usable = texture_state_.is_usable;
+ is_outdated = texture_state_.is_outdated;
+ }
+
+ if (is_usable) {
+ do_draw(params);
+ }
+
+ return !is_outdated;
+}
+
+CCL_NAMESPACE_END
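
The update contract enforced by the checks above (a successful update_begin() before any pixel transfer, followed by update_end()) is easiest to see in a short usage sketch. The caller below is hypothetical and not part of this patch; a concrete GPUDisplay subclass is assumed to exist elsewhere, and only the GPUDisplay methods it calls are taken from this file.

/* Hypothetical caller; assumes a concrete GPUDisplay subclass was created by the session. */
#include "render/gpu_display.h"

CCL_NAMESPACE_BEGIN

void update_display(GPUDisplay *display, const half4 *pixels, int width, int height)
{
  /* If no update is possible right now, no update_end() call is needed. */
  if (!display->update_begin(width, height)) {
    return;
  }

  /* Simplest, most portable path: upload pixels from a CPU-side buffer.
   * Graphics interoperability or texture buffer mapping could be used here instead. */
  display->copy_pixels_to_texture(pixels, 0, 0, width, height);

  display->update_end();
}

CCL_NAMESPACE_END

The drawing side then calls draw(), which returns false while the texture content is still outdated, signalling that another redraw will be needed once a newer update arrives.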
diff --git a/intern/cycles/render/gpu_display.h b/intern/cycles/render/gpu_display.h
new file mode 100644
index 00000000000..cbe347895a1
--- /dev/null
+++ b/intern/cycles/render/gpu_display.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_graphics_interop.h"
+#include "util/util_half.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+/* The GPUDisplay class takes care of drawing the render result in a viewport. The render result is
+ * stored in a GPU-side texture, which is updated from a path tracer and drawn by an application.
+ *
+ * The base GPUDisplay does some special texture state tracking, which allows the render Session to
+ * decide whether a reset for an updated state is possible. This state should only be tracked in the
+ * base class; a particular implementation should not worry about it.
+ *
+ * Subclasses should only implement the pure virtual methods, which frees them from worrying about
+ * parent method calls and helps keep them as small and reliable as possible. */
+
+class GPUDisplayParams {
+ public:
+ /* Offset of the display within a viewport.
+ * For example, set to a lower-bottom corner of border render in Blender's viewport. */
+ int2 offset = make_int2(0, 0);
+
+ /* Full viewport size.
+ *
+ * NOTE: Is not affected by the resolution divider. */
+ int2 full_size = make_int2(0, 0);
+
+ /* Effective viewport size.
+ * In the case of border render, size of the border rectangle.
+ *
+ * NOTE: Is not affected by the resolution divider. */
+ int2 size = make_int2(0, 0);
+
+ bool modified(const GPUDisplayParams &other) const
+ {
+ return !(offset == other.offset && full_size == other.full_size && size == other.size);
+ }
+};
+
+class GPUDisplay {
+ public:
+ GPUDisplay() = default;
+ virtual ~GPUDisplay() = default;
+
+ /* Reset the display for the new state of the render session. Is called whenever the session is reset,
+ * which happens on changes like viewport navigation or viewport dimension change.
+ *
+ * This call will configure parameters for a changed buffer and reset the texture state. */
+ void reset(const BufferParams &buffer_params);
+
+ const GPUDisplayParams &get_params() const
+ {
+ return params_;
+ }
+
+ /* --------------------------------------------------------------------
+ * Update procedure.
+ *
+ * These calls indicate the caller's desire to update the content of the displayed texture. */
+
+ /* Returns true when update is ready. Update should be finished with update_end().
+ *
+ * If false is returned then no update is possible, and no update_end() call is needed.
+ *
+ * The texture width and height denote the actual resolution of the underlying render result. */
+ bool update_begin(int texture_width, int texture_height);
+
+ void update_end();
+
+ /* Get the currently configured texture size of the display (as configured by `update_begin()`). */
+ int2 get_texture_size() const;
+
+ /* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ *
+ * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`.
+ *
+ * Most portable implementation, which must be supported by all platforms. Might not be the most
+ * efficient one.
+ */
+
+ /* Copy a buffer of rendered pixels of a given size into a given position of the texture.
+ *
+ * This function does not acquire a lock. The reason for this is to allow use of this function
+ * for partial updates from different devices. In this case the caller will acquire the lock
+ * once, update all the slices, and release the lock once. This ensures that draw() will never
+ * use a partially updated texture. */
+ void copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height);
+
+ /* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ *
+ * This functionality is used to update GPU-side texture content without the need to maintain a
+ * CPU-side buffer on the caller's side.
+ *
+ * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`.
+ *
+ * NOTE: Texture buffer can not be mapped while graphics interoperability is active. This means
+ * that `map_texture_buffer()` is not allowed between `graphics_interop_begin()` and
+ * `graphics_interop_end()` calls.
+ */
+
+ /* Map pixel memory from the texture to a buffer available for write from the CPU. The width and
+ * height define the requested size of the texture to write to.
+ * Upon success a non-null pointer is returned and the texture buffer is to be unmapped afterwards.
+ * If an error happens during mapping, or if mapping is not supported by this GPU display, a
+ * null pointer is returned and the buffer is NOT to be unmapped.
+ *
+ * NOTE: Usually the implementation will rely on a GPU context of some sort, and the GPU context
+ * often can not be bound to two threads simultaneously and can not be released from a
+ * different thread. This means that the mapping API should be used from a single thread only. */
+ half4 *map_texture_buffer();
+ void unmap_texture_buffer();
+
+ /* --------------------------------------------------------------------
+ * Graphics interoperability.
+ *
+ * A special code path which allows updating the texture content directly from the GPU compute
+ * device. Complementary part of DeviceGraphicsInterop.
+ *
+ * NOTE: Graphics interoperability can not be used while the texture buffer is mapped. This means
+ * that `graphics_interop_get()` is not allowed between `map_texture_buffer()` and
+ * `unmap_texture_buffer()` calls. */
+
+ /* Get GPUDisplay graphics interoperability information which acts as a destination for the
+ * device API. */
+ DeviceGraphicsInteropDestination graphics_interop_get();
+
+ /* (De)activate GPU display for graphics interoperability outside of regular display update
+ * routines. */
+ virtual void graphics_interop_activate();
+ virtual void graphics_interop_deactivate();
+
+ /* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+ /* Clear the texture by filling it with all zeroes.
+ *
+ * This call might happen in parallel with draw, but can never happen in parallel with the
+ * update.
+ *
+ * The actual zeroing can be deferred to a later moment. What is important is that after clear
+ * and before the next pixels update the drawing texture will be fully empty, and that a partial
+ * update after clear will write new pixel values only for the updated area, leaving everything
+ * else zeroed.
+ *
+ * If the GPU display supports graphics interoperability then zeroing the display is to be
+ * delegated to the device via the `DeviceGraphicsInteropDestination`. */
+ virtual void clear() = 0;
+
+ /* Draw the current state of the texture.
+ *
+ * Returns true if this call did draw an updated state of the texture. */
+ bool draw();
+
+ protected:
+ /* Implementation-specific calls which subclasses are to implement.
+ * These `do_foo()` methods correspond to their public `foo()` counterparts, but they are pure
+ * virtual to simplify their particular implementation. */
+ virtual bool do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height) = 0;
+ virtual void do_update_end() = 0;
+
+ virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
+ int texture_x,
+ int texture_y,
+ int pixels_width,
+ int pixels_height) = 0;
+
+ virtual half4 *do_map_texture_buffer() = 0;
+ virtual void do_unmap_texture_buffer() = 0;
+
+ /* Note that this might be called in parallel to do_update_begin() and do_update_end();
+ * the subclass is responsible for appropriate mutex locks to avoid multiple threads
+ * editing and drawing the texture at the same time. */
+ virtual void do_draw(const GPUDisplayParams &params) = 0;
+
+ virtual DeviceGraphicsInteropDestination do_graphics_interop_get() = 0;
+
+ private:
+ thread_mutex mutex_;
+ GPUDisplayParams params_;
+
+ /* Mark the texture content as updated.
+ * Used from places which know that the texture content has been brought up to date, so that
+ * drawing knows whether it can be performed, and whether drawing happened with an up-to-date
+ * texture state. */
+ void mark_texture_updated();
+
+ /* State of the update process. */
+ struct {
+ /* True when update is in process, indicated by `update_begin()` / `update_end()`. */
+ bool is_active = false;
+ } update_state_;
+
+ /* State of the texture, which is needed for integration with the render session and for
+ * interactive updates and navigation. */
+ struct {
+ /* Denotes whether the possibly existing state of the GPU-side texture is still usable.
+ * It will not be usable in cases like a change of the render border (in this case we don't want
+ * the previous texture to be drawn at all).
+ *
+ * However, if only navigation or an object in the scene changed, then the outdated state of the
+ * texture is still usable for drawing, preventing viewport flickering on navigation and
+ * object modifications. */
+ bool is_usable = false;
+
+ /* Texture is considered outdated after `reset()` until the next call of
+ * `copy_pixels_to_texture()`. */
+ bool is_outdated = true;
+
+ /* Texture size in pixels. */
+ int2 size = make_int2(0, 0);
+ } texture_state_;
+
+ /* State of the texture buffer. Is tracked to perform sanity checks. */
+ struct {
+ /* True when the texture buffer is mapped with `map_texture_buffer()`. */
+ bool is_mapped = false;
+ } texture_buffer_state_;
+};
+
+CCL_NAMESPACE_END
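
As a reference for the subclassing contract declared above, here is a minimal sketch of a GPUDisplay subclass that keeps the texture in a CPU-side vector. The DummyGPUDisplay class is hypothetical and not part of this patch; a real implementation would also need GPU resources and locking around the texture.

#include "render/gpu_display.h"

#include <algorithm>
#include <vector>

CCL_NAMESPACE_BEGIN

/* Hypothetical CPU-only display, illustrating the do_*() contract only. */
class DummyGPUDisplay : public GPUDisplay {
 public:
  void clear() override
  {
    std::fill(pixels_.begin(), pixels_.end(), half4());
  }

 protected:
  bool do_update_begin(const GPUDisplayParams & /*params*/,
                       int texture_width,
                       int texture_height) override
  {
    width_ = texture_width;
    pixels_.resize(size_t(texture_width) * texture_height);
    return true;
  }

  void do_update_end() override
  {
  }

  void do_copy_pixels_to_texture(const half4 *rgba_pixels,
                                 int texture_x,
                                 int texture_y,
                                 int pixels_width,
                                 int pixels_height) override
  {
    /* Copy the given slice row by row into the full-size buffer. */
    for (int y = 0; y < pixels_height; y++) {
      const half4 *src = rgba_pixels + size_t(y) * pixels_width;
      half4 *dst = pixels_.data() + size_t(texture_y + y) * width_ + texture_x;
      std::copy(src, src + pixels_width, dst);
    }
  }

  half4 *do_map_texture_buffer() override
  {
    return pixels_.data();
  }

  void do_unmap_texture_buffer() override
  {
  }

  DeviceGraphicsInteropDestination do_graphics_interop_get() override
  {
    /* No interoperability for a CPU-only display. */
    return DeviceGraphicsInteropDestination();
  }

  void do_draw(const GPUDisplayParams & /*params*/) override
  {
    /* A real implementation would blit pixels_ to the screen here, with locking
     * against concurrent updates. */
  }

 private:
  int width_ = 0;
  std::vector<half4> pixels_;
};

CCL_NAMESPACE_END
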
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 5102b182593..3584754fad1 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -224,10 +224,6 @@ class ShaderNode : public Node {
{
return false;
}
- virtual bool has_raytrace()
- {
- return false;
- }
vector<ShaderInput *> inputs;
vector<ShaderOutput *> outputs;
@@ -242,22 +238,13 @@ class ShaderNode : public Node {
* that those functions are for selective compilation only?
*/
- /* Nodes are split into several groups, group of level 0 contains
- * nodes which are most commonly used, further levels are extension
- * of previous one and includes less commonly used nodes.
- */
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_0;
- }
-
/* Node feature are used to disable huge nodes inside the group,
* so it's possible to disable huge nodes inside of the required
* nodes group.
*/
virtual int get_feature()
{
- return bump == SHADER_BUMP_NONE ? 0 : NODE_FEATURE_BUMP;
+ return bump == SHADER_BUMP_NONE ? 0 : KERNEL_FEATURE_NODE_BUMP;
}
/* Get closure ID to which the node compiles into. */
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index d8749cec9fa..d74d14242bb 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -53,6 +53,8 @@ NODE_DEFINE(Integrator)
SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
SOCKET_INT(ao_bounces, "AO Bounces", 0);
+ SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f);
+ SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX);
SOCKET_INT(volume_max_steps, "Volume Max Steps", 1024);
SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f);
@@ -66,33 +68,39 @@ NODE_DEFINE(Integrator)
SOCKET_BOOLEAN(motion_blur, "Motion Blur", false);
SOCKET_INT(aa_samples, "AA Samples", 0);
- SOCKET_INT(diffuse_samples, "Diffuse Samples", 1);
- SOCKET_INT(glossy_samples, "Glossy Samples", 1);
- SOCKET_INT(transmission_samples, "Transmission Samples", 1);
- SOCKET_INT(ao_samples, "AO Samples", 1);
- SOCKET_INT(mesh_light_samples, "Mesh Light Samples", 1);
- SOCKET_INT(subsurface_samples, "Subsurface Samples", 1);
- SOCKET_INT(volume_samples, "Volume Samples", 1);
SOCKET_INT(start_sample, "Start Sample", 0);
+ SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
- SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
- SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
- static NodeEnum method_enum;
- method_enum.insert("path", PATH);
- method_enum.insert("branched_path", BRANCHED_PATH);
- SOCKET_ENUM(method, "Method", method_enum, PATH);
-
static NodeEnum sampling_pattern_enum;
sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL);
- sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ);
sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ);
SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL);
+ static NodeEnum denoiser_type_enum;
+ denoiser_type_enum.insert("optix", DENOISER_OPTIX);
+ denoiser_type_enum.insert("openimagedenoise", DENOISER_OPENIMAGEDENOISE);
+
+ static NodeEnum denoiser_prefilter_enum;
+ denoiser_prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ denoiser_prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ denoiser_prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+
+ /* Default to accurate denoising with OpenImageDenoise. For the interactive viewport
+ * it's best to use OptiX and disable the normal pass since it does not always have
+ * the desired effect for that denoiser. */
+ SOCKET_BOOLEAN(use_denoise, "Use Denoiser", false);
+ SOCKET_ENUM(denoiser_type, "Denoiser Type", denoiser_type_enum, DENOISER_OPENIMAGEDENOISE);
+ SOCKET_INT(denoise_start_sample, "Start Sample to Denoise", 0);
+ SOCKET_BOOLEAN(use_denoise_pass_albedo, "Use Albedo Pass for Denoiser", true);
+ SOCKET_BOOLEAN(use_denoise_pass_normal, "Use Normal Pass for Denoiser", true);
+ SOCKET_ENUM(
+ denoiser_prefilter, "Denoiser Type", denoiser_prefilter_enum, DENOISER_PREFILTER_ACCURATE);
+
return type;
}
@@ -115,13 +123,20 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
}
});
- const bool need_update_lut = ao_samples_is_modified() || diffuse_samples_is_modified() ||
- glossy_samples_is_modified() || max_bounce_is_modified() ||
- max_transmission_bounce_is_modified() ||
- mesh_light_samples_is_modified() || method_is_modified() ||
- sampling_pattern_is_modified() ||
- subsurface_samples_is_modified() ||
- transmission_samples_is_modified() || volume_samples_is_modified();
+ KernelIntegrator *kintegrator = &dscene->data.integrator;
+
+ /* Adaptive sampling requires PMJ samples.
+ *
+ * This also makes detection of the sampling pattern a bit more involved: we can not rely on the
+ * changed state of the socket, since its value might be different from the effective value used
+ * here. So instead compare with the previous value in the KernelIntegrator. Only do it if the
+ * device was updated once (in which case the `sample_pattern_lut` will have a non-zero size). */
+ const SamplingPattern new_sampling_pattern = (use_adaptive_sampling) ? SAMPLING_PATTERN_PMJ :
+ sampling_pattern;
+
+ const bool need_update_lut = max_bounce_is_modified() || max_transmission_bounce_is_modified() ||
+ dscene->sample_pattern_lut.size() == 0 ||
+ kintegrator->sampling_pattern != new_sampling_pattern;
if (need_update_lut) {
dscene->sample_pattern_lut.tag_realloc();
@@ -129,8 +144,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
device_free(device, dscene);
- KernelIntegrator *kintegrator = &dscene->data.integrator;
-
/* integrator parameters */
kintegrator->min_bounce = min_bounce + 1;
kintegrator->max_bounce = max_bounce + 1;
@@ -143,12 +156,9 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
- if (ao_bounces == 0) {
- kintegrator->ao_bounces = INT_MAX;
- }
- else {
- kintegrator->ao_bounces = ao_bounces - 1;
- }
+ kintegrator->ao_bounces = ao_bounces;
+ kintegrator->ao_bounces_distance = ao_distance;
+ kintegrator->ao_bounces_factor = ao_factor;
/* Transparent Shadows
* We only need to enable transparent shadows, if we actually have
@@ -171,10 +181,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->caustics_refractive = caustics_refractive;
kintegrator->filter_glossy = (filter_glossy == 0.0f) ? FLT_MAX : 1.0f / filter_glossy;
- kintegrator->seed = hash_uint2(seed, 0);
-
- kintegrator->use_ambient_occlusion = ((Pass::contains(scene->passes, PASS_AO)) ||
- dscene->data.background.ao_factor != 0.0f);
+ kintegrator->seed = seed;
kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f) ? FLT_MAX :
sample_clamp_direct * 3.0f;
@@ -182,51 +189,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
FLT_MAX :
sample_clamp_indirect * 3.0f;
- kintegrator->branched = (method == BRANCHED_PATH) && device->info.has_branched_path;
- kintegrator->volume_decoupled = device->info.has_volume_decoupled;
- kintegrator->diffuse_samples = diffuse_samples;
- kintegrator->glossy_samples = glossy_samples;
- kintegrator->transmission_samples = transmission_samples;
- kintegrator->ao_samples = ao_samples;
- kintegrator->mesh_light_samples = mesh_light_samples;
- kintegrator->subsurface_samples = subsurface_samples;
- kintegrator->volume_samples = volume_samples;
- kintegrator->start_sample = start_sample;
-
- if (kintegrator->branched) {
- kintegrator->sample_all_lights_direct = sample_all_lights_direct;
- kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
- }
- else {
- kintegrator->sample_all_lights_direct = false;
- kintegrator->sample_all_lights_indirect = false;
- }
-
- kintegrator->sampling_pattern = sampling_pattern;
- kintegrator->aa_samples = aa_samples;
- if (aa_samples > 0 && adaptive_min_samples == 0) {
- kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
- VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
- << kintegrator->adaptive_min_samples;
- }
- else {
- kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
- }
-
- kintegrator->adaptive_step = 4;
- kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample;
-
- /* Adaptive step must be a power of two for bitwise operations to work. */
- assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0);
-
- if (aa_samples > 0 && adaptive_threshold == 0.0f) {
- kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
- VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
- << kintegrator->adaptive_threshold;
- }
- else {
- kintegrator->adaptive_threshold = adaptive_threshold;
- }
+ kintegrator->sampling_pattern = new_sampling_pattern;
if (light_sampling_threshold > 0.0f) {
kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
@@ -236,29 +199,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
}
/* sobol directions table */
- int max_samples = 1;
-
- if (kintegrator->branched) {
- foreach (Light *light, scene->lights)
- max_samples = max(max_samples, light->get_samples());
-
- max_samples = max(max_samples,
- max(diffuse_samples, max(glossy_samples, transmission_samples)));
- max_samples = max(max_samples, max(ao_samples, max(mesh_light_samples, subsurface_samples)));
- max_samples = max(max_samples, volume_samples);
- }
-
- uint total_bounces = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX +
- max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES);
-
- max_samples *= total_bounces;
+ int max_samples = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX +
+ max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES);
int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM;
dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS);
if (need_update_lut) {
- if (sampling_pattern == SAMPLING_PATTERN_SOBOL) {
- uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
+ if (kintegrator->sampling_pattern == SAMPLING_PATTERN_SOBOL) {
+ uint *directions = (uint *)dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
@@ -276,10 +225,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j));
}
pool.wait_work();
+
dscene->sample_pattern_lut.copy_to_device();
}
}
+ kintegrator->has_shadow_catcher = scene->has_shadow_catcher();
+
dscene->sample_pattern_lut.clear_modified();
clear_modified();
}
@@ -295,17 +247,12 @@ void Integrator::tag_update(Scene *scene, uint32_t flag)
tag_modified();
}
- if (flag & (AO_PASS_MODIFIED | BACKGROUND_AO_MODIFIED)) {
+ if (flag & AO_PASS_MODIFIED) {
/* tag only the ao_bounces socket as modified so we avoid updating sample_pattern_lut
* unnecessarily */
tag_ao_bounces_modified();
}
- if ((flag & LIGHT_SAMPLES_MODIFIED) && (method == BRANCHED_PATH)) {
- /* the number of light samples may affect the size of the sample_pattern_lut */
- tag_sampling_pattern_modified();
- }
-
if (filter_glossy_is_modified()) {
foreach (Shader *shader, scene->shaders) {
if (shader->has_integrator_dependency) {
@@ -321,4 +268,65 @@ void Integrator::tag_update(Scene *scene, uint32_t flag)
}
}
+AdaptiveSampling Integrator::get_adaptive_sampling() const
+{
+ AdaptiveSampling adaptive_sampling;
+
+ adaptive_sampling.use = use_adaptive_sampling;
+
+ if (!adaptive_sampling.use) {
+ return adaptive_sampling;
+ }
+
+ if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+ adaptive_sampling.threshold = max(0.001f, 1.0f / (float)aa_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic threshold = " << adaptive_sampling.threshold;
+ }
+ else {
+ adaptive_sampling.threshold = adaptive_threshold;
+ }
+
+ if (adaptive_sampling.threshold > 0 && adaptive_min_samples == 0) {
+ /* Threshold 0.1 -> 32, 0.01 -> 64, 0.001 -> 128.
+ * This is highly scene dependent; we make a guess that seemed to work well
+ * in various test scenes. */
+ const int min_samples = (int)ceilf(16.0f / powf(adaptive_sampling.threshold, 0.3f));
+ adaptive_sampling.min_samples = max(4, min_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+ << adaptive_sampling.min_samples;
+ }
+ else {
+ adaptive_sampling.min_samples = max(4, adaptive_min_samples);
+ }
+
+ /* Arbitrary factor that makes the threshold more similar to what it was before,
+ * and gives arguably more intuitive values. */
+ adaptive_sampling.threshold *= 5.0f;
+
+ adaptive_sampling.adaptive_step = 16;
+
+ DCHECK(is_power_of_two(adaptive_sampling.adaptive_step))
+ << "Adaptive step must be a power of two for bitwise operations to work";
+
+ return adaptive_sampling;
+}
+
+DenoiseParams Integrator::get_denoise_params() const
+{
+ DenoiseParams denoise_params;
+
+ denoise_params.use = use_denoise;
+
+ denoise_params.type = denoiser_type;
+
+ denoise_params.start_sample = denoise_start_sample;
+
+ denoise_params.use_pass_albedo = use_denoise_pass_albedo;
+ denoise_params.use_pass_normal = use_denoise_pass_normal;
+
+ denoise_params.prefilter = denoiser_prefilter;
+
+ return denoise_params;
+}
+
CCL_NAMESPACE_END
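
The automatic adaptive-sampling heuristics in get_adaptive_sampling() can be sanity-checked in isolation. The standalone sketch below (not part of this patch) reproduces the threshold auto-detection and the minimum-sample formula with plain <cmath>; it yields the same table as the code comment above: threshold 0.1 -> 32, 0.01 -> 64, 0.001 -> 128 minimum samples.

#include <algorithm>
#include <cmath>
#include <cstdio>

/* Same heuristic as Integrator::get_adaptive_sampling(): at least 4 samples,
 * more for lower (stricter) noise thresholds. */
static int auto_min_samples(float threshold)
{
  return std::max(4, (int)std::ceil(16.0f / std::pow(threshold, 0.3f)));
}

int main()
{
  const int aa_samples = 128;
  float threshold = 0.0f; /* 0 means "derive automatically from the sample count". */

  if (aa_samples > 0 && threshold == 0.0f) {
    threshold = std::max(0.001f, 1.0f / (float)aa_samples);
  }

  const int min_samples = auto_min_samples(threshold);

  /* The effective threshold is scaled by 5 afterwards, for more intuitive values. */
  const float effective_threshold = threshold * 5.0f;

  std::printf("auto threshold %g -> effective %g, min samples %d\n",
              (double)threshold,
              (double)effective_threshold,
              min_samples);
  std::printf("0.1 -> %d, 0.01 -> %d, 0.001 -> %d min samples\n",
              auto_min_samples(0.1f),
              auto_min_samples(0.01f),
              auto_min_samples(0.001f));
  return 0;
}
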
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 4eeeda92d41..32e108d62ca 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -19,7 +19,9 @@
#include "kernel/kernel_types.h"
+#include "device/device_denoise.h" /* For the paramaters and type enum. */
#include "graph/node.h"
+#include "integrator/adaptive_sampling.h"
CCL_NAMESPACE_BEGIN
@@ -43,6 +45,8 @@ class Integrator : public Node {
NODE_SOCKET_API(int, transparent_max_bounce)
NODE_SOCKET_API(int, ao_bounces)
+ NODE_SOCKET_API(float, ao_factor)
+ NODE_SOCKET_API(float, ao_distance)
NODE_SOCKET_API(int, volume_max_steps)
NODE_SOCKET_API(float, volume_step_rate)
@@ -62,37 +66,26 @@ class Integrator : public Node {
static const int MAX_SAMPLES = (1 << 24);
NODE_SOCKET_API(int, aa_samples)
- NODE_SOCKET_API(int, diffuse_samples)
- NODE_SOCKET_API(int, glossy_samples)
- NODE_SOCKET_API(int, transmission_samples)
- NODE_SOCKET_API(int, ao_samples)
- NODE_SOCKET_API(int, mesh_light_samples)
- NODE_SOCKET_API(int, subsurface_samples)
- NODE_SOCKET_API(int, volume_samples)
NODE_SOCKET_API(int, start_sample)
- NODE_SOCKET_API(bool, sample_all_lights_direct)
- NODE_SOCKET_API(bool, sample_all_lights_indirect)
NODE_SOCKET_API(float, light_sampling_threshold)
+ NODE_SOCKET_API(bool, use_adaptive_sampling)
NODE_SOCKET_API(int, adaptive_min_samples)
NODE_SOCKET_API(float, adaptive_threshold)
- enum Method {
- BRANCHED_PATH = 0,
- PATH = 1,
-
- NUM_METHODS,
- };
-
- NODE_SOCKET_API(Method, method)
-
NODE_SOCKET_API(SamplingPattern, sampling_pattern)
+ NODE_SOCKET_API(bool, use_denoise);
+ NODE_SOCKET_API(DenoiserType, denoiser_type);
+ NODE_SOCKET_API(int, denoise_start_sample);
+ NODE_SOCKET_API(bool, use_denoise_pass_albedo);
+ NODE_SOCKET_API(bool, use_denoise_pass_normal);
+ NODE_SOCKET_API(DenoiserPrefilter, denoiser_prefilter);
+
enum : uint32_t {
AO_PASS_MODIFIED = (1 << 0),
- BACKGROUND_AO_MODIFIED = (1 << 1),
- LIGHT_SAMPLES_MODIFIED = (1 << 2),
+ OBJECT_MANAGER = (1 << 1),
/* tag everything in the manager for an update */
UPDATE_ALL = ~0u,
@@ -107,6 +100,9 @@ class Integrator : public Node {
void device_free(Device *device, DeviceScene *dscene, bool force_free = false);
void tag_update(Scene *scene, uint32_t flag);
+
+ AdaptiveSampling get_adaptive_sampling() const;
+ DenoiseParams get_denoise_params() const;
};
CCL_NAMESPACE_END
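
The two new getters bundle the adaptive-sampling and denoiser settings so callers do not need to read individual sockets. Below is a hypothetical usage sketch, assuming a Scene pointer as used elsewhere in this patch; the log_integrator_settings() helper is for illustration only and is not part of the patch.

#include "render/integrator.h"
#include "render/scene.h"

#include "util/util_logging.h"

CCL_NAMESPACE_BEGIN

/* Hypothetical helper, for illustration only. */
static void log_integrator_settings(Scene *scene)
{
  const AdaptiveSampling adaptive = scene->integrator->get_adaptive_sampling();
  const DenoiseParams denoise = scene->integrator->get_denoise_params();

  if (adaptive.use) {
    /* threshold and min_samples are derived automatically when left at 0. */
    VLOG(1) << "Adaptive sampling: threshold " << adaptive.threshold << ", min samples "
            << adaptive.min_samples;
  }

  if (denoise.use) {
    VLOG(1) << "Denoising enabled, starting at sample " << denoise.start_sample;
  }
}

CCL_NAMESPACE_END
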
diff --git a/intern/cycles/render/jitter.cpp b/intern/cycles/render/jitter.cpp
index fc47b0e8f0a..e31f8abd446 100644
--- a/intern/cycles/render/jitter.cpp
+++ b/intern/cycles/render/jitter.cpp
@@ -242,12 +242,6 @@ class PMJ02_Generator : public PMJ_Generator {
static void shuffle(float2 points[], int size, int rng_seed)
{
- /* Offset samples by 1.0 for faster scrambling in kernel_random.h */
- for (int i = 0; i < size; ++i) {
- points[i].x += 1.0f;
- points[i].y += 1.0f;
- }
-
if (rng_seed == 0) {
return;
}
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 15aa4e047b5..ae1150fc07b 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,12 +14,13 @@
* limitations under the License.
*/
-#include "render/light.h"
#include "device/device.h"
+
#include "render/background.h"
#include "render/film.h"
#include "render/graph.h"
#include "render/integrator.h"
+#include "render/light.h"
#include "render/mesh.h"
#include "render/nodes.h"
#include "render/object.h"
@@ -27,6 +28,8 @@
#include "render/shader.h"
#include "render/stats.h"
+#include "integrator/shader_eval.h"
+
#include "util/util_foreach.h"
#include "util/util_hash.h"
#include "util/util_logging.h"
@@ -43,63 +46,49 @@ static void shade_background_pixels(Device *device,
vector<float3> &pixels,
Progress &progress)
{
- /* create input */
- device_vector<uint4> d_input(device, "background_input", MEM_READ_ONLY);
- device_vector<float4> d_output(device, "background_output", MEM_READ_WRITE);
-
- uint4 *d_input_data = d_input.alloc(width * height);
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- float u = (x + 0.5f) / width;
- float v = (y + 0.5f) / height;
-
- uint4 in = make_uint4(__float_as_int(u), __float_as_int(v), 0, 0);
- d_input_data[x + y * width] = in;
- }
- }
-
- /* compute on device */
- d_output.alloc(width * height);
- d_output.zero_to_device();
- d_input.copy_to_device();
-
+ /* Needs to be up to date for attribute access. */
device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
- DeviceTask main_task(DeviceTask::SHADER);
- main_task.shader_input = d_input.device_pointer;
- main_task.shader_output = d_output.device_pointer;
- main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
- main_task.shader_x = 0;
- main_task.shader_w = width * height;
- main_task.num_samples = 1;
- main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);
-
- /* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */
- list<DeviceTask> split_tasks;
- main_task.split(split_tasks, 1, 128 * 128);
-
- foreach (DeviceTask &task, split_tasks) {
- device->task_add(task);
- device->task_wait();
- d_output.copy_from_device(task.shader_x, 1, task.shader_w);
- }
-
- d_input.free();
-
- float4 *d_output_data = d_output.data();
-
- pixels.resize(width * height);
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- pixels[y * width + x].x = d_output_data[y * width + x].x;
- pixels[y * width + x].y = d_output_data[y * width + x].y;
- pixels[y * width + x].z = d_output_data[y * width + x].z;
- }
- }
+ const int size = width * height;
+ pixels.resize(size);
+
+ /* Evaluate shader on device. */
+ ShaderEval shader_eval(device, progress);
+ shader_eval.eval(
+ SHADER_EVAL_BACKGROUND,
+ size,
+ [&](device_vector<KernelShaderEvalInput> &d_input) {
+ /* Fill coordinates for shading. */
+ KernelShaderEvalInput *d_input_data = d_input.data();
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ float u = (x + 0.5f) / width;
+ float v = (y + 0.5f) / height;
+
+ KernelShaderEvalInput in;
+ in.object = OBJECT_NONE;
+ in.prim = PRIM_NONE;
+ in.u = u;
+ in.v = v;
+ d_input_data[x + y * width] = in;
+ }
+ }
- d_output.free();
+ return size;
+ },
+ [&](device_vector<float4> &d_output) {
+ /* Copy output to pixel buffer. */
+ float4 *d_output_data = d_output.data();
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ pixels[y * width + x].x = d_output_data[y * width + x].x;
+ pixels[y * width + x].y = d_output_data[y * width + x].y;
+ pixels[y * width + x].z = d_output_data[y * width + x].z;
+ }
+ }
+ });
}
/* Light */
@@ -140,15 +129,16 @@ NODE_DEFINE(Light)
SOCKET_BOOLEAN(cast_shadow, "Cast Shadow", true);
SOCKET_BOOLEAN(use_mis, "Use Mis", false);
+ SOCKET_BOOLEAN(use_camera, "Use Camera", true);
SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true);
SOCKET_BOOLEAN(use_glossy, "Use Glossy", true);
SOCKET_BOOLEAN(use_transmission, "Use Transmission", true);
SOCKET_BOOLEAN(use_scatter, "Use Scatter", true);
- SOCKET_INT(samples, "Samples", 1);
SOCKET_INT(max_bounces, "Max Bounces", 1024);
SOCKET_UINT(random_id, "Random ID", 0);
+ SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", true);
SOCKET_BOOLEAN(is_portal, "Is Portal", false);
SOCKET_BOOLEAN(is_enabled, "Is Enabled", true);
@@ -166,10 +156,6 @@ void Light::tag_update(Scene *scene)
{
if (is_modified()) {
scene->light_manager->tag_update(scene, LightManager::LIGHT_MODIFIED);
-
- if (samples_is_modified()) {
- scene->integrator->tag_update(scene, Integrator::LIGHT_SAMPLES_MODIFIED);
- }
}
}
@@ -193,7 +179,6 @@ LightManager::LightManager()
{
update_flags = UPDATE_ALL;
need_update_background = true;
- use_light_visibility = false;
last_background_enabled = false;
last_background_resolution = 0;
}
@@ -357,21 +342,23 @@ void LightManager::device_update_distribution(Device *,
int object_id = j;
int shader_flag = 0;
+ if (!(object->get_visibility() & PATH_RAY_CAMERA)) {
+ shader_flag |= SHADER_EXCLUDE_CAMERA;
+ }
if (!(object->get_visibility() & PATH_RAY_DIFFUSE)) {
shader_flag |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_GLOSSY)) {
shader_flag |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_TRANSMIT)) {
shader_flag |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_VOLUME_SCATTER)) {
shader_flag |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
+ }
+ if (!(object->get_is_shadow_catcher())) {
+ shader_flag |= SHADER_EXCLUDE_SHADOW_CATCHER;
}
size_t mesh_num_triangles = mesh->num_triangles();
@@ -496,10 +483,10 @@ void LightManager::device_update_distribution(Device *,
kfilm->pass_shadow_scale = 1.0f;
if (kintegrator->pdf_triangles != 0.0f)
- kfilm->pass_shadow_scale *= 0.5f;
+ kfilm->pass_shadow_scale /= 0.5f;
if (num_background_lights < num_lights)
- kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights) / (float)num_lights;
+ kfilm->pass_shadow_scale /= (float)(num_lights - num_background_lights) / (float)num_lights;
/* CDF */
dscene->light_distribution.copy_to_device();
@@ -766,25 +753,26 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
if (!light->cast_shadow)
shader_id &= ~SHADER_CAST_SHADOW;
+ if (!light->use_camera) {
+ shader_id |= SHADER_EXCLUDE_CAMERA;
+ }
if (!light->use_diffuse) {
shader_id |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!light->use_glossy) {
shader_id |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!light->use_transmission) {
shader_id |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!light->use_scatter) {
shader_id |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
+ }
+ if (!light->is_shadow_catcher) {
+ shader_id |= SHADER_EXCLUDE_SHADOW_CATCHER;
}
klights[light_index].type = light->light_type;
- klights[light_index].samples = light->samples;
klights[light_index].strength[0] = light->strength.x;
klights[light_index].strength[1] = light->strength.y;
klights[light_index].strength[2] = light->strength.z;
@@ -836,19 +824,15 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
if (!(visibility & PATH_RAY_DIFFUSE)) {
shader_id |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_GLOSSY)) {
shader_id |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_TRANSMIT)) {
shader_id |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_VOLUME_SCATTER)) {
shader_id |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
}
}
else if (light->light_type == LIGHT_AREA) {
@@ -998,8 +982,6 @@ void LightManager::device_update(Device *device,
device_free(device, dscene, need_update_background);
- use_light_visibility = false;
-
device_update_points(device, dscene, scene);
if (progress.get_cancel())
return;
@@ -1018,8 +1000,6 @@ void LightManager::device_update(Device *device,
if (progress.get_cancel())
return;
- scene->film->set_use_light_visibility(use_light_visibility);
-
update_flags = UPDATE_NONE;
need_update_background = false;
}
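
device_update_distribution() and device_update_points() above now derive the same SHADER_EXCLUDE_* bits from ray visibility plus the new camera and shadow-catcher toggles. The helper below is a hypothetical consolidation of that mapping for mesh-light objects, shown only to make the pattern explicit; it is not part of this patch and assumes the includes of light.cpp.

/* Hypothetical helper mirroring the per-object flag setup above. */
static int object_light_exclusion_flags(Object *object)
{
  const uint visibility = object->get_visibility();
  int shader_flag = 0;

  if (!(visibility & PATH_RAY_CAMERA)) {
    shader_flag |= SHADER_EXCLUDE_CAMERA;
  }
  if (!(visibility & PATH_RAY_DIFFUSE)) {
    shader_flag |= SHADER_EXCLUDE_DIFFUSE;
  }
  if (!(visibility & PATH_RAY_GLOSSY)) {
    shader_flag |= SHADER_EXCLUDE_GLOSSY;
  }
  if (!(visibility & PATH_RAY_TRANSMIT)) {
    shader_flag |= SHADER_EXCLUDE_TRANSMIT;
  }
  if (!(visibility & PATH_RAY_VOLUME_SCATTER)) {
    shader_flag |= SHADER_EXCLUDE_SCATTER;
  }
  if (!object->get_is_shadow_catcher()) {
    shader_flag |= SHADER_EXCLUDE_SHADOW_CATCHER;
  }

  return shader_flag;
}
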
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index fbd709125ff..7f86237c8b3 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -69,16 +69,17 @@ class Light : public Node {
NODE_SOCKET_API(bool, cast_shadow)
NODE_SOCKET_API(bool, use_mis)
+ NODE_SOCKET_API(bool, use_camera)
NODE_SOCKET_API(bool, use_diffuse)
NODE_SOCKET_API(bool, use_glossy)
NODE_SOCKET_API(bool, use_transmission)
NODE_SOCKET_API(bool, use_scatter)
+ NODE_SOCKET_API(bool, is_shadow_catcher)
NODE_SOCKET_API(bool, is_portal)
NODE_SOCKET_API(bool, is_enabled)
NODE_SOCKET_API(Shader *, shader)
- NODE_SOCKET_API(int, samples)
NODE_SOCKET_API(int, max_bounces)
NODE_SOCKET_API(uint, random_id)
@@ -108,8 +109,6 @@ class LightManager {
UPDATE_NONE = 0u,
};
- bool use_light_visibility;
-
/* Need to update background (including multiple importance map) */
bool need_update_background;
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index b39d81023d9..c00c4c24211 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -16,6 +16,8 @@
#include "device/device.h"
+#include "integrator/shader_eval.h"
+
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
@@ -43,40 +45,28 @@ static float3 compute_face_normal(const Mesh::Triangle &t, float3 *verts)
return norm / normlen;
}
-bool GeometryManager::displace(
- Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
+/* Fill in coordinates for mesh displacement shader evaluation on device. */
+static int fill_shader_input(const Scene *scene,
+ const Mesh *mesh,
+ const int object_index,
+ device_vector<KernelShaderEvalInput> &d_input)
{
- /* verify if we have a displacement shader */
- if (!mesh->has_true_displacement()) {
- return false;
- }
-
- string msg = string_printf("Computing Displacement %s", mesh->name.c_str());
- progress.set_status("Updating Mesh", msg);
+ int d_input_size = 0;
+ KernelShaderEvalInput *d_input_data = d_input.data();
- /* find object index. todo: is arbitrary */
- size_t object_index = OBJECT_NONE;
+ const array<int> &mesh_shaders = mesh->get_shader();
+ const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
+ const array<float3> &mesh_verts = mesh->get_verts();
- for (size_t i = 0; i < scene->objects.size(); i++) {
- if (scene->objects[i]->get_geometry() == mesh) {
- object_index = i;
- break;
- }
- }
-
- /* setup input for device task */
- const size_t num_verts = mesh->verts.size();
+ const int num_verts = mesh_verts.size();
vector<bool> done(num_verts, false);
- device_vector<uint4> d_input(device, "displace_input", MEM_READ_ONLY);
- uint4 *d_input_data = d_input.alloc(num_verts);
- size_t d_input_size = 0;
- size_t num_triangles = mesh->num_triangles();
- for (size_t i = 0; i < num_triangles; i++) {
+ int num_triangles = mesh->num_triangles();
+ for (int i = 0; i < num_triangles; i++) {
Mesh::Triangle t = mesh->get_triangle(i);
- int shader_index = mesh->shader[i];
- Shader *shader = (shader_index < mesh->used_shaders.size()) ?
- static_cast<Shader *>(mesh->used_shaders[shader_index]) :
+ int shader_index = mesh_shaders[i];
+ Shader *shader = (shader_index < mesh_used_shaders.size()) ?
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) :
scene->default_surface;
if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) {
@@ -110,57 +100,41 @@ bool GeometryManager::displace(
}
/* back */
- uint4 in = make_uint4(object, prim, __float_as_int(u), __float_as_int(v));
+ KernelShaderEvalInput in;
+ in.object = object;
+ in.prim = prim;
+ in.u = u;
+ in.v = v;
d_input_data[d_input_size++] = in;
}
}
- if (d_input_size == 0)
- return false;
-
- /* run device task */
- device_vector<float4> d_output(device, "displace_output", MEM_READ_WRITE);
- d_output.alloc(d_input_size);
- d_output.zero_to_device();
- d_input.copy_to_device();
-
- /* needs to be up to data for attribute access */
- device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
-
- DeviceTask task(DeviceTask::SHADER);
- task.shader_input = d_input.device_pointer;
- task.shader_output = d_output.device_pointer;
- task.shader_eval_type = SHADER_EVAL_DISPLACE;
- task.shader_x = 0;
- task.shader_w = d_output.size();
- task.num_samples = 1;
- task.get_cancel = function_bind(&Progress::get_cancel, &progress);
-
- device->task_add(task);
- device->task_wait();
-
- if (progress.get_cancel()) {
- d_input.free();
- d_output.free();
- return false;
- }
+ return d_input_size;
+}
- d_output.copy_from_device(0, 1, d_output.size());
- d_input.free();
+/* Read back mesh displacement shader output. */
+static void read_shader_output(const Scene *scene,
+ Mesh *mesh,
+ const device_vector<float4> &d_output)
+{
+ const array<int> &mesh_shaders = mesh->get_shader();
+ const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
+ array<float3> &mesh_verts = mesh->get_verts();
- /* read result */
- done.clear();
- done.resize(num_verts, false);
- int k = 0;
+ const int num_verts = mesh_verts.size();
+ const int num_motion_steps = mesh->get_motion_steps();
+ vector<bool> done(num_verts, false);
- float4 *offset = d_output.data();
+ const float4 *d_output_data = d_output.data();
+ int d_output_index = 0;
Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- for (size_t i = 0; i < num_triangles; i++) {
+ int num_triangles = mesh->num_triangles();
+ for (int i = 0; i < num_triangles; i++) {
Mesh::Triangle t = mesh->get_triangle(i);
- int shader_index = mesh->shader[i];
- Shader *shader = (shader_index < mesh->used_shaders.size()) ?
- static_cast<Shader *>(mesh->used_shaders[shader_index]) :
+ int shader_index = mesh_shaders[i];
+ Shader *shader = (shader_index < mesh_used_shaders.size()) ?
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) :
scene->default_surface;
if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) {
@@ -170,12 +144,12 @@ bool GeometryManager::displace(
for (int j = 0; j < 3; j++) {
if (!done[t.v[j]]) {
done[t.v[j]] = true;
- float3 off = float4_to_float3(offset[k++]);
+ float3 off = float4_to_float3(d_output_data[d_output_index++]);
/* Avoid illegal vertex coordinates. */
off = ensure_finite3(off);
- mesh->verts[t.v[j]] += off;
+ mesh_verts[t.v[j]] += off;
if (attr_mP != NULL) {
- for (int step = 0; step < mesh->motion_steps - 1; step++) {
+ for (int step = 0; step < num_motion_steps - 1; step++) {
float3 *mP = attr_mP->data_float3() + step * num_verts;
mP[t.v[j]] += off;
}
@@ -183,8 +157,47 @@ bool GeometryManager::displace(
}
}
}
+}
- d_output.free();
+bool GeometryManager::displace(
+ Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
+{
+ /* verify if we have a displacement shader */
+ if (!mesh->has_true_displacement()) {
+ return false;
+ }
+
+ const size_t num_verts = mesh->verts.size();
+ const size_t num_triangles = mesh->num_triangles();
+
+ if (num_triangles == 0) {
+ return false;
+ }
+
+ string msg = string_printf("Computing Displacement %s", mesh->name.c_str());
+ progress.set_status("Updating Mesh", msg);
+
+ /* find object index. todo: is arbitrary */
+ size_t object_index = OBJECT_NONE;
+
+ for (size_t i = 0; i < scene->objects.size(); i++) {
+ if (scene->objects[i]->get_geometry() == mesh) {
+ object_index = i;
+ break;
+ }
+ }
+
+ /* Needs to be up to date for attribute access. */
+ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
+
+ /* Evaluate shader on device. */
+ ShaderEval shader_eval(device, progress);
+ if (!shader_eval.eval(SHADER_EVAL_DISPLACE,
+ num_verts,
+ function_bind(&fill_shader_input, scene, mesh, object_index, _1),
+ function_bind(&read_shader_output, scene, mesh, _1))) {
+ return false;
+ }
/* stitch */
unordered_set<int> stitch_keys;
@@ -297,8 +310,7 @@ bool GeometryManager::displace(
}
/* normalize vertex normals */
- done.clear();
- done.resize(num_verts, false);
+ vector<bool> done(num_verts, false);
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
@@ -368,8 +380,7 @@ bool GeometryManager::displace(
}
/* normalize vertex normals */
- done.clear();
- done.resize(num_verts, false);
+ vector<bool> done(num_verts, false);
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 795166bcf4c..5303d55242e 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -2736,18 +2736,21 @@ NODE_DEFINE(PrincipledBsdfNode)
distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID);
static NodeEnum subsurface_method_enum;
- subsurface_method_enum.insert("burley", CLOSURE_BSSRDF_PRINCIPLED_ID);
- subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+ subsurface_method_enum.insert("random_walk_fixed_radius",
+ CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
+ subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
SOCKET_ENUM(subsurface_method,
"Subsurface Method",
subsurface_method_enum,
- CLOSURE_BSSRDF_PRINCIPLED_ID);
+ CLOSURE_BSSRDF_RANDOM_WALK_ID);
SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f));
SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f));
SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f);
SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f);
SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f));
+ SOCKET_IN_FLOAT(subsurface_ior, "Subsurface IOR", 1.4f);
+ SOCKET_IN_FLOAT(subsurface_anisotropy, "Subsurface Anisotropy", 0.0f);
SOCKET_IN_FLOAT(specular, "Specular", 0.0f);
SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f);
@@ -2857,6 +2860,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
ShaderInput *p_metallic,
ShaderInput *p_subsurface,
ShaderInput *p_subsurface_radius,
+ ShaderInput *p_subsurface_ior,
+ ShaderInput *p_subsurface_anisotropy,
ShaderInput *p_specular,
ShaderInput *p_roughness,
ShaderInput *p_specular_tint,
@@ -2896,6 +2901,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness);
int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation);
int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius);
+ int subsurface_ior_offset = compiler.stack_assign(p_subsurface_ior);
+ int subsurface_anisotropy_offset = compiler.stack_assign(p_subsurface_anisotropy);
compiler.add_node(NODE_CLOSURE_BSDF,
compiler.encode_uchar4(closure,
@@ -2929,8 +2936,10 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
__float_as_int(bc_default.y),
__float_as_int(bc_default.z));
- compiler.add_node(
- clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID);
+ compiler.add_node(clearcoat_normal_offset,
+ subsurface_radius_offset,
+ subsurface_ior_offset,
+ subsurface_anisotropy_offset);
float3 ss_default = get_float3(subsurface_color_in->socket_type);
@@ -2953,6 +2962,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler)
input("Metallic"),
input("Subsurface"),
input("Subsurface Radius"),
+ input("Subsurface IOR"),
+ input("Subsurface Anisotropy"),
input("Specular"),
input("Roughness"),
input("Specular Tint"),
@@ -3048,16 +3059,16 @@ NODE_DEFINE(SubsurfaceScatteringNode)
SOCKET_IN_NORMAL(normal, "Normal", zero_float3(), SocketType::LINK_NORMAL);
SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
- static NodeEnum falloff_enum;
- falloff_enum.insert("cubic", CLOSURE_BSSRDF_CUBIC_ID);
- falloff_enum.insert("gaussian", CLOSURE_BSSRDF_GAUSSIAN_ID);
- falloff_enum.insert("burley", CLOSURE_BSSRDF_BURLEY_ID);
- falloff_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
- SOCKET_ENUM(falloff, "Falloff", falloff_enum, CLOSURE_BSSRDF_BURLEY_ID);
+ static NodeEnum method_enum;
+ method_enum.insert("random_walk_fixed_radius", CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
+ method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
+ SOCKET_ENUM(method, "Method", method_enum, CLOSURE_BSSRDF_RANDOM_WALK_ID);
+
SOCKET_IN_FLOAT(scale, "Scale", 0.01f);
SOCKET_IN_VECTOR(radius, "Radius", make_float3(0.1f, 0.1f, 0.1f));
- SOCKET_IN_FLOAT(sharpness, "Sharpness", 0.0f);
- SOCKET_IN_FLOAT(texture_blur, "Texture Blur", 1.0f);
+
+ SOCKET_IN_FLOAT(subsurface_ior, "IOR", 1.4f);
+ SOCKET_IN_FLOAT(subsurface_anisotropy, "Anisotropy", 0.0f);
SOCKET_OUT_CLOSURE(BSSRDF, "BSSRDF");
@@ -3066,20 +3077,19 @@ NODE_DEFINE(SubsurfaceScatteringNode)
SubsurfaceScatteringNode::SubsurfaceScatteringNode() : BsdfNode(get_node_type())
{
- closure = falloff;
+ closure = method;
}
void SubsurfaceScatteringNode::compile(SVMCompiler &compiler)
{
- closure = falloff;
- BsdfNode::compile(
- compiler, input("Scale"), input("Texture Blur"), input("Radius"), input("Sharpness"));
+ closure = method;
+ BsdfNode::compile(compiler, input("Scale"), input("IOR"), input("Radius"), input("Anisotropy"));
}
void SubsurfaceScatteringNode::compile(OSLCompiler &compiler)
{
- closure = falloff;
- compiler.parameter(this, "falloff");
+ closure = method;
+ compiler.parameter(this, "method");
compiler.add(this, "node_subsurface_scattering");
}
@@ -3786,20 +3796,6 @@ void GeometryNode::compile(OSLCompiler &compiler)
compiler.add(this, "node_geometry");
}
-int GeometryNode::get_group()
-{
- ShaderOutput *out;
- int result = ShaderNode::get_group();
-
- /* Backfacing uses NODE_LIGHT_PATH */
- out = output("Backfacing");
- if (!out->links.empty()) {
- result = max(result, NODE_GROUP_LEVEL_1);
- }
-
- return result;
-}
-
/* TextureCoordinate */
NODE_DEFINE(TextureCoordinateNode)
@@ -5926,33 +5922,33 @@ NODE_DEFINE(OutputAOVNode)
OutputAOVNode::OutputAOVNode() : ShaderNode(get_node_type())
{
special_type = SHADER_SPECIAL_TYPE_OUTPUT_AOV;
- slot = -1;
+ offset = -1;
}
void OutputAOVNode::simplify_settings(Scene *scene)
{
- slot = scene->film->get_aov_offset(scene, name.string(), is_color);
- if (slot == -1) {
- slot = scene->film->get_aov_offset(scene, name.string(), is_color);
+ offset = scene->film->get_aov_offset(scene, name.string(), is_color);
+ if (offset == -1) {
+ offset = scene->film->get_aov_offset(scene, name.string(), is_color);
}
- if (slot == -1 || is_color) {
+ if (offset == -1 || is_color) {
input("Value")->disconnect();
}
- if (slot == -1 || !is_color) {
+ if (offset == -1 || !is_color) {
input("Color")->disconnect();
}
}
void OutputAOVNode::compile(SVMCompiler &compiler)
{
- assert(slot >= 0);
+ assert(offset >= 0);
if (is_color) {
- compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), slot);
+ compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), offset);
}
else {
- compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), slot);
+ compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), offset);
}
}
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 3013e9b1866..22bdb06b059 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -143,10 +143,6 @@ class EnvironmentTextureNode : public ImageSlotTextureNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
virtual bool equals(const ShaderNode &other)
{
@@ -170,11 +166,6 @@ class SkyTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(SkyTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeSkyType, sky_type)
NODE_SOCKET_API(float3, sun_direction)
NODE_SOCKET_API(float, turbidity)
@@ -224,18 +215,13 @@ class OutputAOVNode : public ShaderNode {
NODE_SOCKET_API(ustring, name)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_4;
- }
-
/* Don't allow output node de-duplication. */
virtual bool equals(const ShaderNode & /*other*/)
{
return false;
}
- int slot;
+ int offset;
bool is_color;
};
@@ -243,11 +229,6 @@ class GradientTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(GradientTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeGradientType, gradient_type)
NODE_SOCKET_API(float3, vector)
};
@@ -269,19 +250,14 @@ class VoronoiTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(VoronoiTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
virtual int get_feature()
{
int result = ShaderNode::get_feature();
if (dimensions == 4) {
- result |= NODE_FEATURE_VORONOI_EXTRA;
+ result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA;
}
else if (dimensions >= 2 && feature == NODE_VORONOI_SMOOTH_F1) {
- result |= NODE_FEATURE_VORONOI_EXTRA;
+ result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA;
}
return result;
}
@@ -301,11 +277,6 @@ class MusgraveTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(MusgraveTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(int, dimensions)
NODE_SOCKET_API(NodeMusgraveType, musgrave_type)
NODE_SOCKET_API(float, w)
@@ -322,11 +293,6 @@ class WaveTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(WaveTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeWaveType, wave_type)
NODE_SOCKET_API(NodeWaveBandsDirection, bands_direction)
NODE_SOCKET_API(NodeWaveRingsDirection, rings_direction)
@@ -345,11 +311,6 @@ class MagicTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(MagicTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(int, depth)
NODE_SOCKET_API(float3, vector)
NODE_SOCKET_API(float, scale)
@@ -364,11 +325,6 @@ class CheckerTextureNode : public TextureNode {
NODE_SOCKET_API(float3, color1)
NODE_SOCKET_API(float3, color2)
NODE_SOCKET_API(float, scale)
-
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class BrickTextureNode : public TextureNode {
@@ -390,20 +346,11 @@ class BrickTextureNode : public TextureNode {
NODE_SOCKET_API(float, brick_width)
NODE_SOCKET_API(float, row_height)
NODE_SOCKET_API(float3, vector)
-
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class PointDensityTextureNode : public ShaderNode {
public:
SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_4;
- }
~PointDensityTextureNode();
ShaderNode *clone(ShaderGraph *graph) const;
@@ -443,10 +390,6 @@ class IESLightNode : public TextureNode {
~IESLightNode();
ShaderNode *clone(ShaderGraph *graph) const;
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(ustring, filename)
NODE_SOCKET_API(ustring, ies)
@@ -464,10 +407,6 @@ class IESLightNode : public TextureNode {
class WhiteNoiseTextureNode : public ShaderNode {
public:
SHADER_NODE_CLASS(WhiteNoiseTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(int, dimensions)
NODE_SOCKET_API(float3, vector)
@@ -477,10 +416,6 @@ class WhiteNoiseTextureNode : public ShaderNode {
class MappingNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MappingNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
void constant_fold(const ConstantFolder &folder);
NODE_SOCKET_API(float3, vector)
@@ -546,6 +481,11 @@ class BsdfBaseNode : public ShaderNode {
return false;
}
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_BSDF;
+ }
+
protected:
ClosureType closure;
};
@@ -606,6 +546,8 @@ class PrincipledBsdfNode : public BsdfBaseNode {
ShaderInput *metallic,
ShaderInput *subsurface,
ShaderInput *subsurface_radius,
+ ShaderInput *subsurface_ior,
+ ShaderInput *subsurface_anisotropy,
ShaderInput *specular,
ShaderInput *roughness,
ShaderInput *specular_tint,
@@ -622,6 +564,8 @@ class PrincipledBsdfNode : public BsdfBaseNode {
NODE_SOCKET_API(float3, base_color)
NODE_SOCKET_API(float3, subsurface_color)
NODE_SOCKET_API(float3, subsurface_radius)
+ NODE_SOCKET_API(float, subsurface_ior)
+ NODE_SOCKET_API(float, subsurface_anisotropy)
NODE_SOCKET_API(float, metallic)
NODE_SOCKET_API(float, subsurface)
NODE_SOCKET_API(float, specular)
@@ -758,14 +702,14 @@ class SubsurfaceScatteringNode : public BsdfNode {
bool has_bssrdf_bump();
ClosureType get_closure_type()
{
- return falloff;
+ return method;
}
NODE_SOCKET_API(float, scale)
NODE_SOCKET_API(float3, radius)
- NODE_SOCKET_API(float, sharpness)
- NODE_SOCKET_API(float, texture_blur)
- NODE_SOCKET_API(ClosureType, falloff)
+ NODE_SOCKET_API(float, subsurface_ior)
+ NODE_SOCKET_API(float, subsurface_anisotropy)
+ NODE_SOCKET_API(ClosureType, method)
};
class EmissionNode : public ShaderNode {
@@ -782,6 +726,11 @@ class EmissionNode : public ShaderNode {
return true;
}
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION;
+ }
+
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, surface_mix_weight)
@@ -792,6 +741,11 @@ class BackgroundNode : public ShaderNode {
SHADER_NODE_CLASS(BackgroundNode)
void constant_fold(const ConstantFolder &folder);
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION;
+ }
+
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, surface_mix_weight)
@@ -800,10 +754,6 @@ class BackgroundNode : public ShaderNode {
class HoldoutNode : public ShaderNode {
public:
SHADER_NODE_CLASS(HoldoutNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual ClosureType get_closure_type()
{
return CLOSURE_HOLDOUT_ID;
@@ -821,13 +771,9 @@ class AmbientOcclusionNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
- virtual bool has_raytrace()
+ virtual int get_feature()
{
- return true;
+ return KERNEL_FEATURE_NODE_RAYTRACE;
}
NODE_SOCKET_API(float3, color)
@@ -845,13 +791,9 @@ class VolumeNode : public ShaderNode {
SHADER_NODE_BASE_CLASS(VolumeNode)
void compile(SVMCompiler &compiler, ShaderInput *param1, ShaderInput *param2);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual int get_feature()
{
- return ShaderNode::get_feature() | NODE_FEATURE_VOLUME;
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_VOLUME;
}
virtual ClosureType get_closure_type()
{
@@ -1013,10 +955,6 @@ class UVMapNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(ustring, attribute)
NODE_SOCKET_API(bool, from_dupli)
@@ -1025,10 +963,6 @@ class UVMapNode : public ShaderNode {
class LightPathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(LightPathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class LightFalloffNode : public ShaderNode {
@@ -1038,10 +972,6 @@ class LightFalloffNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, smooth)
@@ -1050,10 +980,6 @@ class LightFalloffNode : public ShaderNode {
class ObjectInfoNode : public ShaderNode {
public:
SHADER_NODE_CLASS(ObjectInfoNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class ParticleInfoNode : public ShaderNode {
@@ -1064,10 +990,6 @@ class ParticleInfoNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class HairInfoNode : public ShaderNode {
@@ -1083,13 +1005,9 @@ class HairInfoNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual int get_feature()
{
- return ShaderNode::get_feature() | NODE_FEATURE_HAIR;
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_HAIR;
}
};
@@ -1168,10 +1086,6 @@ class InvertNode : public ShaderNode {
public:
SHADER_NODE_CLASS(InvertNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, fac)
NODE_SOCKET_API(float3, color)
@@ -1182,11 +1096,6 @@ class MixNode : public ShaderNode {
SHADER_NODE_CLASS(MixNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API(NodeMix, mix_type)
NODE_SOCKET_API(bool, use_clamp)
NODE_SOCKET_API(float3, color1)
@@ -1198,10 +1107,6 @@ class CombineRGBNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineRGBNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, r)
NODE_SOCKET_API(float, g)
@@ -1212,10 +1117,6 @@ class CombineHSVNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineHSVNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, h)
NODE_SOCKET_API(float, s)
@@ -1226,10 +1127,6 @@ class CombineXYZNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineXYZNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, x)
NODE_SOCKET_API(float, y)
@@ -1240,10 +1137,6 @@ class GammaNode : public ShaderNode {
public:
SHADER_NODE_CLASS(GammaNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, gamma)
@@ -1253,10 +1146,6 @@ class BrightContrastNode : public ShaderNode {
public:
SHADER_NODE_CLASS(BrightContrastNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, bright)
@@ -1267,10 +1156,6 @@ class SeparateRGBNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateRGBNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, color)
};
@@ -1279,10 +1164,6 @@ class SeparateHSVNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateHSVNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, color)
};
@@ -1291,10 +1172,6 @@ class SeparateXYZNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateXYZNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, vector)
};
@@ -1333,10 +1210,6 @@ class CameraNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class FresnelNode : public ShaderNode {
@@ -1346,10 +1219,6 @@ class FresnelNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, normal)
NODE_SOCKET_API(float, IOR)
@@ -1362,10 +1231,6 @@ class LayerWeightNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, normal)
NODE_SOCKET_API(float, blend)
@@ -1378,10 +1243,6 @@ class WireframeNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, size)
NODE_SOCKET_API(bool, use_pixel_size)
@@ -1390,10 +1251,6 @@ class WireframeNode : public ShaderNode {
class WavelengthNode : public ShaderNode {
public:
SHADER_NODE_CLASS(WavelengthNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, wavelength)
};
@@ -1402,10 +1259,6 @@ class BlackbodyNode : public ShaderNode {
public:
SHADER_NODE_CLASS(BlackbodyNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, temperature)
};
@@ -1413,10 +1266,6 @@ class BlackbodyNode : public ShaderNode {
class MapRangeNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MapRangeNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
void expand(ShaderGraph *graph);
NODE_SOCKET_API(float, value)
@@ -1433,10 +1282,6 @@ class ClampNode : public ShaderNode {
public:
SHADER_NODE_CLASS(ClampNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, value)
NODE_SOCKET_API(float, min)
NODE_SOCKET_API(float, max)
@@ -1446,10 +1291,6 @@ class ClampNode : public ShaderNode {
class MathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
void expand(ShaderGraph *graph);
void constant_fold(const ConstantFolder &folder);
@@ -1463,10 +1304,6 @@ class MathNode : public ShaderNode {
class NormalNode : public ShaderNode {
public:
SHADER_NODE_CLASS(NormalNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(float3, direction)
NODE_SOCKET_API(float3, normal)
@@ -1475,10 +1312,6 @@ class NormalNode : public ShaderNode {
class VectorMathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorMathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
void constant_fold(const ConstantFolder &folder);
NODE_SOCKET_API(float3, vector1)
@@ -1492,10 +1325,6 @@ class VectorRotateNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorRotateNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeVectorRotateType, rotate_type)
NODE_SOCKET_API(bool, invert)
NODE_SOCKET_API(float3, vector)
@@ -1509,11 +1338,6 @@ class VectorTransformNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorTransformNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API(NodeVectorTransformType, transform_type)
NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_from)
NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_to)
@@ -1530,7 +1354,7 @@ class BumpNode : public ShaderNode {
}
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(bool, invert)
@@ -1549,11 +1373,6 @@ class CurvesNode : public ShaderNode {
explicit CurvesNode(const NodeType *node_type);
SHADER_NODE_BASE_CLASS(CurvesNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API_ARRAY(array<float3>, curves)
NODE_SOCKET_API(float, min_x)
NODE_SOCKET_API(float, max_x)
@@ -1583,10 +1402,6 @@ class RGBRampNode : public ShaderNode {
public:
SHADER_NODE_CLASS(RGBRampNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API_ARRAY(array<float3>, ramp)
NODE_SOCKET_API_ARRAY(array<float>, ramp_alpha)
@@ -1656,10 +1471,6 @@ class NormalMapNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeNormalMapSpace, space)
NODE_SOCKET_API(ustring, attribute)
@@ -1680,10 +1491,6 @@ class TangentNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeTangentDirectionType, direction_type)
NODE_SOCKET_API(NodeTangentAxis, axis)
@@ -1698,13 +1505,9 @@ class BevelNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
- virtual bool has_raytrace()
+ virtual int get_feature()
{
- return true;
+ return KERNEL_FEATURE_NODE_RAYTRACE;
}
NODE_SOCKET_API(float, radius)
@@ -1718,7 +1521,7 @@ class DisplacementNode : public ShaderNode {
void constant_fold(const ConstantFolder &folder);
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(NodeNormalMapSpace, space)
@@ -1739,7 +1542,7 @@ class VectorDisplacementNode : public ShaderNode {
void constant_fold(const ConstantFolder &folder);
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(NodeNormalMapSpace, space)
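The node header changes above replace the per-node get_group()/has_raytrace() overrides with a single get_feature() bitmask of KERNEL_FEATURE_NODE_* flags. A minimal, self-contained sketch of that pattern (not Cycles code; the class names and flag values below are illustrative):

#include <cstdint>
#include <vector>

enum : std::uint32_t {
  KERNEL_FEATURE_NODE_BSDF_SK = 1u << 0,
  KERNEL_FEATURE_NODE_EMISSION_SK = 1u << 1,
  KERNEL_FEATURE_NODE_RAYTRACE_SK = 1u << 2,
};

struct ShaderNodeSketch {
  virtual ~ShaderNodeSketch() = default;
  virtual std::uint32_t get_feature() const { return 0; }
};

struct RaytraceNodeSketch : ShaderNodeSketch {
  // Ray-tracing nodes advertise the raytrace feature bit instead of overriding has_raytrace().
  std::uint32_t get_feature() const override { return KERNEL_FEATURE_NODE_RAYTRACE_SK; }
};

std::uint32_t gather_features(const std::vector<ShaderNodeSketch *> &nodes) {
  std::uint32_t features = 0;
  for (const ShaderNodeSketch *node : nodes)
    features |= node->get_feature();  // the kernel is specialized for the union of all node features
  return features;
}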
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index c88d94fe4c2..4637f8fe989 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -216,6 +216,10 @@ void Object::tag_update(Scene *scene)
if (use_holdout_is_modified()) {
flag |= ObjectManager::HOLDOUT_MODIFIED;
}
+
+ if (is_shadow_catcher_is_modified()) {
+ scene->tag_shadow_catcher_modified();
+ }
}
if (geometry) {
@@ -273,14 +277,7 @@ bool Object::is_traceable() const
uint Object::visibility_for_tracing() const
{
- uint trace_visibility = visibility;
- if (is_shadow_catcher) {
- trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER;
- }
- else {
- trace_visibility &= ~PATH_RAY_SHADOW_CATCHER;
- }
- return trace_visibility;
+ return SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility & PATH_RAY_ALL_VISIBILITY);
}
float Object::compute_volume_step_size() const
@@ -680,7 +677,7 @@ void ObjectManager::device_update(Device *device,
/* prepare for static BVH building */
/* todo: do before to support getting object level coords? */
- if (scene->params.bvh_type == SceneParams::BVH_STATIC) {
+ if (scene->params.bvh_type == BVH_TYPE_STATIC) {
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->object.times.add_entry(
@@ -932,6 +929,11 @@ void ObjectManager::tag_update(Scene *scene, uint32_t flag)
}
scene->light_manager->tag_update(scene, LightManager::OBJECT_MANAGER);
+
+ /* Integrator's shadow catcher settings depend on object visibility settings. */

+ if (flag & (OBJECT_ADDED | OBJECT_REMOVED | OBJECT_MODIFIED)) {
+ scene->integrator->tag_update(scene, Integrator::OBJECT_MANAGER);
+ }
}
bool ObjectManager::need_update() const
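For reference, a standalone sketch of the branch removed above, whose behaviour the new SHADOW_CATCHER_OBJECT_VISIBILITY macro now expresses in a single expression (the macro's definition is not shown in this diff; the bit values below are hypothetical):

#include <cstdint>

enum : std::uint32_t {
  PATH_RAY_SHADOW_CATCHER_SK = 1u << 10,
  PATH_RAY_SHADOW_NON_CATCHER_SK = 1u << 11,
};

std::uint32_t visibility_for_tracing_sketch(std::uint32_t visibility, bool is_shadow_catcher) {
  if (is_shadow_catcher)
    return visibility & ~PATH_RAY_SHADOW_NON_CATCHER_SK;  // catchers ignore non-catcher shadow rays
  return visibility & ~PATH_RAY_SHADOW_CATCHER_SK;        // regular objects ignore catcher shadow rays
}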
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 7dc79f48145..d28b222c10e 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -113,7 +113,7 @@ void OSLShaderManager::device_update_specific(Device *device,
scene->image_manager->set_osl_texture_system((void *)ts);
/* create shaders */
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
Shader *background_shader = scene->background->get_shader(scene);
foreach (Shader *shader, scene->shaders) {
@@ -174,7 +174,7 @@ void OSLShaderManager::device_update_specific(Device *device,
void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
{
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
device_free_common(device, dscene, scene);
@@ -257,25 +257,36 @@ void OSLShaderManager::shading_system_init()
/* our own ray types */
static const char *raytypes[] = {
- "camera", /* PATH_RAY_CAMERA */
- "reflection", /* PATH_RAY_REFLECT */
- "refraction", /* PATH_RAY_TRANSMIT */
- "diffuse", /* PATH_RAY_DIFFUSE */
- "glossy", /* PATH_RAY_GLOSSY */
- "singular", /* PATH_RAY_SINGULAR */
- "transparent", /* PATH_RAY_TRANSPARENT */
-
- "shadow", /* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_OPAQUE_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */
-
- "__unused__", "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
- "__unused__",
-
- "__unused__", "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
- "__unused__", "__unused__", "__unused__", "__unused__",
- "__unused__", "__unused__", "__unused__",
+ "camera", /* PATH_RAY_CAMERA */
+ "reflection", /* PATH_RAY_REFLECT */
+ "refraction", /* PATH_RAY_TRANSMIT */
+ "diffuse", /* PATH_RAY_DIFFUSE */
+ "glossy", /* PATH_RAY_GLOSSY */
+ "singular", /* PATH_RAY_SINGULAR */
+ "transparent", /* PATH_RAY_TRANSPARENT */
+ "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
+
+ "shadow", /* PATH_RAY_SHADOW_OPAQUE */
+ "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
+
+ "__unused__", /* PATH_RAY_NODE_UNALIGNED */
+ "__unused__", /* PATH_RAY_MIS_SKIP */
+
+ "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
+
+ "__unused__", /* PATH_RAY_SINGLE_PASS_DONE */
+ "__unused__", /* PATH_RAY_TRANSPARENT_BACKGROUND */
+ "__unused__", /* PATH_RAY_TERMINATE_IMMEDIATE */
+ "__unused__", /* PATH_RAY_TERMINATE_AFTER_TRANSPARENT */
+ "__unused__", /* PATH_RAY_EMISSION */
+ "__unused__", /* PATH_RAY_SUBSURFACE */
+ "__unused__", /* PATH_RAY_DENOISING_FEATURES */
+ "__unused__", /* PATH_RAY_REFLECT_PASS */
+ "__unused__", /* PATH_RAY_TRANSMISSION_PASS */
+ "__unused__", /* PATH_RAY_VOLUME_PASS */
+ "__unused__", /* PATH_RAY_SHADOW_FOR_LIGHT */
+ "__unused__", /* PATH_RAY_SHADOW_CATCHER_HIT */
+ "__unused__", /* PATH_RAY_SHADOW_CATCHER_PASS */
};
const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
@@ -758,7 +769,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
current_shader->has_surface_bssrdf = true;
current_shader->has_bssrdf_bump = true; /* can't detect yet */
}
- current_shader->has_bump = true; /* can't detect yet */
+ current_shader->has_bump = true; /* can't detect yet */
+ current_shader->has_surface_raytrace = true; /* can't detect yet */
}
if (node->has_spatial_varying()) {
@@ -1054,6 +1066,8 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet &nodes)
current_shader->has_surface_emission = true;
if (node->has_surface_transparent())
current_shader->has_surface_transparent = true;
+ if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE)
+ current_shader->has_surface_raytrace = true;
if (node->has_spatial_varying())
current_shader->has_surface_spatial_varying = true;
if (node->has_surface_bssrdf()) {
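The reordered raytypes[] table above pairs each entry with a PATH_RAY_* flag, so entry i names the ray type for flag bit (1 << i). An illustrative lookup for a single-bit flag (not Cycles code; the flag value is hypothetical and bounds handling is simplified):

#include <cstdint>
#include <cstdio>

static const char *raytypes_sketch[] = {
    "camera", "reflection", "refraction", "diffuse",
    "glossy", "singular", "transparent", "volume_scatter",
};

const char *raytype_name(std::uint32_t single_bit_flag) {
  int index = 0;
  while (!(single_bit_flag & 1u) && index < 7) {  // find the position of the set bit
    single_bit_flag >>= 1u;
    index++;
  }
  return raytypes_sketch[index];
}

int main() {
  const std::uint32_t PATH_RAY_GLOSSY_SKETCH = 1u << 4;  // hypothetical value
  std::printf("%s\n", raytype_name(PATH_RAY_GLOSSY_SKETCH));  // prints "glossy"
  return 0;
}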
diff --git a/intern/cycles/render/pass.cpp b/intern/cycles/render/pass.cpp
new file mode 100644
index 00000000000..27ad7c0db97
--- /dev/null
+++ b/intern/cycles/render/pass.cpp
@@ -0,0 +1,427 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/pass.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *pass_type_as_string(const PassType type)
+{
+ const int type_int = static_cast<int>(type);
+
+ const NodeEnum *type_enum = Pass::get_type_enum();
+
+ if (!type_enum->exists(type_int)) {
+ LOG(DFATAL) << "Unhandled pass type " << static_cast<int>(type) << ", not supposed to happen.";
+ return "UNKNOWN";
+ }
+
+ return (*type_enum)[type_int].c_str();
+}
+
+const char *pass_mode_as_string(PassMode mode)
+{
+ switch (mode) {
+ case PassMode::NOISY:
+ return "NOISY";
+ case PassMode::DENOISED:
+ return "DENOISED";
+ }
+
+ LOG(DFATAL) << "Unhandled pass mode " << static_cast<int>(mode) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, PassMode mode)
+{
+ os << pass_mode_as_string(mode);
+ return os;
+}
+
+const NodeEnum *Pass::get_type_enum()
+{
+ static NodeEnum pass_type_enum;
+
+ if (pass_type_enum.empty()) {
+
+ /* Light Passes. */
+ pass_type_enum.insert("combined", PASS_COMBINED);
+ pass_type_enum.insert("emission", PASS_EMISSION);
+ pass_type_enum.insert("background", PASS_BACKGROUND);
+ pass_type_enum.insert("ao", PASS_AO);
+ pass_type_enum.insert("shadow", PASS_SHADOW);
+ pass_type_enum.insert("diffuse", PASS_DIFFUSE);
+ pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT);
+ pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT);
+ pass_type_enum.insert("glossy", PASS_GLOSSY);
+ pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT);
+ pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT);
+ pass_type_enum.insert("transmission", PASS_TRANSMISSION);
+ pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT);
+ pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT);
+ pass_type_enum.insert("volume", PASS_VOLUME);
+ pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT);
+ pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT);
+
+ /* Data passes. */
+ pass_type_enum.insert("depth", PASS_DEPTH);
+ pass_type_enum.insert("position", PASS_POSITION);
+ pass_type_enum.insert("normal", PASS_NORMAL);
+ pass_type_enum.insert("roughness", PASS_ROUGHNESS);
+ pass_type_enum.insert("uv", PASS_UV);
+ pass_type_enum.insert("object_id", PASS_OBJECT_ID);
+ pass_type_enum.insert("material_id", PASS_MATERIAL_ID);
+ pass_type_enum.insert("motion", PASS_MOTION);
+ pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT);
+ pass_type_enum.insert("render_time", PASS_RENDER_TIME);
+ pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE);
+ pass_type_enum.insert("aov_color", PASS_AOV_COLOR);
+ pass_type_enum.insert("aov_value", PASS_AOV_VALUE);
+ pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER);
+ pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT);
+ pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR);
+ pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR);
+ pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR);
+ pass_type_enum.insert("mist", PASS_MIST);
+ pass_type_enum.insert("denoising_normal", PASS_DENOISING_NORMAL);
+ pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO);
+
+ pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
+ pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ pass_type_enum.insert("shadow_catcher_matte", PASS_SHADOW_CATCHER_MATTE);
+
+ pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE);
+ pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL);
+ }
+
+ return &pass_type_enum;
+}
+
+const NodeEnum *Pass::get_mode_enum()
+{
+ static NodeEnum pass_mode_enum;
+
+ if (pass_mode_enum.empty()) {
+ pass_mode_enum.insert("noisy", static_cast<int>(PassMode::NOISY));
+ pass_mode_enum.insert("denoised", static_cast<int>(PassMode::DENOISED));
+ }
+
+ return &pass_mode_enum;
+}
+
+NODE_DEFINE(Pass)
+{
+ NodeType *type = NodeType::add("pass", create);
+
+ const NodeEnum *pass_type_enum = get_type_enum();
+ const NodeEnum *pass_mode_enum = get_mode_enum();
+
+ SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
+ SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED));
+ SOCKET_STRING(name, "Name", ustring());
+ SOCKET_BOOLEAN(include_albedo, "Include Albedo", false);
+
+ return type;
+}
+
+Pass::Pass() : Node(get_node_type()), is_auto_(false)
+{
+}
+
+PassInfo Pass::get_info() const
+{
+ return get_info(type, include_albedo);
+}
+
+bool Pass::is_written() const
+{
+ return get_info().is_written;
+}
+
+PassInfo Pass::get_info(const PassType type, const bool include_albedo)
+{
+ PassInfo pass_info;
+
+ pass_info.use_filter = true;
+ pass_info.use_exposure = false;
+ pass_info.divide_type = PASS_NONE;
+ pass_info.use_compositing = false;
+ pass_info.use_denoising_albedo = true;
+
+ switch (type) {
+ case PASS_NONE:
+ pass_info.num_components = 0;
+ break;
+ case PASS_COMBINED:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = true;
+ pass_info.support_denoise = true;
+ break;
+ case PASS_DEPTH:
+ pass_info.num_components = 1;
+ pass_info.use_filter = false;
+ break;
+ case PASS_MIST:
+ pass_info.num_components = 1;
+ break;
+ case PASS_POSITION:
+ pass_info.num_components = 3;
+ break;
+ case PASS_NORMAL:
+ pass_info.num_components = 3;
+ break;
+ case PASS_ROUGHNESS:
+ pass_info.num_components = 1;
+ break;
+ case PASS_UV:
+ pass_info.num_components = 3;
+ break;
+ case PASS_MOTION:
+ pass_info.num_components = 4;
+ pass_info.divide_type = PASS_MOTION_WEIGHT;
+ break;
+ case PASS_MOTION_WEIGHT:
+ pass_info.num_components = 1;
+ break;
+ case PASS_OBJECT_ID:
+ case PASS_MATERIAL_ID:
+ pass_info.num_components = 1;
+ pass_info.use_filter = false;
+ break;
+
+ case PASS_EMISSION:
+ case PASS_BACKGROUND:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ break;
+ case PASS_AO:
+ pass_info.num_components = 3;
+ break;
+ case PASS_SHADOW:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = false;
+ break;
+ case PASS_RENDER_TIME:
+ /* This pass is handled entirely on the host side. */
+ pass_info.num_components = 0;
+ break;
+
+ case PASS_DIFFUSE_COLOR:
+ case PASS_GLOSSY_COLOR:
+ case PASS_TRANSMISSION_COLOR:
+ pass_info.num_components = 3;
+ break;
+ case PASS_DIFFUSE:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_DIFFUSE_DIRECT;
+ pass_info.indirect_type = PASS_DIFFUSE_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_DIFFUSE_DIRECT:
+ case PASS_DIFFUSE_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_GLOSSY:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_GLOSSY_DIRECT;
+ pass_info.indirect_type = PASS_GLOSSY_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_GLOSSY_DIRECT:
+ case PASS_GLOSSY_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_TRANSMISSION:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_TRANSMISSION_DIRECT;
+ pass_info.indirect_type = PASS_TRANSMISSION_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_TRANSMISSION_DIRECT:
+ case PASS_TRANSMISSION_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_VOLUME:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_VOLUME_DIRECT;
+ pass_info.indirect_type = PASS_VOLUME_INDIRECT;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_VOLUME_DIRECT:
+ case PASS_VOLUME_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ break;
+
+ case PASS_CRYPTOMATTE:
+ pass_info.num_components = 4;
+ break;
+
+ case PASS_DENOISING_NORMAL:
+ pass_info.num_components = 3;
+ break;
+ case PASS_DENOISING_ALBEDO:
+ pass_info.num_components = 3;
+ break;
+
+ case PASS_SHADOW_CATCHER:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.use_compositing = true;
+ pass_info.use_denoising_albedo = false;
+ pass_info.support_denoise = true;
+ break;
+ case PASS_SHADOW_CATCHER_SAMPLE_COUNT:
+ pass_info.num_components = 1;
+ break;
+ case PASS_SHADOW_CATCHER_MATTE:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = true;
+ pass_info.support_denoise = true;
+ /* Without the shadow catcher approximation, compositing is not needed.
+ * Since we don't know here whether the approximation is used or not, leave the decision
+ * up to the caller, which will know. */
+ break;
+
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ pass_info.num_components = 4;
+ break;
+ case PASS_SAMPLE_COUNT:
+ pass_info.num_components = 1;
+ pass_info.use_exposure = false;
+ break;
+
+ case PASS_AOV_COLOR:
+ pass_info.num_components = 3;
+ break;
+ case PASS_AOV_VALUE:
+ pass_info.num_components = 1;
+ break;
+
+ case PASS_BAKE_PRIMITIVE:
+ case PASS_BAKE_DIFFERENTIAL:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = false;
+ pass_info.use_filter = false;
+ break;
+
+ case PASS_CATEGORY_LIGHT_END:
+ case PASS_CATEGORY_DATA_END:
+ case PASS_CATEGORY_BAKE_END:
+ case PASS_NUM:
+ LOG(DFATAL) << "Unexpected pass type is used " << type;
+ pass_info.num_components = 0;
+ break;
+ }
+
+ return pass_info;
+}
+
+bool Pass::contains(const vector<Pass *> &passes, PassType type)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_type() != type) {
+ continue;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+const Pass *Pass::find(const vector<Pass *> &passes, const string &name)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_name() == name) {
+ return pass;
+ }
+ }
+
+ return nullptr;
+}
+
+const Pass *Pass::find(const vector<Pass *> &passes, PassType type, PassMode mode)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_type() != type || pass->get_mode() != mode) {
+ continue;
+ }
+
+ return pass;
+ }
+
+ return nullptr;
+}
+
+int Pass::get_offset(const vector<Pass *> &passes, const Pass *pass)
+{
+ int pass_offset = 0;
+
+ for (const Pass *current_pass : passes) {
+ /* Note that pass name is allowed to be empty. This is why we check for type and mode. */
+ if (current_pass->get_type() == pass->get_type() &&
+ current_pass->get_mode() == pass->get_mode() &&
+ current_pass->get_name() == pass->get_name()) {
+ if (current_pass->is_written()) {
+ return pass_offset;
+ }
+ else {
+ return PASS_UNUSED;
+ }
+ }
+ if (current_pass->is_written()) {
+ pass_offset += current_pass->get_info().num_components;
+ }
+ }
+
+ return PASS_UNUSED;
+}
+
+std::ostream &operator<<(std::ostream &os, const Pass &pass)
+{
+ os << "type: " << pass_type_as_string(pass.get_type());
+ os << ", name: \"" << pass.get_name() << "\"";
+ os << ", mode: " << pass.get_mode();
+ os << ", is_written: " << string_from_bool(pass.is_written());
+
+ return os;
+}
+
+CCL_NAMESPACE_END
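The buffer-offset logic implemented by Pass::get_offset() above packs written passes one after another, so a pass's offset is the sum of num_components of every written pass that precedes it. A simplified, self-contained sketch of that layout rule (not Cycles code; types and names are illustrative):

#include <string>
#include <vector>

struct PassSketch {
  std::string name;
  int num_components = 0;
  bool is_written = true;
};

constexpr int PASS_UNUSED_SKETCH = -1;

int pass_offset_sketch(const std::vector<PassSketch> &passes, const std::string &name) {
  int offset = 0;
  for (const PassSketch &pass : passes) {
    if (pass.name == name)
      return pass.is_written ? offset : PASS_UNUSED_SKETCH;
    if (pass.is_written)
      offset += pass.num_components;  // only written passes occupy space in the render buffer
  }
  return PASS_UNUSED_SKETCH;
}

// Example: with {"combined", 4} followed by {"depth", 1}, pass_offset_sketch(passes, "depth") == 4.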
diff --git a/intern/cycles/render/pass.h b/intern/cycles/render/pass.h
new file mode 100644
index 00000000000..82230c62cb0
--- /dev/null
+++ b/intern/cycles/render/pass.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream> // NOLINT
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+#include "kernel/kernel_types.h"
+
+#include "graph/node.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *pass_type_as_string(const PassType type);
+
+enum class PassMode {
+ NOISY,
+ DENOISED,
+};
+const char *pass_mode_as_string(PassMode mode);
+std::ostream &operator<<(std::ostream &os, PassMode mode);
+
+struct PassInfo {
+ int num_components = -1;
+ bool use_filter = false;
+ bool use_exposure = false;
+ bool is_written = true;
+ PassType divide_type = PASS_NONE;
+ PassType direct_type = PASS_NONE;
+ PassType indirect_type = PASS_NONE;
+
+ /* Pass access for read cannot happen directly and needs some sort of compositing (for example,
+ * light passes due to divide_type, or the shadow catcher pass). */
+ bool use_compositing = false;
+
+ /* Used to disable the albedo pass for denoising.
+ * Light and shadow catcher passes should not have discontinuities in the denoised result based
+ * on the underlying albedo. */
+ bool use_denoising_albedo = true;
+
+ /* Pass supports denoising. */
+ bool support_denoise = false;
+};
+
+class Pass : public Node {
+ public:
+ NODE_DECLARE
+
+ NODE_SOCKET_API(PassType, type)
+ NODE_SOCKET_API(PassMode, mode)
+ NODE_SOCKET_API(ustring, name)
+ NODE_SOCKET_API(bool, include_albedo)
+
+ Pass();
+
+ PassInfo get_info() const;
+
+ /* The pass is written by the render pipeline (kernel or denoiser). If the pass is written it
+ * will have pixels allocated in a RenderBuffer. Passes which are not written do not have their
+ * pixels allocated to save memory. */
+ bool is_written() const;
+
+ protected:
+ /* The pass has been created automatically as a requirement for various rendering
+ * functionality (such as adaptive sampling). */
+ bool is_auto_;
+
+ public:
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_mode_enum();
+
+ static PassInfo get_info(PassType type, const bool include_albedo = false);
+
+ static bool contains(const vector<Pass *> &passes, PassType type);
+
+ /* Returns nullptr if there is no pass with the given name or type+mode. */
+ static const Pass *find(const vector<Pass *> &passes, const string &name);
+ static const Pass *find(const vector<Pass *> &passes,
+ PassType type,
+ PassMode mode = PassMode::NOISY);
+
+ /* Returns PASS_UNUSED if there is no corresponding pass. */
+ static int get_offset(const vector<Pass *> &passes, const Pass *pass);
+
+ friend class Film;
+};
+
+std::ostream &operator<<(std::ostream &os, const Pass &pass);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index c4e7d2c79d6..a4b030190dc 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -163,12 +163,15 @@ void Scene::free_memory(bool final)
delete p;
foreach (Light *l, lights)
delete l;
+ foreach (Pass *p, passes)
+ delete p;
geometry.clear();
objects.clear();
lights.clear();
particle_systems.clear();
procedurals.clear();
+ passes.clear();
if (device) {
camera->device_free(device, &dscene, this);
@@ -253,7 +256,6 @@ void Scene::device_update(Device *device_, Progress &progress)
* - Camera may be used for adaptive subdivision.
* - Displacement shader must have all shader data available.
* - Light manager needs lookup tables and final mesh data to compute emission CDF.
- * - Film needs light manager to run for use_light_visibility
* - Lookup tables are done a second time to handle film tables
*/
@@ -469,88 +471,110 @@ void Scene::enable_update_stats()
}
}
-DeviceRequestedFeatures Scene::get_requested_device_features()
+void Scene::update_kernel_features()
{
- DeviceRequestedFeatures requested_features;
+ if (!need_update()) {
+ return;
+ }
- shader_manager->get_requested_features(this, &requested_features);
+ /* These features are not tweaked as often as shaders, so selective updates
+ * could be done for the viewport as well. */
+ uint kernel_features = shader_manager->get_kernel_features(this);
- /* This features are not being tweaked as often as shaders,
- * so could be done selective magic for the viewport as well.
- */
bool use_motion = need_motion() == Scene::MotionType::MOTION_BLUR;
- requested_features.use_hair = false;
- requested_features.use_hair_thick = (params.hair_shape == CURVE_THICK);
- requested_features.use_object_motion = false;
- requested_features.use_camera_motion = use_motion && camera->use_motion();
+ kernel_features |= KERNEL_FEATURE_PATH_TRACING;
+ if (params.hair_shape == CURVE_THICK) {
+ kernel_features |= KERNEL_FEATURE_HAIR_THICK;
+ }
+ if (use_motion && camera->use_motion()) {
+ kernel_features |= KERNEL_FEATURE_CAMERA_MOTION;
+ }
foreach (Object *object, objects) {
Geometry *geom = object->get_geometry();
if (use_motion) {
- requested_features.use_object_motion |= object->use_motion() | geom->get_use_motion_blur();
- requested_features.use_camera_motion |= geom->get_use_motion_blur();
+ if (object->use_motion() || geom->get_use_motion_blur()) {
+ kernel_features |= KERNEL_FEATURE_OBJECT_MOTION;
+ }
+ if (geom->get_use_motion_blur()) {
+ kernel_features |= KERNEL_FEATURE_CAMERA_MOTION;
+ }
}
if (object->get_is_shadow_catcher()) {
- requested_features.use_shadow_tricks = true;
+ kernel_features |= KERNEL_FEATURE_SHADOW_CATCHER;
}
if (geom->is_mesh()) {
Mesh *mesh = static_cast<Mesh *>(geom);
#ifdef WITH_OPENSUBDIV
if (mesh->get_subdivision_type() != Mesh::SUBDIVISION_NONE) {
- requested_features.use_patch_evaluation = true;
+ kernel_features |= KERNEL_FEATURE_PATCH_EVALUATION;
}
#endif
- requested_features.use_true_displacement |= mesh->has_true_displacement();
}
else if (geom->is_hair()) {
- requested_features.use_hair = true;
+ kernel_features |= KERNEL_FEATURE_HAIR;
}
}
- requested_features.use_background_light = light_manager->has_background_light(this);
-
- requested_features.use_baking = bake_manager->get_baking();
- requested_features.use_integrator_branched = (integrator->get_method() ==
- Integrator::BRANCHED_PATH);
- if (film->get_denoising_data_pass()) {
- requested_features.use_denoising = true;
- requested_features.use_shadow_tricks = true;
+ if (bake_manager->get_baking()) {
+ kernel_features |= KERNEL_FEATURE_BAKING;
}
- return requested_features;
-}
+ kernel_features |= film->get_kernel_features(this);
-bool Scene::update(Progress &progress, bool &kernel_switch_needed)
-{
- /* update scene */
- if (need_update()) {
- /* Update max_closures. */
- KernelIntegrator *kintegrator = &dscene.data.integrator;
- if (params.background) {
- kintegrator->max_closures = get_max_closure_count();
- }
- else {
- /* Currently viewport render is faster with higher max_closures, needs investigating. */
- kintegrator->max_closures = MAX_CLOSURE;
- }
-
- /* Load render kernels, before device update where we upload data to the GPU. */
- bool new_kernels_needed = load_kernels(progress, false);
-
- progress.set_status("Updating Scene");
- MEM_GUARDED_CALL(&progress, device_update, device, progress);
+ dscene.data.kernel_features = kernel_features;
- DeviceKernelStatus kernel_switch_status = device->get_active_kernel_switch_state();
- kernel_switch_needed = kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE ||
- kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
- if (new_kernels_needed || kernel_switch_needed) {
- progress.set_kernel_status("Compiling render kernels");
- device->wait_for_availability(loaded_kernel_features);
- progress.set_kernel_status("");
- }
+ /* Currently viewport render is faster with higher max_closures, needs investigating. */
+ const uint max_closures = (params.background) ? get_max_closure_count() : MAX_CLOSURE;
+ dscene.data.max_closures = max_closures;
+ dscene.data.max_shaders = shaders.size();
+}
- return true;
+bool Scene::update(Progress &progress)
+{
+ if (!need_update()) {
+ return false;
}
- return false;
+
+ /* Load render kernels, before device update where we upload data to the GPU. */
+ load_kernels(progress, false);
+
+ /* Upload scene data to the GPU. */
+ progress.set_status("Updating Scene");
+ MEM_GUARDED_CALL(&progress, device_update, device, progress);
+
+ return true;
+}
+
+static void log_kernel_features(const uint features)
+{
+ VLOG(2) << "Requested features:\n";
+ VLOG(2) << "Use BSDF " << string_from_bool(features & KERNEL_FEATURE_NODE_BSDF) << "\n";
+ VLOG(2) << "Use Principled BSDF " << string_from_bool(features & KERNEL_FEATURE_PRINCIPLED)
+ << "\n";
+ VLOG(2) << "Use Emission " << string_from_bool(features & KERNEL_FEATURE_NODE_EMISSION) << "\n";
+ VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_NODE_VOLUME) << "\n";
+ VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_NODE_HAIR) << "\n";
+ VLOG(2) << "Use Bump " << string_from_bool(features & KERNEL_FEATURE_NODE_BUMP) << "\n";
+ VLOG(2) << "Use Voronoi " << string_from_bool(features & KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+ << "\n";
+ VLOG(2) << "Use Shader Raytrace " << string_from_bool(features & KERNEL_FEATURE_NODE_RAYTRACE)
+ << "\n";
+ VLOG(2) << "Use Transparent " << string_from_bool(features & KERNEL_FEATURE_TRANSPARENT) << "\n";
+ VLOG(2) << "Use Denoising " << string_from_bool(features & KERNEL_FEATURE_DENOISING) << "\n";
+ VLOG(2) << "Use Path Tracing " << string_from_bool(features & KERNEL_FEATURE_PATH_TRACING)
+ << "\n";
+ VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_HAIR) << "\n";
+ VLOG(2) << "Use Object Motion " << string_from_bool(features & KERNEL_FEATURE_OBJECT_MOTION)
+ << "\n";
+ VLOG(2) << "Use Camera Motion " << string_from_bool(features & KERNEL_FEATURE_CAMERA_MOTION)
+ << "\n";
+ VLOG(2) << "Use Baking " << string_from_bool(features & KERNEL_FEATURE_BAKING) << "\n";
+ VLOG(2) << "Use Subsurface " << string_from_bool(features & KERNEL_FEATURE_SUBSURFACE) << "\n";
+ VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_VOLUME) << "\n";
+ VLOG(2) << "Use Patch Evaluation "
+ << string_from_bool(features & KERNEL_FEATURE_PATCH_EVALUATION) << "\n";
+ VLOG(2) << "Use Shadow Catcher " << string_from_bool(features & KERNEL_FEATURE_SHADOW_CATCHER)
+ << "\n";
}
bool Scene::load_kernels(Progress &progress, bool lock_scene)
@@ -560,15 +584,15 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene)
scene_lock = thread_scoped_lock(mutex);
}
- DeviceRequestedFeatures requested_features = get_requested_device_features();
+ const uint kernel_features = dscene.data.kernel_features;
- if (!kernels_loaded || loaded_kernel_features.modified(requested_features)) {
+ if (!kernels_loaded || loaded_kernel_features != kernel_features) {
progress.set_status("Loading render kernels (may take a few minutes the first time)");
scoped_timer timer;
- VLOG(2) << "Requested features:\n" << requested_features;
- if (!device->load_kernels(requested_features)) {
+ log_kernel_features(kernel_features);
+ if (!device->load_kernels(kernel_features)) {
string message = device->error_message();
if (message.empty())
message = "Failed loading render kernel, see console for errors";
@@ -580,7 +604,7 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene)
}
kernels_loaded = true;
- loaded_kernel_features = requested_features;
+ loaded_kernel_features = kernel_features;
return true;
}
return false;
@@ -618,6 +642,28 @@ int Scene::get_max_closure_count()
return max_closure_global;
}
+bool Scene::has_shadow_catcher()
+{
+ if (shadow_catcher_modified_) {
+ has_shadow_catcher_ = false;
+ for (Object *object : objects) {
+ if (object->get_is_shadow_catcher()) {
+ has_shadow_catcher_ = true;
+ break;
+ }
+ }
+
+ shadow_catcher_modified_ = false;
+ }
+
+ return has_shadow_catcher_;
+}
+
+void Scene::tag_shadow_catcher_modified()
+{
+ shadow_catcher_modified_ = true;
+}
+
template<> Light *Scene::create_node<Light>()
{
Light *node = new Light();
@@ -694,6 +740,15 @@ template<> AlembicProcedural *Scene::create_node<AlembicProcedural>()
#endif
}
+template<> Pass *Scene::create_node<Pass>()
+{
+ Pass *node = new Pass();
+ node->set_owner(this);
+ passes.push_back(node);
+ film->tag_modified();
+ return node;
+}
+
template<typename T> void delete_node_from_array(vector<T> &nodes, T node)
{
for (size_t i = 0; i < nodes.size(); ++i) {
@@ -779,6 +834,12 @@ template<> void Scene::delete_node_impl(AlembicProcedural *node)
#endif
}
+template<> void Scene::delete_node_impl(Pass *node)
+{
+ delete_node_from_array(passes, node);
+ film->tag_modified();
+}
+
template<typename T>
static void remove_nodes_in_set(const set<T *> &nodes_set,
vector<T *> &nodes_array,
@@ -842,4 +903,10 @@ template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOw
procedural_manager->tag_update();
}
+template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner)
+{
+ remove_nodes_in_set(nodes, passes, owner);
+ film->tag_modified();
+}
+
CCL_NAMESPACE_END
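The scene.cpp changes above fold the former DeviceRequestedFeatures struct into a single uint of KERNEL_FEATURE_* bits and reload kernels only when that mask changes. A minimal sketch of that flow (not Cycles code; the flag values and loader below are illustrative):

#include <cstdint>
#include <cstdio>

enum : std::uint32_t {
  KERNEL_FEATURE_PATH_TRACING_SK = 1u << 0,
  KERNEL_FEATURE_HAIR_SK = 1u << 1,
  KERNEL_FEATURE_SHADOW_CATCHER_SK = 1u << 2,
};

struct SceneSketch {
  bool has_hair = false;
  bool has_shadow_catcher = false;
  std::uint32_t loaded_features = 0;
  bool kernels_loaded = false;

  std::uint32_t gather_features() const {
    std::uint32_t features = KERNEL_FEATURE_PATH_TRACING_SK;
    if (has_hair)
      features |= KERNEL_FEATURE_HAIR_SK;
    if (has_shadow_catcher)
      features |= KERNEL_FEATURE_SHADOW_CATCHER_SK;
    return features;
  }

  void load_kernels() {
    const std::uint32_t features = gather_features();
    if (!kernels_loaded || loaded_features != features) {  // reload only when the mask changes
      std::printf("compiling kernels for feature mask 0x%x\n", features);
      loaded_features = features;
      kernels_loaded = true;
    }
  }
};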
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 7d8a6774381..cf4a3ba6b12 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -128,7 +128,7 @@ class DeviceScene {
device_vector<float> lookup_table;
/* integrator */
- device_vector<uint> sample_pattern_lut;
+ device_vector<float> sample_pattern_lut;
/* ies lights */
device_vector<float> ies_lights;
@@ -142,27 +142,6 @@ class DeviceScene {
class SceneParams {
public:
- /* Type of BVH, in terms whether it is supported dynamic updates of meshes
- * or whether modifying geometry requires full BVH rebuild.
- */
- enum BVHType {
- /* BVH supports dynamic updates of geometry.
- *
- * Faster for updating BVH tree when doing modifications in viewport,
- * but slower for rendering.
- */
- BVH_DYNAMIC = 0,
- /* BVH tree is calculated for specific scene, updates in geometry
- * requires full tree rebuild.
- *
- * Slower to update BVH tree when modifying objects in viewport, also
- * slower to build final BVH tree but gives best possible render speed.
- */
- BVH_STATIC = 1,
-
- BVH_NUM_TYPES,
- };
-
ShadingSystem shadingsystem;
/* Requested BVH layout.
@@ -186,7 +165,7 @@ class SceneParams {
{
shadingsystem = SHADINGSYSTEM_SVM;
bvh_layout = BVH_LAYOUT_BVH2;
- bvh_type = BVH_DYNAMIC;
+ bvh_type = BVH_TYPE_DYNAMIC;
use_bvh_spatial_split = false;
use_bvh_unaligned_nodes = true;
num_bvh_time_steps = 0;
@@ -196,7 +175,7 @@ class SceneParams {
background = true;
}
- bool modified(const SceneParams &params)
+ bool modified(const SceneParams &params) const
{
return !(shadingsystem == params.shadingsystem && bvh_layout == params.bvh_layout &&
bvh_type == params.bvh_type &&
@@ -236,7 +215,7 @@ class Scene : public NodeOwner {
vector<Shader *> shaders;
vector<Light *> lights;
vector<ParticleSystem *> particle_systems;
- vector<Pass> passes;
+ vector<Pass *> passes;
vector<Procedural *> procedurals;
/* data managers */
@@ -291,7 +270,11 @@ class Scene : public NodeOwner {
void enable_update_stats();
- bool update(Progress &progress, bool &kernel_switch_needed);
+ void update_kernel_features();
+ bool update(Progress &progress);
+
+ bool has_shadow_catcher();
+ void tag_shadow_catcher_modified();
/* This function is used to create a node of a specified type instead of
* calling 'new', and sets the scene as the owner of the node.
@@ -348,13 +331,12 @@ class Scene : public NodeOwner {
void free_memory(bool final);
bool kernels_loaded;
- DeviceRequestedFeatures loaded_kernel_features;
+ uint loaded_kernel_features;
bool load_kernels(Progress &progress, bool lock_scene = true);
- /* ** Split kernel routines ** */
-
- DeviceRequestedFeatures get_requested_device_features();
+ bool has_shadow_catcher_ = false;
+ bool shadow_catcher_modified_ = true;
/* Maximum number of closure during session lifetime. */
int max_closure_global;
@@ -384,6 +366,8 @@ template<> Shader *Scene::create_node<Shader>();
template<> AlembicProcedural *Scene::create_node<AlembicProcedural>();
+template<> Pass *Scene::create_node<Pass>();
+
template<> void Scene::delete_node_impl(Light *node);
template<> void Scene::delete_node_impl(Mesh *node);
@@ -404,6 +388,8 @@ template<> void Scene::delete_node_impl(Procedural *node);
template<> void Scene::delete_node_impl(AlembicProcedural *node);
+template<> void Scene::delete_node_impl(Pass *node);
+
template<> void Scene::delete_nodes(const set<Light *> &nodes, const NodeOwner *owner);
template<> void Scene::delete_nodes(const set<Geometry *> &nodes, const NodeOwner *owner);
@@ -416,6 +402,8 @@ template<> void Scene::delete_nodes(const set<Shader *> &nodes, const NodeOwner
template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOwner *owner);
+template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner);
+
CCL_NAMESPACE_END
#endif /* __SCENE_H__ */
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 1b91c49f0ea..84407f8e6dd 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -17,10 +17,15 @@
#include <limits.h>
#include <string.h>
+#include "device/cpu/device.h"
#include "device/device.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "integrator/path_trace.h"
+#include "render/background.h"
#include "render/bake.h"
#include "render/buffers.h"
#include "render/camera.h"
+#include "render/gpu_display.h"
#include "render/graph.h"
#include "render/integrator.h"
#include "render/light.h"
@@ -39,70 +44,63 @@
CCL_NAMESPACE_BEGIN
-/* Note about preserve_tile_device option for tile manager:
- * progressive refine and viewport rendering does requires tiles to
- * always be allocated for the same device
- */
-Session::Session(const SessionParams &params_)
- : params(params_),
- tile_manager(params.progressive,
- params.samples,
- params.tile_size,
- params.start_resolution,
- params.background == false || params.progressive_refine,
- params.background,
- params.tile_order,
- max(params.device.multi_devices.size(), 1),
- params.pixel_size),
- stats(),
- profiler()
+Session::Session(const SessionParams &params_, const SceneParams &scene_params)
+ : params(params_), render_scheduler_(tile_manager_, params)
{
- device_use_gl_ = ((params.device.type != DEVICE_CPU) && !params.background);
-
TaskScheduler::init(params.threads);
- session_thread_ = NULL;
- scene = NULL;
-
- reset_time_ = 0.0;
- last_update_time_ = 0.0;
+ session_thread_ = nullptr;
delayed_reset_.do_reset = false;
- delayed_reset_.samples = 0;
-
- display_outdated_ = false;
- gpu_draw_ready_ = false;
- gpu_need_display_buffer_update_ = false;
pause_ = false;
cancel_ = false;
new_work_added_ = false;
- buffers = NULL;
- display = NULL;
+ device = Device::create(params.device, stats, profiler);
- /* Validate denoising parameters. */
- set_denoising(params.denoising);
+ scene = new Scene(scene_params, device);
- /* Create CPU/GPU devices. */
- device = Device::create(params.device, stats, profiler, params.background);
-
- if (!device->error_message().empty()) {
- progress.set_error(device->error_message());
- return;
- }
+ /* Configure path tracer. */
+ path_trace_ = make_unique<PathTrace>(
+ device, scene->film, &scene->dscene, render_scheduler_, tile_manager_);
+ path_trace_->set_progress(&progress);
+ path_trace_->tile_buffer_update_cb = [&]() {
+ if (!update_render_tile_cb) {
+ return;
+ }
+ update_render_tile_cb();
+ };
+ path_trace_->tile_buffer_write_cb = [&]() {
+ if (!write_render_tile_cb) {
+ return;
+ }
+ write_render_tile_cb();
+ };
+ path_trace_->tile_buffer_read_cb = [&]() -> bool {
+ if (!read_render_tile_cb) {
+ return false;
+ }
+ read_render_tile_cb();
+ return true;
+ };
+ path_trace_->progress_update_cb = [&]() { update_status_time(); };
- /* Create buffers for interactive rendering. */
- if (!(params.background && !params.write_render_cb)) {
- buffers = new RenderBuffers(device);
- display = new DisplayBuffer(device, params.display_buffer_linear);
- }
+ tile_manager_.full_buffer_written_cb = [&](string_view filename) {
+ if (!full_buffer_written_cb) {
+ return;
+ }
+ full_buffer_written_cb(filename);
+ };
}
Session::~Session()
{
cancel();
+ /* TODO(sergey): Bring the passes in the viewport back.
+ * It is unclear why such an exception is needed, though. */
+#if 0
if (buffers && params.write_render_cb) {
/* Copy to display buffer and write out image if requested */
delete display;
@@ -116,12 +114,14 @@ Session::~Session()
uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h);
params.write_render_cb((uchar *)pixels, w, h, 4);
}
+#endif
- /* clean up */
- tile_manager.device_free();
+ /* Make sure the path tracer is destroyed before the device. This is needed because its
+ * destruction might need to access the device to free device memory. */
+ /* TODO(sergey): Convert device to be unique_ptr, and rely on C++ to destruct objects in the
+ * pre-defined order. */
+ path_trace_.reset();
- delete buffers;
- delete display;
delete scene;
delete device;
@@ -135,15 +135,16 @@ void Session::start()
}
}
-void Session::cancel()
+void Session::cancel(bool quick)
{
+ if (quick && path_trace_) {
+ path_trace_->cancel();
+ }
+
if (session_thread_) {
/* wait for session thread to end */
progress.set_cancel("Exiting");
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
-
{
thread_scoped_lock pause_lock(pause_mutex_);
pause_ = false;
@@ -157,570 +158,43 @@ void Session::cancel()
bool Session::ready_to_reset()
{
- double dt = time_dt() - reset_time_;
-
- if (!display_outdated_)
- return (dt > params.reset_timeout);
- else
- return (dt > params.cancel_timeout);
+ return path_trace_->ready_to_reset();
}
-/* GPU Session */
-
-void Session::reset_gpu(BufferParams &buffer_params, int samples)
+void Session::run_main_render_loop()
{
- thread_scoped_lock pause_lock(pause_mutex_);
-
- /* block for buffer access and reset immediately. we can't do this
- * in the thread, because we need to allocate an OpenGL buffer, and
- * that only works in the main thread */
- thread_scoped_lock display_lock(display_mutex_);
- thread_scoped_lock buffers_lock(buffers_mutex_);
+ path_trace_->clear_gpu_display();
- display_outdated_ = true;
- reset_time_ = time_dt();
+ while (true) {
+ RenderWork render_work = run_update_for_next_iteration();
- reset_(buffer_params, samples);
-
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
-
- new_work_added_ = true;
-
- pause_cond_.notify_all();
-}
-
-bool Session::draw_gpu(BufferParams &buffer_params, DeviceDrawParams &draw_params)
-{
- /* block for buffer access */
- thread_scoped_lock display_lock(display_mutex_);
-
- /* first check we already rendered something */
- if (gpu_draw_ready_) {
- /* then verify the buffers have the expected size, so we don't
- * draw previous results in a resized window */
- if (buffer_params.width == display->params.width &&
- buffer_params.height == display->params.height) {
- /* for CUDA we need to do tone-mapping still, since we can
- * only access GL buffers from the main thread. */
- if (gpu_need_display_buffer_update_) {
- thread_scoped_lock buffers_lock(buffers_mutex_);
- copy_to_display_buffer(tile_manager.state.sample);
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
+ if (!render_work) {
+ if (VLOG_IS_ON(2)) {
+ double total_time, render_time;
+ progress.get_time(total_time, render_time);
+ VLOG(2) << "Rendering in main loop is done in " << render_time << " seconds.";
+ VLOG(2) << path_trace_->full_report();
}
- display->draw(device, draw_params);
-
- if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout)
- return false;
-
- return true;
- }
- }
-
- return false;
-}
-
-void Session::run_gpu()
-{
- bool tiles_written = false;
-
- reset_time_ = time_dt();
- last_update_time_ = time_dt();
- last_display_time_ = last_update_time_;
-
- progress.set_render_start_time();
-
- while (!progress.get_cancel()) {
- const bool no_tiles = !run_update_for_next_iteration();
-
- if (no_tiles) {
if (params.background) {
- /* if no work left and in background mode, we can stop immediately */
+ /* if no work left and in background mode, we can stop immediately. */
progress.set_status("Finished");
break;
}
}
- if (run_wait_for_work(no_tiles)) {
- continue;
- }
-
- if (progress.get_cancel()) {
- break;
- }
-
- if (!no_tiles) {
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- if (progress.get_cancel())
- break;
-
- /* buffers mutex is locked entirely while rendering each
- * sample, and released/reacquired on each iteration to allow
- * reset and draw in between */
- thread_scoped_lock buffers_lock(buffers_mutex_);
-
- /* update status and timing */
- update_status_time();
-
- /* render */
- bool delayed_denoise = false;
- const bool need_denoise = render_need_denoise(delayed_denoise);
- render(need_denoise);
-
- device->task_wait();
-
- if (!device->error_message().empty())
- progress.set_cancel(device->error_message());
-
- /* update status and timing */
- update_status_time();
-
- gpu_need_display_buffer_update_ = !delayed_denoise;
- gpu_draw_ready_ = true;
- progress.set_update();
-
- /* wait for until display buffer is updated */
- if (!params.background) {
- while (gpu_need_display_buffer_update_) {
- if (progress.get_cancel())
- break;
-
- gpu_need_display_buffer_update_cond_.wait(buffers_lock);
- }
- }
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- tiles_written = update_progressive_refine(progress.get_cancel());
-
- if (progress.get_cancel())
- break;
- }
- }
-
- if (!tiles_written)
- update_progressive_refine(true);
-}
-
-/* CPU Session */
-
-void Session::reset_cpu(BufferParams &buffer_params, int samples)
-{
- thread_scoped_lock reset_lock(delayed_reset_.mutex);
- thread_scoped_lock pause_lock(pause_mutex_);
-
- display_outdated_ = true;
- reset_time_ = time_dt();
-
- delayed_reset_.params = buffer_params;
- delayed_reset_.samples = samples;
- delayed_reset_.do_reset = true;
- device->task_cancel();
-
- pause_cond_.notify_all();
-}
-
-bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_params)
-{
- thread_scoped_lock display_lock(display_mutex_);
-
- /* first check we already rendered something */
- if (display->draw_ready()) {
- /* then verify the buffers have the expected size, so we don't
- * draw previous results in a resized window */
- if (buffer_params.width == display->params.width &&
- buffer_params.height == display->params.height) {
- display->draw(device, draw_params);
-
- if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout)
- return false;
-
- return true;
- }
- }
-
- return false;
-}
-
-bool Session::steal_tile(RenderTile &rtile, Device *tile_device, thread_scoped_lock &tile_lock)
-{
- /* Devices that can get their tiles stolen don't steal tiles themselves.
- * Additionally, if there are no stealable tiles in flight, give up here. */
- if (tile_device->info.type == DEVICE_CPU || stealable_tiles_ == 0) {
- return false;
- }
-
- /* Wait until no other thread is trying to steal a tile. */
- while (tile_stealing_state_ != NOT_STEALING && stealable_tiles_ > 0) {
- /* Someone else is currently trying to get a tile.
- * Wait on the condition variable and try later. */
- tile_steal_cond_.wait(tile_lock);
- }
- /* If another thread stole the last stealable tile in the meantime, give up. */
- if (stealable_tiles_ == 0) {
- return false;
- }
-
- /* There are stealable tiles in flight, so signal that one should be released. */
- tile_stealing_state_ = WAITING_FOR_TILE;
-
- /* Wait until a device notices the signal and releases its tile. */
- while (tile_stealing_state_ != GOT_TILE && stealable_tiles_ > 0) {
- tile_steal_cond_.wait(tile_lock);
- }
- /* If the last stealable tile finished on its own, give up. */
- if (tile_stealing_state_ != GOT_TILE) {
- tile_stealing_state_ = NOT_STEALING;
- return false;
- }
-
- /* Successfully stole a tile, now move it to the new device. */
- rtile = stolen_tile_;
- rtile.buffers->buffer.move_device(tile_device);
- rtile.buffer = rtile.buffers->buffer.device_pointer;
- rtile.stealing_state = RenderTile::NO_STEALING;
- rtile.num_samples -= (rtile.sample - rtile.start_sample);
- rtile.start_sample = rtile.sample;
-
- tile_stealing_state_ = NOT_STEALING;
-
- /* Poke any threads which might be waiting for NOT_STEALING above. */
- tile_steal_cond_.notify_one();
-
- return true;
-}
-
-bool Session::get_tile_stolen()
-{
- /* If tile_stealing_state is WAITING_FOR_TILE, atomically set it to RELEASING_TILE
- * and return true. */
- TileStealingState expected = WAITING_FOR_TILE;
- return tile_stealing_state_.compare_exchange_weak(expected, RELEASING_TILE);
-}
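
The compare-exchange above is the heart of the (now removed) tile-stealing handshake: at most one rendering thread may flip the shared state from WAITING_FOR_TILE to RELEASING_TILE per steal request. A minimal standalone sketch of that pattern with std::atomic, using illustrative names rather than the actual Cycles types:

  #include <atomic>

  enum TileStealingState { NOT_STEALING, WAITING_FOR_TILE, RELEASING_TILE, GOT_TILE };

  static std::atomic<TileStealingState> tile_stealing_state{NOT_STEALING};

  /* Returns true for at most one caller per steal request; that caller is then
   * responsible for handing its tile over. compare_exchange_strong only succeeds
   * if the state is still WAITING_FOR_TILE, atomically moving it to RELEASING_TILE. */
  bool should_release_tile()
  {
    TileStealingState expected = WAITING_FOR_TILE;
    return tile_stealing_state.compare_exchange_strong(expected, RELEASING_TILE);
  }
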
-
-bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types)
-{
- if (progress.get_cancel()) {
- if (params.progressive_refine == false) {
- /* for progressive refine current sample should be finished for all tiles */
- return false;
- }
- }
-
- thread_scoped_lock tile_lock(tile_mutex_);
-
- /* get next tile from manager */
- Tile *tile;
- int device_num = device->device_number(tile_device);
-
- while (!tile_manager.next_tile(tile, device_num, tile_types)) {
- /* Can only steal tiles on devices that support rendering
- * This is because denoising tiles cannot be stolen (see below)
- */
- if ((tile_types & (RenderTile::PATH_TRACE | RenderTile::BAKE)) &&
- steal_tile(rtile, tile_device, tile_lock)) {
- return true;
- }
-
- /* Wait for denoising tiles to become available */
- if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) {
- denoising_cond_.wait(tile_lock);
- continue;
- }
-
- return false;
- }
-
- /* fill render tile */
- rtile.x = tile_manager.state.buffer.full_x + tile->x;
- rtile.y = tile_manager.state.buffer.full_y + tile->y;
- rtile.w = tile->w;
- rtile.h = tile->h;
- rtile.start_sample = tile_manager.state.sample;
- rtile.num_samples = tile_manager.state.num_samples;
- rtile.resolution = tile_manager.state.resolution_divider;
- rtile.tile_index = tile->index;
- rtile.stealing_state = RenderTile::NO_STEALING;
-
- if (tile->state == Tile::DENOISE) {
- rtile.task = RenderTile::DENOISE;
- }
- else {
- if (tile_device->info.type == DEVICE_CPU) {
- stealable_tiles_++;
- rtile.stealing_state = RenderTile::CAN_BE_STOLEN;
- }
-
- if (read_bake_tile_cb) {
- rtile.task = RenderTile::BAKE;
- }
- else {
- rtile.task = RenderTile::PATH_TRACE;
- }
- }
-
- tile_lock.unlock();
-
- /* in case of a permanent buffer, return it, otherwise we will allocate
- * a new temporary buffer */
- if (buffers) {
- tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = buffers->buffer.device_pointer;
- rtile.buffers = buffers;
-
- device->map_tile(tile_device, rtile);
-
- /* Reset copy state, since buffer contents change after the tile was acquired */
- buffers->map_neighbor_copied = false;
-
- /* This hack ensures that the copy in 'MultiDevice::map_neighbor_tiles' accounts
- * for the buffer resolution divider. */
- buffers->buffer.data_width = (buffers->params.width * buffers->params.get_passes_size()) /
- tile_manager.state.resolution_divider;
- buffers->buffer.data_height = buffers->params.height / tile_manager.state.resolution_divider;
-
- return true;
- }
-
- if (tile->buffers == NULL) {
- /* fill buffer parameters */
- BufferParams buffer_params = tile_manager.params;
- buffer_params.full_x = rtile.x;
- buffer_params.full_y = rtile.y;
- buffer_params.width = rtile.w;
- buffer_params.height = rtile.h;
-
- /* allocate buffers */
- tile->buffers = new RenderBuffers(tile_device);
- tile->buffers->reset(buffer_params);
- }
- else if (tile->buffers->buffer.device != tile_device) {
- /* Move buffer to current tile device again in case it was stolen before.
- * Not needed for denoising since that already handles mapping of tiles and
- * neighbors to its own device. */
- if (rtile.task != RenderTile::DENOISE) {
- tile->buffers->buffer.move_device(tile_device);
- }
- }
-
- tile->buffers->map_neighbor_copied = false;
-
- tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = tile->buffers->buffer.device_pointer;
- rtile.buffers = tile->buffers;
- rtile.sample = tile_manager.state.sample;
-
- if (read_bake_tile_cb) {
- /* This will read any passes needed as input for baking. */
- if (tile_manager.state.sample == tile_manager.range_start_sample) {
- {
- thread_scoped_lock tile_lock(tile_mutex_);
- read_bake_tile_cb(rtile);
- }
- rtile.buffers->buffer.copy_to_device();
- }
- }
- else {
- /* This will tag tile as IN PROGRESS in blender-side render pipeline,
- * which is needed to highlight currently rendering tile before first
- * sample was processed for it. */
- update_tile_sample(rtile);
- }
-
- return true;
-}
-
-void Session::update_tile_sample(RenderTile &rtile)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- if (update_render_tile_cb) {
- if (params.progressive_refine == false) {
- /* todo: optimize this by making it thread safe and removing lock */
-
- update_render_tile_cb(rtile, true);
- }
- }
-
- update_status_time();
-}
-
-void Session::release_tile(RenderTile &rtile, const bool need_denoise)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- if (rtile.stealing_state != RenderTile::NO_STEALING) {
- stealable_tiles_--;
- if (rtile.stealing_state == RenderTile::WAS_STOLEN) {
- /* If the tile is being stolen, don't release it here - the new device will pick up where
- * the old one left off. */
-
- assert(tile_stealing_state_ == RELEASING_TILE);
- assert(rtile.sample < rtile.start_sample + rtile.num_samples);
-
- tile_stealing_state_ = GOT_TILE;
- stolen_tile_ = rtile;
- tile_steal_cond_.notify_all();
- return;
- }
- else if (stealable_tiles_ == 0) {
- /* If this was the last stealable tile, wake up any threads still waiting for one. */
- tile_steal_cond_.notify_all();
- }
- }
-
- progress.add_finished_tile(rtile.task == RenderTile::DENOISE);
-
- bool delete_tile;
-
- if (tile_manager.finish_tile(rtile.tile_index, need_denoise, delete_tile)) {
- /* Finished tile pixels write. */
- if (write_render_tile_cb && params.progressive_refine == false) {
- write_render_tile_cb(rtile);
- }
-
- if (delete_tile) {
- delete rtile.buffers;
- tile_manager.state.tiles[rtile.tile_index].buffers = NULL;
- }
- }
- else {
- /* In progress tile pixels update. */
- if (update_render_tile_cb && params.progressive_refine == false) {
- update_render_tile_cb(rtile, false);
- }
- }
-
- update_status_time();
-
- /* Notify denoising thread that a tile was finished. */
- denoising_cond_.notify_all();
-}
-
-void Session::map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- const int4 image_region = make_int4(
- tile_manager.state.buffer.full_x,
- tile_manager.state.buffer.full_y,
- tile_manager.state.buffer.full_x + tile_manager.state.buffer.width,
- tile_manager.state.buffer.full_y + tile_manager.state.buffer.height);
-
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
-
- if (!tile_manager.schedule_denoising) {
- /* Fix up tile slices with overlap. */
- if (tile_manager.slice_overlap != 0) {
- int y = max(center_tile.y - tile_manager.slice_overlap, image_region.y);
- center_tile.h = min(center_tile.y + center_tile.h + tile_manager.slice_overlap,
- image_region.w) -
- y;
- center_tile.y = y;
- }
-
- /* Tiles are not being denoised individually, which means the entire image is processed. */
- neighbors.set_bounds_from_center();
- }
- else {
- int center_idx = center_tile.tile_index;
- assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
-
- for (int dy = -1, i = 0; dy <= 1; dy++) {
- for (int dx = -1; dx <= 1; dx++, i++) {
- RenderTile &rtile = neighbors.tiles[i];
- int nindex = tile_manager.get_neighbor_index(center_idx, i);
- if (nindex >= 0) {
- Tile *tile = &tile_manager.state.tiles[nindex];
-
- rtile.x = image_region.x + tile->x;
- rtile.y = image_region.y + tile->y;
- rtile.w = tile->w;
- rtile.h = tile->h;
-
- if (buffers) {
- tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = buffers->buffer.device_pointer;
- rtile.buffers = buffers;
- }
- else {
- assert(tile->buffers);
- tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = tile->buffers->buffer.device_pointer;
- rtile.buffers = tile->buffers;
- }
- }
- else {
- int px = center_tile.x + dx * params.tile_size.x;
- int py = center_tile.y + dy * params.tile_size.y;
-
- rtile.x = clamp(px, image_region.x, image_region.z);
- rtile.y = clamp(py, image_region.y, image_region.w);
- rtile.w = rtile.h = 0;
-
- rtile.buffer = (device_ptr)NULL;
- rtile.buffers = NULL;
- }
- }
- }
- }
-
- assert(center_tile.buffers);
- device->map_neighbor_tiles(tile_device, neighbors);
-
- /* The denoised result is written back to the original tile. */
- neighbors.target = center_tile;
-}
-
-void Session::unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
- device->unmap_neighbor_tiles(tile_device, neighbors);
-}
-
-void Session::run_cpu()
-{
- bool tiles_written = false;
-
- last_update_time_ = time_dt();
- last_display_time_ = last_update_time_;
-
- while (!progress.get_cancel()) {
- const bool no_tiles = !run_update_for_next_iteration();
- bool need_copy_to_display_buffer = false;
-
- if (no_tiles) {
- if (params.background) {
- /* if no work left and in background mode, we can stop immediately */
- progress.set_status("Finished");
+ const bool did_cancel = progress.get_cancel();
+ if (did_cancel) {
+ render_scheduler_.render_work_reschedule_on_cancel(render_work);
+ if (!render_work) {
break;
}
}
-
- if (run_wait_for_work(no_tiles)) {
+ else if (run_wait_for_work(render_work)) {
continue;
}
- if (progress.get_cancel()) {
- break;
- }
-
- if (!no_tiles) {
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- if (progress.get_cancel())
- break;
-
+ {
/* buffers mutex is locked entirely while rendering each
* sample, and released/reacquired on each iteration to allow
* reset and draw in between */
@@ -730,49 +204,25 @@ void Session::run_cpu()
update_status_time();
/* render */
- bool delayed_denoise = false;
- const bool need_denoise = render_need_denoise(delayed_denoise);
- render(need_denoise);
+ path_trace_->render(render_work);
/* update status and timing */
update_status_time();
- if (!params.background)
- need_copy_to_display_buffer = !delayed_denoise;
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
- }
-
- device->task_wait();
-
- {
- thread_scoped_lock reset_lock(delayed_reset_.mutex);
- thread_scoped_lock buffers_lock(buffers_mutex_);
- thread_scoped_lock display_lock(display_mutex_);
-
- if (delayed_reset_.do_reset) {
- /* reset rendering if request from main thread */
- delayed_reset_.do_reset = false;
- reset_(delayed_reset_.params, delayed_reset_.samples);
- }
- else if (need_copy_to_display_buffer) {
- /* Only copy to display_buffer if we do not reset, we don't
- * want to show the result of an incomplete sample */
- copy_to_display_buffer(tile_manager.state.sample);
+ if (device->have_error()) {
+ const string &error_message = device->error_message();
+ progress.set_error(error_message);
+ progress.set_cancel(error_message);
+ break;
}
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- tiles_written = update_progressive_refine(progress.get_cancel());
}
progress.set_update();
- }
- if (!tiles_written)
- update_progressive_refine(true);
+ if (did_cancel) {
+ break;
+ }
+ }
}
void Session::run()
@@ -789,10 +239,7 @@ void Session::run()
/* reset number of rendered samples */
progress.reset_sample();
- if (device_use_gl_)
- run_gpu();
- else
- run_cpu();
+ run_main_render_loop();
}
profiler.stop();
@@ -804,31 +251,92 @@ void Session::run()
progress.set_update();
}
-bool Session::run_update_for_next_iteration()
+RenderWork Session::run_update_for_next_iteration()
{
+ RenderWork render_work;
+
thread_scoped_lock scene_lock(scene->mutex);
thread_scoped_lock reset_lock(delayed_reset_.mutex);
+ bool have_tiles = true;
+ bool switched_to_new_tile = false;
+
if (delayed_reset_.do_reset) {
thread_scoped_lock buffers_lock(buffers_mutex_);
- reset_(delayed_reset_.params, delayed_reset_.samples);
- delayed_reset_.do_reset = false;
+ do_delayed_reset();
+
+ /* After reset make sure the tile manager is at the first big tile. */
+ have_tiles = tile_manager_.next();
+ switched_to_new_tile = true;
+ }
+
+ /* Update number of samples in the integrator.
+   * Ideally this would happen once in `Session::set_samples()`, but the issue there is the
+   * initial configuration when the Session is created, where `set_samples()` is not used. */
+ scene->integrator->set_aa_samples(params.samples);
+
+ /* Update denoiser settings. */
+ {
+ const DenoiseParams denoise_params = scene->integrator->get_denoise_params();
+ path_trace_->set_denoiser_params(denoise_params);
+ }
+
+ /* Update adaptive sampling. */
+ {
+ const AdaptiveSampling adaptive_sampling = scene->integrator->get_adaptive_sampling();
+ path_trace_->set_adaptive_sampling(adaptive_sampling);
}
- const bool have_tiles = tile_manager.next();
+ render_scheduler_.set_num_samples(params.samples);
+ render_scheduler_.set_time_limit(params.time_limit);
+
+ while (have_tiles) {
+ render_work = render_scheduler_.get_render_work();
+ if (render_work) {
+ break;
+ }
- if (have_tiles) {
+ progress.add_finished_tile(false);
+
+ have_tiles = tile_manager_.next();
+ if (have_tiles) {
+ render_scheduler_.reset_for_next_tile();
+ switched_to_new_tile = true;
+ }
+ }
+
+ if (render_work) {
scoped_timer update_timer;
- if (update_scene()) {
+
+ if (switched_to_new_tile) {
+ BufferParams tile_params = buffer_params_;
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ tile_params.width = tile.width;
+ tile_params.height = tile.height;
+ tile_params.full_x = tile.x + buffer_params_.full_x;
+ tile_params.full_y = tile.y + buffer_params_.full_y;
+ tile_params.full_width = buffer_params_.full_width;
+ tile_params.full_height = buffer_params_.full_height;
+ tile_params.update_offset_stride();
+
+ path_trace_->reset(buffer_params_, tile_params);
+ }
+
+ const int resolution = render_work.resolution_divider;
+ const int width = max(1, buffer_params_.full_width / resolution);
+ const int height = max(1, buffer_params_.full_height / resolution);
+
+ if (update_scene(width, height)) {
profiler.reset(scene->shaders.size(), scene->objects.size());
}
progress.add_skip_time(update_timer, params.background);
}
- return have_tiles;
+ return render_work;
}
-bool Session::run_wait_for_work(bool no_tiles)
+bool Session::run_wait_for_work(const RenderWork &render_work)
{
/* In an offline rendering there is no pause, and no tiles will mean the job is fully done. */
if (params.background) {
@@ -837,19 +345,20 @@ bool Session::run_wait_for_work(bool no_tiles)
thread_scoped_lock pause_lock(pause_mutex_);
- if (!pause_ && !no_tiles) {
+ if (!pause_ && render_work) {
/* Rendering is not paused and there is work to be done. No need to wait for anything. */
return false;
}
- update_status_time(pause_, no_tiles);
+ const bool no_work = !render_work;
+ update_status_time(pause_, no_work);
/* Only leave the loop when rendering is not paused. But even if the current render is un-paused
* but there is nothing to render keep waiting until new work is added. */
while (!cancel_) {
scoped_timer pause_timer;
- if (!pause_ && (!no_tiles || new_work_added_ || delayed_reset_.do_reset)) {
+ if (!pause_ && (render_work || new_work_added_ || delayed_reset_.do_reset)) {
break;
}
@@ -860,52 +369,88 @@ bool Session::run_wait_for_work(bool no_tiles)
progress.add_skip_time(pause_timer, params.background);
}
- update_status_time(pause_, no_tiles);
+ update_status_time(pause_, no_work);
progress.set_update();
}
new_work_added_ = false;
- return no_tiles;
+ return no_work;
}
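
The loop above is a standard condition-variable wait: sleep while rendering is paused or there is nothing to render, and re-check the predicate after every wakeup. A reduced sketch of the same pattern using standard C++ primitives (the struct and names are illustrative, not the actual Session members):

  #include <condition_variable>
  #include <mutex>

  struct WaitState {
    std::mutex mutex;
    std::condition_variable cond;
    bool paused = false;
    bool new_work_added = false;
  };

  /* Block until rendering is unpaused and there is (or may be) work to do. */
  void wait_for_work(WaitState &ws, bool have_work)
  {
    std::unique_lock<std::mutex> lock(ws.mutex);
    ws.cond.wait(lock, [&] { return !ws.paused && (have_work || ws.new_work_added); });
    ws.new_work_added = false;
  }
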
-bool Session::draw(BufferParams &buffer_params, DeviceDrawParams &draw_params)
+void Session::draw()
{
- if (device_use_gl_)
- return draw_gpu(buffer_params, draw_params);
- else
- return draw_cpu(buffer_params, draw_params);
+ path_trace_->draw();
}
-void Session::reset_(BufferParams &buffer_params, int samples)
+int2 Session::get_effective_tile_size() const
{
- if (buffers && buffer_params.modified(tile_manager.params)) {
- gpu_draw_ready_ = false;
- buffers->reset(buffer_params);
- if (display) {
- display->reset(buffer_params);
- }
+ /* No support yet for baking with tiles. */
+ if (!params.use_auto_tile || scene->bake_manager->get_baking()) {
+ return make_int2(buffer_params_.width, buffer_params_.height);
}
- tile_manager.reset(buffer_params, samples);
- stealable_tiles_ = 0;
- tile_stealing_state_ = NOT_STEALING;
- progress.reset_sample();
+ /* TODO(sergey): Take available memory into account, and if there is enough memory do not tile
+ * and prefer optimal performance. */
+
+ return make_int2(params.tile_size, params.tile_size);
+}
+
+void Session::do_delayed_reset()
+{
+ if (!delayed_reset_.do_reset) {
+ return;
+ }
+ delayed_reset_.do_reset = false;
+
+ params = delayed_reset_.session_params;
+ buffer_params_ = delayed_reset_.buffer_params;
+
+  /* Store parameters used for buffer access outside of the scene graph. */
+ buffer_params_.exposure = scene->film->get_exposure();
+ buffer_params_.use_approximate_shadow_catcher =
+ scene->film->get_use_approximate_shadow_catcher();
+ buffer_params_.use_transparent_background = scene->background->get_transparent();
- bool show_progress = params.background || tile_manager.get_num_effective_samples() != INT_MAX;
- progress.set_total_pixel_samples(show_progress ? tile_manager.state.total_pixel_samples : 0);
+ /* Tile and work scheduling. */
+ tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size());
+ render_scheduler_.reset(buffer_params_, params.samples);
- if (!params.background)
+ /* Passes. */
+  /* When multiple tiles are used, the SAMPLE_COUNT pass is used to keep track of possible
+   * partial tile results. It is safe to use the generic update function here, which checks for
+   * changes, since a change in the tile settings re-creates the session, which ensures the film
+   * is fully updated on tile changes. */
+ scene->film->update_passes(scene, tile_manager_.has_multiple_tiles());
+
+ /* Update for new state of scene and passes. */
+ buffer_params_.update_passes(scene->passes);
+ tile_manager_.update(buffer_params_, scene);
+
+ /* Progress. */
+ progress.reset_sample();
+ progress.set_total_pixel_samples(buffer_params_.width * buffer_params_.height * params.samples);
+
+ if (!params.background) {
progress.set_start_time();
+ }
progress.set_render_start_time();
}
-void Session::reset(BufferParams &buffer_params, int samples)
+void Session::reset(const SessionParams &session_params, const BufferParams &buffer_params)
{
- if (device_use_gl_)
- reset_gpu(buffer_params, samples);
- else
- reset_cpu(buffer_params, samples);
+ {
+ thread_scoped_lock reset_lock(delayed_reset_.mutex);
+ thread_scoped_lock pause_lock(pause_mutex_);
+
+ delayed_reset_.do_reset = true;
+ delayed_reset_.session_params = session_params;
+ delayed_reset_.buffer_params = buffer_params;
+
+ path_trace_->cancel();
+ }
+
+ pause_cond_.notify_all();
}
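
reset() only records the request and wakes the render thread; the actual state change happens later in do_delayed_reset(), at a point where no kernels are touching the buffers. A minimal sketch of that delayed-reset pattern, with a placeholder type standing in for the real SessionParams/BufferParams:

  #include <mutex>

  struct Params { int samples = 0; };  /* Placeholder for the real parameter types. */

  struct DelayedReset {
    std::mutex mutex;
    bool do_reset = false;
    Params params;
  };

  /* Main/UI thread: only record the request (the real code also cancels
   * in-flight path tracing and notifies the render thread). */
  void request_reset(DelayedReset &dr, const Params &params)
  {
    std::lock_guard<std::mutex> lock(dr.mutex);
    dr.params = params;
    dr.do_reset = true;
  }

  /* Render thread: apply the request at a point where buffers are not in use. */
  void apply_reset_if_needed(DelayedReset &dr, Params &active_params)
  {
    std::lock_guard<std::mutex> lock(dr.mutex);
    if (dr.do_reset) {
      dr.do_reset = false;
      active_params = dr.params;
    }
  }
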
void Session::set_samples(int samples)
@@ -915,7 +460,22 @@ void Session::set_samples(int samples)
}
params.samples = samples;
- tile_manager.set_samples(samples);
+
+ {
+ thread_scoped_lock pause_lock(pause_mutex_);
+ new_work_added_ = true;
+ }
+
+ pause_cond_.notify_all();
+}
+
+void Session::set_time_limit(double time_limit)
+{
+ if (time_limit == params.time_limit) {
+ return;
+ }
+
+ params.time_limit = time_limit;
{
thread_scoped_lock pause_lock(pause_mutex_);
@@ -948,38 +508,9 @@ void Session::set_pause(bool pause)
}
}
-void Session::set_denoising(const DenoiseParams &denoising)
+void Session::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
{
- bool need_denoise = denoising.need_denoising_task();
-
- /* Lock buffers so no denoising operation is triggered while the settings are changed here. */
- thread_scoped_lock buffers_lock(buffers_mutex_);
- params.denoising = denoising;
-
- if (!(params.device.denoisers & denoising.type)) {
- if (need_denoise) {
- progress.set_error("Denoiser type not supported by compute device");
- }
-
- params.denoising.use = false;
- need_denoise = false;
- }
-
- // TODO(pmours): Query the required overlap value for denoising from the device?
- tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0;
-
- /* Schedule per tile denoising for final renders if we are either denoising or
- * need prefiltered passes for the native denoiser. */
- tile_manager.schedule_denoising = need_denoise && !buffers;
-}
-
-void Session::set_denoising_start_sample(int sample)
-{
- if (sample != params.denoising.start_sample) {
- params.denoising.start_sample = sample;
-
- pause_cond_.notify_all();
- }
+ path_trace_->set_gpu_display(move(gpu_display));
}
void Session::wait()
@@ -989,81 +520,67 @@ void Session::wait()
delete session_thread_;
}
- session_thread_ = NULL;
+ session_thread_ = nullptr;
}
-bool Session::update_scene()
+bool Session::update_scene(int width, int height)
{
- /* update camera if dimensions changed for progressive render. the camera
+ /* Update camera if dimensions changed for progressive render. the camera
* knows nothing about progressive or cropped rendering, it just gets the
- * image dimensions passed in */
+ * image dimensions passed in. */
Camera *cam = scene->camera;
- int width = tile_manager.state.buffer.full_width;
- int height = tile_manager.state.buffer.full_height;
- int resolution = tile_manager.state.resolution_divider;
-
- cam->set_screen_size_and_resolution(width, height, resolution);
+ cam->set_screen_size(width, height);
- /* number of samples is needed by multi jittered
- * sampling pattern and by baking */
- Integrator *integrator = scene->integrator;
- BakeManager *bake_manager = scene->bake_manager;
+ /* First detect which kernel features are used and allocate working memory.
+   * This helps estimate how much device memory is available for the scene and
+ * how much we need to allocate on the host instead. */
+ scene->update_kernel_features();
- if (integrator->get_sampling_pattern() != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) {
- integrator->set_aa_samples(tile_manager.num_samples);
- }
+ path_trace_->load_kernels();
+ path_trace_->alloc_work_memory();
- bool kernel_switch_needed = false;
- if (scene->update(progress, kernel_switch_needed)) {
- if (kernel_switch_needed) {
- reset(tile_manager.params, params.samples);
- }
+ if (scene->update(progress)) {
return true;
}
+
return false;
}
+static string status_append(const string &status, const string &suffix)
+{
+ string prefix = status;
+ if (!prefix.empty()) {
+ prefix += ", ";
+ }
+ return prefix + suffix;
+}
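
For example, the helper composes the status line pieces below (hypothetical values) into a single comma-separated string:

  string substatus;
  substatus = status_append(substatus, "Rendered 3/8 Tiles");
  substatus = status_append(substatus, "Sample 64/1024");
  /* substatus is now "Rendered 3/8 Tiles, Sample 64/1024". */
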
+
void Session::update_status_time(bool show_pause, bool show_done)
{
- int progressive_sample = tile_manager.state.sample;
- int num_samples = tile_manager.get_num_effective_samples();
+ string status, substatus;
- int tile = progress.get_rendered_tiles();
- int num_tiles = tile_manager.state.num_tiles;
+ const int current_tile = progress.get_rendered_tiles();
+ const int num_tiles = tile_manager_.get_num_tiles();
- /* update status */
- string status, substatus;
+ const int current_sample = progress.get_current_sample();
+ const int num_samples = render_scheduler_.get_num_samples();
- if (!params.progressive) {
- const bool is_cpu = params.device.type == DEVICE_CPU;
- const bool rendering_finished = (tile == num_tiles);
- const bool is_last_tile = (tile + 1) == num_tiles;
-
- substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles);
-
- if (!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) {
- /* Some devices automatically support showing the sample number:
- * - CUDADevice
- * - OpenCLDevice when using the megakernel (the split kernel renders multiple
- * samples at the same time, so the current sample isn't really defined)
- * - CPUDevice when using one thread
- * For these devices, the current sample is always shown.
- *
- * The other option is when the last tile is currently being rendered by the CPU.
- */
- substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
- }
- if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) {
- substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
- }
- else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) {
- substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles());
- }
+  /* Tile. */
+ if (tile_manager_.has_multiple_tiles()) {
+ substatus = status_append(substatus,
+ string_printf("Rendered %d/%d Tiles", current_tile, num_tiles));
}
- else if (tile_manager.num_samples == Integrator::MAX_SAMPLES)
- substatus = string_printf("Path Tracing Sample %d", progressive_sample + 1);
- else
- substatus = string_printf("Path Tracing Sample %d/%d", progressive_sample + 1, num_samples);
+
+ /* Sample. */
+ if (num_samples == Integrator::MAX_SAMPLES) {
+ substatus = status_append(substatus, string_printf("Sample %d", current_sample));
+ }
+ else {
+ substatus = status_append(substatus,
+ string_printf("Sample %d/%d", current_sample, num_samples));
+ }
+
+ /* TODO(sergey): Denoising status from the path trace. */
if (show_pause) {
status = "Rendering Paused";
@@ -1080,210 +597,122 @@ void Session::update_status_time(bool show_pause, bool show_done)
progress.set_status(status, substatus);
}
-bool Session::render_need_denoise(bool &delayed)
+void Session::device_free()
{
- delayed = false;
-
- /* Not supported yet for baking. */
- if (read_bake_tile_cb) {
- return false;
- }
-
- /* Denoising enabled? */
- if (!params.denoising.need_denoising_task()) {
- return false;
- }
-
- if (params.background) {
- /* Background render, only denoise when rendering the last sample. */
- return tile_manager.done();
- }
-
- /* Viewport render. */
-
- /* It can happen that denoising was already enabled, but the scene still needs an update. */
- if (scene->film->is_modified() || !scene->film->get_denoising_data_offset()) {
- return false;
- }
+ scene->device_free();
+ path_trace_->device_free();
+}
- /* Immediately denoise when we reach the start sample or last sample. */
- const int num_samples_finished = tile_manager.state.sample + 1;
- if (num_samples_finished == params.denoising.start_sample ||
- num_samples_finished == params.samples) {
- return true;
+void Session::collect_statistics(RenderStats *render_stats)
+{
+ scene->collect_statistics(render_stats);
+ if (params.use_profiling && (params.device.type == DEVICE_CPU)) {
+ render_stats->collect_profiling(scene, profiler);
}
+}
- /* Do not denoise until the sample at which denoising should start is reached. */
- if (num_samples_finished < params.denoising.start_sample) {
- return false;
- }
+/* --------------------------------------------------------------------
+ * Tile and tile pixels access.
+ */
- /* Avoid excessive denoising in viewport after reaching a certain amount of samples. */
- delayed = (tile_manager.state.sample >= 20 &&
- (time_dt() - last_display_time_) < params.progressive_update_timeout);
- return !delayed;
+bool Session::has_multiple_render_tiles() const
+{
+ return tile_manager_.has_multiple_tiles();
}
-void Session::render(bool need_denoise)
+int2 Session::get_render_tile_size() const
{
- if (buffers && tile_manager.state.sample == tile_manager.range_start_sample) {
- /* Clear buffers. */
- buffers->zero();
- }
-
- if (tile_manager.state.buffer.width == 0 || tile_manager.state.buffer.height == 0) {
- return; /* Avoid empty launches. */
- }
+ return path_trace_->get_render_tile_size();
+}
- /* Add path trace task. */
- DeviceTask task(DeviceTask::RENDER);
-
- task.acquire_tile = function_bind(&Session::acquire_tile, this, _2, _1, _3);
- task.release_tile = function_bind(&Session::release_tile, this, _1, need_denoise);
- task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
- task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
- task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
- task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
- task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
- task.get_tile_stolen = function_bind(&Session::get_tile_stolen, this);
- task.need_finish_queue = params.progressive_refine;
- task.integrator_branched = scene->integrator->get_method() == Integrator::BRANCHED_PATH;
-
- task.adaptive_sampling.use = (scene->integrator->get_sampling_pattern() ==
- SAMPLING_PATTERN_PMJ) &&
- scene->dscene.data.film.pass_adaptive_aux_buffer;
- task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
- task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step;
-
- /* Acquire render tiles by default. */
- task.tile_types = RenderTile::PATH_TRACE;
-
- if (need_denoise) {
- task.denoising = params.denoising;
-
- task.pass_stride = scene->film->get_pass_stride();
- task.target_pass_stride = task.pass_stride;
- task.pass_denoising_data = scene->film->get_denoising_data_offset();
- task.pass_denoising_clean = scene->film->get_denoising_clean_offset();
-
- task.denoising_from_render = true;
-
- if (tile_manager.schedule_denoising) {
- /* Acquire denoising tiles during rendering. */
- task.tile_types |= RenderTile::DENOISE;
- }
- else {
- assert(buffers);
-
- /* Schedule rendering and wait for it to finish. */
- device->task_add(task);
- device->task_wait();
-
- /* Then run denoising on the whole image at once. */
- task.type = DeviceTask::DENOISE_BUFFER;
- task.x = tile_manager.state.buffer.full_x;
- task.y = tile_manager.state.buffer.full_y;
- task.w = tile_manager.state.buffer.width;
- task.h = tile_manager.state.buffer.height;
- task.buffer = buffers->buffer.device_pointer;
- task.sample = tile_manager.state.sample;
- task.num_samples = tile_manager.state.num_samples;
- tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
- task.buffers = buffers;
- }
- }
+int2 Session::get_render_tile_offset() const
+{
+ return path_trace_->get_render_tile_offset();
+}
- device->task_add(task);
+string_view Session::get_render_tile_layer() const
+{
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
+ return buffer_params.layer;
}
-void Session::copy_to_display_buffer(int sample)
+string_view Session::get_render_tile_view() const
{
- /* add film conversion task */
- DeviceTask task(DeviceTask::FILM_CONVERT);
-
- task.x = tile_manager.state.buffer.full_x;
- task.y = tile_manager.state.buffer.full_y;
- task.w = tile_manager.state.buffer.width;
- task.h = tile_manager.state.buffer.height;
- task.rgba_byte = display->rgba_byte.device_pointer;
- task.rgba_half = display->rgba_half.device_pointer;
- task.buffer = buffers->buffer.device_pointer;
- task.sample = sample;
- tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
-
- if (task.w > 0 && task.h > 0) {
- device->task_add(task);
- device->task_wait();
-
- /* set display to new size */
- display->draw_set(task.w, task.h);
-
- last_display_time_ = time_dt();
- }
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
+ return buffer_params.view;
+}
- display_outdated_ = false;
+bool Session::copy_render_tile_from_device()
+{
+ return path_trace_->copy_render_tile_from_device();
}
-bool Session::update_progressive_refine(bool cancel)
+bool Session::get_render_tile_pixels(const string &pass_name, int num_components, float *pixels)
{
- int sample = tile_manager.state.sample + 1;
- bool write = sample == tile_manager.num_samples || cancel;
+  /* NOTE: The code relies on the fact that the session is fully updated and no scene/buffer
+   * modification is happening while this function runs. */
- double current_time = time_dt();
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
- if (current_time - last_update_time_ < params.progressive_update_timeout) {
- /* If last sample was processed, we need to write buffers anyway. */
- if (!write && sample != 1)
- return false;
+ const BufferPass *pass = buffer_params.find_pass(pass_name);
+ if (pass == nullptr) {
+ return false;
}
- if (params.progressive_refine) {
- foreach (Tile &tile, tile_manager.state.tiles) {
- if (!tile.buffers) {
- continue;
- }
-
- RenderTile rtile;
- rtile.x = tile_manager.state.buffer.full_x + tile.x;
- rtile.y = tile_manager.state.buffer.full_y + tile.y;
- rtile.w = tile.w;
- rtile.h = tile.h;
- rtile.sample = sample;
- rtile.buffers = tile.buffers;
-
- if (write) {
- if (write_render_tile_cb)
- write_render_tile_cb(rtile);
- }
- else {
- if (update_render_tile_cb)
- update_render_tile_cb(rtile, true);
- }
+ const bool has_denoised_result = path_trace_->has_denoised_result();
+ if (pass->mode == PassMode::DENOISED && !has_denoised_result) {
+ pass = buffer_params.find_pass(pass->type);
+ if (pass == nullptr) {
+ /* Happens when denoised result pass is requested but is never written by the kernel. */
+ return false;
}
}
- last_update_time_ = current_time;
+ pass = buffer_params.get_actual_display_pass(pass);
+
+ const float exposure = buffer_params.exposure;
+ const int num_samples = path_trace_->get_num_render_tile_samples();
- return write;
+ PassAccessor::PassAccessInfo pass_access_info(*pass);
+ pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background;
+
+ const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+ const PassAccessor::Destination destination(pixels, num_components);
+
+ return path_trace_->get_render_tile_pixels(pass_accessor, destination);
}
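
A hedged sketch of how an integration might read back the currently rendering tile through this API; the pass name and component count are illustrative, and `session` is assumed to be a valid Session pointer:

  const int2 tile_size = session->get_render_tile_size();
  vector<float> pixels(size_t(tile_size.x) * tile_size.y * 4);

  if (session->copy_render_tile_from_device() &&
      session->get_render_tile_pixels("Combined", 4, pixels.data())) {
    /* pixels now holds the requested pass, scaled by exposure and sample count. */
  }
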
-void Session::device_free()
+bool Session::set_render_tile_pixels(const string &pass_name,
+ int num_components,
+ const float *pixels)
{
- scene->device_free();
+  /* NOTE: The code relies on the fact that the session is fully updated and no scene/buffer
+   * modification is happening while this function runs. */
+
+ const BufferPass *pass = buffer_params_.find_pass(pass_name);
+ if (!pass) {
+ return false;
+ }
+
+ const float exposure = scene->film->get_exposure();
+ const int num_samples = render_scheduler_.get_num_rendered_samples();
- tile_manager.device_free();
+ const PassAccessor::PassAccessInfo pass_access_info(*pass);
+ PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+ PassAccessor::Source source(pixels, num_components);
- /* used from background render only, so no need to
- * re-create render/display buffers here
- */
+ return path_trace_->set_render_tile_pixels(pass_accessor, source);
}
-void Session::collect_statistics(RenderStats *render_stats)
+/* --------------------------------------------------------------------
+ * Full-frame on-disk storage.
+ */
+
+void Session::process_full_buffer_from_disk(string_view filename)
{
- scene->collect_statistics(render_stats);
- if (params.use_profiling && (params.device.type == DEVICE_CPU)) {
- render_stats->collect_profiling(scene, profiler);
- }
+ path_trace_->process_full_buffer_from_disk(filename);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 05025c10f9c..492cfdd1c09 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -18,6 +18,7 @@
#define __SESSION_H__
#include "device/device.h"
+#include "integrator/render_scheduler.h"
#include "render/buffers.h"
#include "render/shader.h"
#include "render/stats.h"
@@ -26,6 +27,7 @@
#include "util/util_progress.h"
#include "util/util_stats.h"
#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -33,41 +35,35 @@ CCL_NAMESPACE_BEGIN
class BufferParams;
class Device;
class DeviceScene;
-class DeviceRequestedFeatures;
-class DisplayBuffer;
+class PathTrace;
class Progress;
+class GPUDisplay;
class RenderBuffers;
class Scene;
+class SceneParams;
/* Session Parameters */
class SessionParams {
public:
DeviceInfo device;
+
+ bool headless;
bool background;
- bool progressive_refine;
- bool progressive;
bool experimental;
int samples;
- int2 tile_size;
- TileOrder tile_order;
- int start_resolution;
- int denoising_start_sample;
int pixel_size;
int threads;
- bool adaptive_sampling;
-
- bool use_profiling;
- bool display_buffer_linear;
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit;
- DenoiseParams denoising;
+ bool use_profiling;
- double cancel_timeout;
- double reset_timeout;
- double text_timeout;
- double progressive_update_timeout;
+ bool use_auto_tile;
+ int tile_size;
ShadingSystem shadingsystem;
@@ -75,50 +71,32 @@ class SessionParams {
SessionParams()
{
+ headless = false;
background = false;
- progressive_refine = false;
- progressive = false;
experimental = false;
samples = 1024;
- tile_size = make_int2(64, 64);
- start_resolution = INT_MAX;
- denoising_start_sample = 0;
pixel_size = 1;
threads = 0;
- adaptive_sampling = false;
+ time_limit = 0.0;
use_profiling = false;
- display_buffer_linear = false;
-
- cancel_timeout = 0.1;
- reset_timeout = 0.1;
- text_timeout = 1.0;
- progressive_update_timeout = 1.0;
+ use_auto_tile = true;
+ tile_size = 2048;
shadingsystem = SHADINGSYSTEM_SVM;
- tile_order = TILE_CENTER;
}
- bool modified(const SessionParams &params)
+ bool modified(const SessionParams &params) const
{
/* Modified means we have to recreate the session, any parameter changes
* that can be handled by an existing Session are omitted. */
- return !(device == params.device && background == params.background &&
- progressive_refine == params.progressive_refine &&
- progressive == params.progressive && experimental == params.experimental &&
- tile_size == params.tile_size && start_resolution == params.start_resolution &&
+ return !(device == params.device && headless == params.headless &&
+ background == params.background && experimental == params.experimental &&
pixel_size == params.pixel_size && threads == params.threads &&
- adaptive_sampling == params.adaptive_sampling &&
- use_profiling == params.use_profiling &&
- display_buffer_linear == params.display_buffer_linear &&
- cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
- text_timeout == params.text_timeout &&
- progressive_update_timeout == params.progressive_update_timeout &&
- tile_order == params.tile_order && shadingsystem == params.shadingsystem &&
- denoising.type == params.denoising.type &&
- (denoising.use == params.denoising.use || (device.denoisers & denoising.type)));
+ use_profiling == params.use_profiling && shadingsystem == params.shadingsystem &&
+ use_auto_tile == params.use_auto_tile && tile_size == params.tile_size);
}
};
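
Since modified() flags only the parameters that cannot be changed on a live session, a typical integration-side use is to re-create the session when it returns true. A hedged sketch (the helper name is illustrative):

  void ensure_session(Session *&session,
                      const SessionParams &new_params,
                      const SceneParams &scene_params)
  {
    if (session && !session->params.modified(new_params)) {
      return; /* The existing session can absorb the change. */
    }
    delete session;
    session = new Session(new_params, scene_params);
  }
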
@@ -131,34 +109,41 @@ class Session {
public:
Device *device;
Scene *scene;
- RenderBuffers *buffers;
- DisplayBuffer *display;
Progress progress;
SessionParams params;
- TileManager tile_manager;
Stats stats;
Profiler profiler;
- function<void(RenderTile &)> write_render_tile_cb;
- function<void(RenderTile &, bool)> update_render_tile_cb;
- function<void(RenderTile &)> read_bake_tile_cb;
+ function<void(void)> write_render_tile_cb;
+ function<void(void)> update_render_tile_cb;
+ function<void(void)> read_render_tile_cb;
+
+  /* Callback invoked by the tile manager whenever the on-disk tiles storage file is closed after
+   * writing. Allows an engine integration to keep track of those files without worrying about
+   * transferring the information when it needs to re-create the session during rendering. */
+ function<void(string_view)> full_buffer_written_cb;
- explicit Session(const SessionParams &params);
+ explicit Session(const SessionParams &params, const SceneParams &scene_params);
~Session();
void start();
- void cancel();
- bool draw(BufferParams &params, DeviceDrawParams &draw_params);
+
+  /* When quick cancel is requested, path tracing is cancelled as soon as possible, without
+   * waiting for the buffer to be uniformly sampled. */
+ void cancel(bool quick = false);
+
+ void draw();
void wait();
bool ready_to_reset();
- void reset(BufferParams &params, int samples);
+ void reset(const SessionParams &session_params, const BufferParams &buffer_params);
+
void set_pause(bool pause);
+
void set_samples(int samples);
- void set_denoising(const DenoiseParams &denoising);
- void set_denoising_start_sample(int sample);
+ void set_time_limit(double time_limit);
- bool update_scene();
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
void device_free();
@@ -168,83 +153,95 @@ class Session {
void collect_statistics(RenderStats *stats);
- protected:
- struct DelayedReset {
- thread_mutex mutex;
- bool do_reset;
- BufferParams params;
- int samples;
- } delayed_reset_;
+ /* --------------------------------------------------------------------
+   * Tile and tile pixels access.
+ */
- void run();
+ bool has_multiple_render_tiles() const;
- bool run_update_for_next_iteration();
- bool run_wait_for_work(bool no_tiles);
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
- void update_status_time(bool show_pause = false, bool show_done = false);
+ string_view get_render_tile_layer() const;
+ string_view get_render_tile_view() const;
- void render(bool use_denoise);
- void copy_to_display_buffer(int sample);
+ bool copy_render_tile_from_device();
- void reset_(BufferParams &params, int samples);
+ bool get_render_tile_pixels(const string &pass_name, int num_components, float *pixels);
+ bool set_render_tile_pixels(const string &pass_name, int num_components, const float *pixels);
- void run_cpu();
- bool draw_cpu(BufferParams &params, DeviceDrawParams &draw_params);
- void reset_cpu(BufferParams &params, int samples);
+ /* --------------------------------------------------------------------
+ * Full-frame on-disk storage.
+ */
- void run_gpu();
- bool draw_gpu(BufferParams &params, DeviceDrawParams &draw_params);
- void reset_gpu(BufferParams &params, int samples);
+  /* Read the given full-frame file from disk, perform the needed processing, and write it to
+   * the software via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
- bool render_need_denoise(bool &delayed);
+ protected:
+ struct DelayedReset {
+ thread_mutex mutex;
+ bool do_reset;
+ SessionParams session_params;
+ BufferParams buffer_params;
+ } delayed_reset_;
- bool steal_tile(RenderTile &tile, Device *tile_device, thread_scoped_lock &tile_lock);
- bool get_tile_stolen();
- bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types);
- void update_tile_sample(RenderTile &tile);
- void release_tile(RenderTile &tile, const bool need_denoise);
+ void run();
- void map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
- void unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
+  /* Update for the new iteration of the main loop in `run_main_render_loop()`.
+ *
+ * Will take care of the following things:
+ * - Delayed reset
+ * - Scene update
+ * - Tile manager advance
+ * - Render scheduler work request
+ *
+ * The updates are done in a proper order with proper locking around them, which guarantees
+   * that the device-side scene and render buffers are always in a consistent state.
+ *
+ * Returns render work which is to be rendered next. */
+ RenderWork run_update_for_next_iteration();
+
+  /* Wait for rendering to be unpaused, or for new tiles to arrive for rendering.
+ * Returns true if new main render loop iteration is required after this function call.
+ *
+ * The `render_work` is the work which was scheduled by the render scheduler right before
+ * checking the pause. */
+ bool run_wait_for_work(const RenderWork &render_work);
+
+ void run_main_render_loop();
+
+ bool update_scene(int width, int height);
- bool device_use_gl_;
+ void update_status_time(bool show_pause = false, bool show_done = false);
- thread *session_thread_;
+ void do_delayed_reset();
- volatile bool display_outdated_;
+ int2 get_effective_tile_size() const;
- volatile bool gpu_draw_ready_;
- volatile bool gpu_need_display_buffer_update_;
- thread_condition_variable gpu_need_display_buffer_update_cond_;
+ thread *session_thread_;
- bool pause_;
- bool cancel_;
- bool new_work_added_;
+ bool pause_ = false;
+ bool cancel_ = false;
+ bool new_work_added_ = false;
thread_condition_variable pause_cond_;
thread_mutex pause_mutex_;
thread_mutex tile_mutex_;
thread_mutex buffers_mutex_;
- thread_mutex display_mutex_;
- thread_condition_variable denoising_cond_;
- thread_condition_variable tile_steal_cond_;
-
- double reset_time_;
- double last_update_time_;
- double last_display_time_;
-
- RenderTile stolen_tile_;
- typedef enum {
- NOT_STEALING, /* There currently is no tile stealing in progress. */
- WAITING_FOR_TILE, /* A device is waiting for another device to release a tile. */
- RELEASING_TILE, /* A device has releasing a stealable tile. */
- GOT_TILE /* A device has released a stealable tile, which is now stored in stolen_tile. */
- } TileStealingState;
- std::atomic<TileStealingState> tile_stealing_state_;
- int stealable_tiles_;
-
- /* progressive refine */
- bool update_progressive_refine(bool cancel);
+
+ TileManager tile_manager_;
+ BufferParams buffer_params_;
+
+ /* Render scheduler is used to get work to be rendered with the current big tile. */
+ RenderScheduler render_scheduler_;
+
+ /* Path tracer object.
+ *
+   * For interactive viewport rendering this is a single full-frame path tracer.
+   * For an offline rendering it is a path tracer for the current big tile. */
+ unique_ptr<PathTrace> path_trace_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 59b60904746..f6b23606e58 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -203,6 +203,7 @@ Shader::Shader() : Node(get_node_type())
has_surface = false;
has_surface_transparent = false;
has_surface_emission = false;
+ has_surface_raytrace = false;
has_surface_bssrdf = false;
has_volume = false;
has_displacement = false;
@@ -485,7 +486,7 @@ void ShaderManager::device_update(Device *device,
device_update_specific(device, dscene, scene, progress);
}
-void ShaderManager::device_update_common(Device *device,
+void ShaderManager::device_update_common(Device * /*device*/,
DeviceScene *dscene,
Scene *scene,
Progress & /*progress*/)
@@ -508,6 +509,8 @@ void ShaderManager::device_update_common(Device *device,
flag |= SD_HAS_EMISSION;
if (shader->has_surface_transparent && shader->get_use_transparent_shadow())
flag |= SD_HAS_TRANSPARENT_SHADOW;
+ if (shader->has_surface_raytrace)
+ flag |= SD_HAS_RAYTRACE;
if (shader->has_volume) {
flag |= SD_HAS_VOLUME;
has_volumes = true;
@@ -528,12 +531,10 @@ void ShaderManager::device_update_common(Device *device,
flag |= SD_NEED_VOLUME_ATTRIBUTES;
if (shader->has_bssrdf_bump)
flag |= SD_HAS_BSSRDF_BUMP;
- if (device->info.has_volume_decoupled) {
- if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR)
- flag |= SD_VOLUME_EQUIANGULAR;
- if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
- flag |= SD_VOLUME_MIS;
- }
+ if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR)
+ flag |= SD_VOLUME_EQUIANGULAR;
+ if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
+ flag |= SD_VOLUME_MIS;
if (shader->get_volume_interpolation_method() == VOLUME_INTERPOLATION_CUBIC)
flag |= SD_VOLUME_CUBIC;
if (shader->has_bump)
@@ -682,39 +683,35 @@ void ShaderManager::add_default(Scene *scene)
}
}
-void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
- DeviceRequestedFeatures *requested_features)
+uint ShaderManager::get_graph_kernel_features(ShaderGraph *graph)
{
+ uint kernel_features = 0;
+
foreach (ShaderNode *node, graph->nodes) {
- requested_features->max_nodes_group = max(requested_features->max_nodes_group,
- node->get_group());
- requested_features->nodes_features |= node->get_feature();
+ kernel_features |= node->get_feature();
if (node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) {
BsdfBaseNode *bsdf_node = static_cast<BsdfBaseNode *>(node);
if (CLOSURE_IS_VOLUME(bsdf_node->get_closure_type())) {
- requested_features->nodes_features |= NODE_FEATURE_VOLUME;
+ kernel_features |= KERNEL_FEATURE_NODE_VOLUME;
}
else if (CLOSURE_IS_PRINCIPLED(bsdf_node->get_closure_type())) {
- requested_features->use_principled = true;
+ kernel_features |= KERNEL_FEATURE_PRINCIPLED;
}
}
if (node->has_surface_bssrdf()) {
- requested_features->use_subsurface = true;
+ kernel_features |= KERNEL_FEATURE_SUBSURFACE;
}
if (node->has_surface_transparent()) {
- requested_features->use_transparent = true;
- }
- if (node->has_raytrace()) {
- requested_features->use_shader_raytrace = true;
+ kernel_features |= KERNEL_FEATURE_TRANSPARENT;
}
}
+
+ return kernel_features;
}
-void ShaderManager::get_requested_features(Scene *scene,
- DeviceRequestedFeatures *requested_features)
+uint ShaderManager::get_kernel_features(Scene *scene)
{
- requested_features->max_nodes_group = NODE_GROUP_LEVEL_0;
- requested_features->nodes_features = 0;
+ uint kernel_features = KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION;
for (int i = 0; i < scene->shaders.size(); i++) {
Shader *shader = scene->shaders[i];
if (!shader->reference_count()) {
@@ -722,21 +719,22 @@ void ShaderManager::get_requested_features(Scene *scene,
}
/* Gather requested features from all the nodes from the graph nodes. */
- get_requested_graph_features(shader->graph, requested_features);
+ kernel_features |= get_graph_kernel_features(shader->graph);
ShaderNode *output_node = shader->graph->output();
if (output_node->input("Displacement")->link != NULL) {
- requested_features->nodes_features |= NODE_FEATURE_BUMP;
+ kernel_features |= KERNEL_FEATURE_NODE_BUMP;
if (shader->get_displacement_method() == DISPLACE_BOTH) {
- requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE;
- requested_features->max_nodes_group = max(requested_features->max_nodes_group,
- NODE_GROUP_LEVEL_1);
+ kernel_features |= KERNEL_FEATURE_NODE_BUMP_STATE;
}
}
/* On top of volume nodes, also check if we need volume sampling because
- * e.g. an Emission node would slip through the NODE_FEATURE_VOLUME check */
- if (shader->has_volume)
- requested_features->use_volume |= true;
+ * e.g. an Emission node would slip through the KERNEL_FEATURE_NODE_VOLUME check */
+ if (shader->has_volume) {
+ kernel_features |= KERNEL_FEATURE_VOLUME;
+ }
}
+
+ return kernel_features;
}
void ShaderManager::free_memory()
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index c65cac351a4..5f9adea3949 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
class Device;
class DeviceScene;
-class DeviceRequestedFeatures;
class Mesh;
class Progress;
class Scene;
@@ -117,6 +116,7 @@ class Shader : public Node {
bool has_surface;
bool has_surface_emission;
bool has_surface_transparent;
+ bool has_surface_raytrace;
bool has_volume;
bool has_displacement;
bool has_surface_bssrdf;
@@ -216,7 +216,7 @@ class ShaderManager {
static void add_default(Scene *scene);
/* Selective nodes compilation. */
- void get_requested_features(Scene *scene, DeviceRequestedFeatures *requested_features);
+ uint get_kernel_features(Scene *scene);
static void free_memory();
@@ -244,8 +244,7 @@ class ShaderManager {
size_t beckmann_table_offset;
- void get_requested_graph_features(ShaderGraph *graph,
- DeviceRequestedFeatures *requested_features);
+ uint get_graph_kernel_features(ShaderGraph *graph);
thread_spin_lock attribute_lock_;
diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp
index 2c6273842e2..73eb7e21ff9 100644
--- a/intern/cycles/render/stats.cpp
+++ b/intern/cycles/render/stats.cpp
@@ -264,53 +264,34 @@ void RenderStats::collect_profiling(Scene *scene, Profiler &prof)
has_profiling = true;
kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN));
-
kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP));
- kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT));
-
- NamedNestedSampleStats &integrator = kernel.add_entry("Path integration",
- prof.get_event(PROFILING_PATH_INTEGRATE));
- integrator.add_entry("Scene intersection", prof.get_event(PROFILING_SCENE_INTERSECT));
- integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION));
- integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME));
-
- NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0);
- shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP));
- shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL));
- shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY));
- shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO));
- shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE));
-
- integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT));
- integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE));
-
- NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0);
- intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT));
- intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL));
- intersection.add_entry("Shadow All Intersection",
- prof.get_event(PROFILING_INTERSECT_SHADOW_ALL));
- intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME));
- intersection.add_entry("Volume All Intersection",
- prof.get_event(PROFILING_INTERSECT_VOLUME_ALL));
-
- NamedNestedSampleStats &closure = kernel.add_entry("Closures", 0);
- closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL));
- closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE));
- closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL));
- closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE));
-
- NamedNestedSampleStats &denoising = kernel.add_entry("Denoising",
- prof.get_event(PROFILING_DENOISING));
- denoising.add_entry("Construct Transform",
- prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM));
- denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT));
-
- NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0);
- prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW));
- prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS));
- prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE));
- prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS));
- prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES));
+ kernel.add_entry("Intersect Closest", prof.get_event(PROFILING_INTERSECT_CLOSEST));
+ kernel.add_entry("Intersect Shadow", prof.get_event(PROFILING_INTERSECT_SHADOW));
+ kernel.add_entry("Intersect Subsurface", prof.get_event(PROFILING_INTERSECT_SUBSURFACE));
+ kernel.add_entry("Intersect Volume Stack", prof.get_event(PROFILING_INTERSECT_VOLUME_STACK));
+
+ NamedNestedSampleStats &surface = kernel.add_entry("Shade Surface", 0);
+ surface.add_entry("Setup", prof.get_event(PROFILING_SHADE_SURFACE_SETUP));
+ surface.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_SURFACE_EVAL));
+ surface.add_entry("Render Passes", prof.get_event(PROFILING_SHADE_SURFACE_PASSES));
+ surface.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_SURFACE_DIRECT_LIGHT));
+ surface.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT));
+ surface.add_entry("Ambient Occlusion", prof.get_event(PROFILING_SHADE_SURFACE_AO));
+
+ NamedNestedSampleStats &volume = kernel.add_entry("Shade Volume", 0);
+ volume.add_entry("Setup", prof.get_event(PROFILING_SHADE_VOLUME_SETUP));
+ volume.add_entry("Integrate", prof.get_event(PROFILING_SHADE_VOLUME_INTEGRATE));
+ volume.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_VOLUME_DIRECT_LIGHT));
+ volume.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_VOLUME_INDIRECT_LIGHT));
+
+ NamedNestedSampleStats &shadow = kernel.add_entry("Shade Shadow", 0);
+ shadow.add_entry("Setup", prof.get_event(PROFILING_SHADE_SHADOW_SETUP));
+ shadow.add_entry("Surface", prof.get_event(PROFILING_SHADE_SHADOW_SURFACE));
+ shadow.add_entry("Volume", prof.get_event(PROFILING_SHADE_SHADOW_VOLUME));
+
+ NamedNestedSampleStats &light = kernel.add_entry("Shade Light", 0);
+ light.add_entry("Setup", prof.get_event(PROFILING_SHADE_LIGHT_SETUP));
+ light.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_LIGHT_EVAL));
shaders.entries.clear();
foreach (Shader *shader, scene->shaders) {
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index dcb3976e15c..2379eb775a0 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -446,6 +446,8 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet &done)
if (current_type == SHADER_TYPE_SURFACE) {
if (node->has_spatial_varying())
current_shader->has_surface_spatial_varying = true;
+ if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE)
+ current_shader->has_surface_raytrace = true;
}
else if (current_type == SHADER_TYPE_VOLUME) {
if (node->has_spatial_varying())
@@ -492,6 +494,13 @@ void SVMCompiler::generate_svm_nodes(const ShaderNodeSet &nodes, CompilerState *
void SVMCompiler::generate_closure_node(ShaderNode *node, CompilerState *state)
{
+  /* Skip generating closures that are not supported or needed for a particular
+   * type of shader, for example a BSDF in a volume shader. */
+ const int node_feature = node->get_feature();
+ if ((state->node_feature_mask & node_feature) != node_feature) {
+ return;
+ }
+
/* execute dependencies for closure */
foreach (ShaderInput *in, node->inputs) {
if (in->link != NULL) {
@@ -555,7 +564,7 @@ void SVMCompiler::find_aov_nodes_and_dependencies(ShaderNodeSet &aov_nodes,
foreach (ShaderNode *node, graph->nodes) {
if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) {
OutputAOVNode *aov_node = static_cast<OutputAOVNode *>(node);
- if (aov_node->slot >= 0) {
+ if (aov_node->offset >= 0) {
aov_nodes.insert(aov_node);
foreach (ShaderInput *in, node->inputs) {
if (in->link != NULL) {
@@ -785,17 +794,21 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
case SHADER_TYPE_SURFACE: /* generate surface shader */
generate = true;
shader->has_surface = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE;
break;
case SHADER_TYPE_VOLUME: /* generate volume shader */
generate = true;
shader->has_volume = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_VOLUME;
break;
case SHADER_TYPE_DISPLACEMENT: /* generate displacement shader */
generate = true;
shader->has_displacement = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_DISPLACEMENT;
break;
case SHADER_TYPE_BUMP: /* generate bump shader */
generate = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_BUMP;
break;
default:
break;
@@ -867,6 +880,7 @@ void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Sum
shader->has_surface = false;
shader->has_surface_emission = false;
shader->has_surface_transparent = false;
+ shader->has_surface_raytrace = false;
shader->has_surface_bssrdf = false;
shader->has_bump = has_bump;
shader->has_bssrdf_bump = has_bump;
@@ -964,6 +978,7 @@ SVMCompiler::CompilerState::CompilerState(ShaderGraph *graph)
max_id = max(node->id, max_id);
}
nodes_done_flag.resize(max_id + 1, false);
+ node_feature_mask = 0;
}
CCL_NAMESPACE_END
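
Editor's note: the node_feature_mask gating added above can be illustrated with a small standalone sketch. The flag and mask names below mirror the KERNEL_FEATURE_NODE_* identifiers used in the diff, but the bit values are invented for illustration; the real definitions live in the Cycles kernel headers.

#include <cstdint>
#include <cstdio>

/* Illustrative feature bits; actual values are defined in the Cycles kernel headers. */
enum : uint32_t {
  NODE_FEATURE_BSDF = 1u << 0,
  NODE_FEATURE_RAYTRACE = 1u << 1,
  NODE_FEATURE_VOLUME = 1u << 2,
};

/* Hypothetical per-shader-type masks, in the spirit of KERNEL_FEATURE_NODE_MASK_SURFACE/VOLUME. */
static const uint32_t MASK_SURFACE = NODE_FEATURE_BSDF | NODE_FEATURE_RAYTRACE;
static const uint32_t MASK_VOLUME = NODE_FEATURE_VOLUME;

/* Same test as in generate_closure_node(): every feature bit the node requires must be
 * present in the mask of the shader type currently being compiled. */
static bool node_supported(uint32_t node_feature_mask, uint32_t node_feature)
{
  return (node_feature_mask & node_feature) == node_feature;
}

int main()
{
  /* A ray-tracing BSDF node is generated for a surface shader but skipped for a volume shader. */
  printf("surface: %d\n", node_supported(MASK_SURFACE, NODE_FEATURE_RAYTRACE)); /* 1 */
  printf("volume:  %d\n", node_supported(MASK_VOLUME, NODE_FEATURE_RAYTRACE));  /* 0 */
  return 0;
}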
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index d23ff3e2a47..0353c393ae4 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -192,6 +192,9 @@ class SVMCompiler {
* all areas to use this flags array.
*/
vector<bool> nodes_done_flag;
+
+ /* Node features that can be compiled. */
+ uint node_feature_mask;
};
void stack_clear_temporary(ShaderNode *node);
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 375c9fd8e09..eed75cc2372 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -16,601 +16,559 @@
#include "render/tile.h"
+#include <atomic>
+
+#include "graph/node.h"
+#include "render/background.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/scene.h"
#include "util/util_algorithm.h"
#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
#include "util/util_types.h"
CCL_NAMESPACE_BEGIN
-namespace {
+/* --------------------------------------------------------------------
+ * Internal functions.
+ */
-class TileComparator {
- public:
- TileComparator(TileOrder order_, int2 center_, Tile *tiles_)
- : order(order_), center(center_), tiles(tiles_)
- {
- }
+static const char *ATTR_PASSES_COUNT = "cycles.passes.count";
+static const char *ATTR_PASS_SOCKET_PREFIX_FORMAT = "cycles.passes.%d.";
+static const char *ATTR_BUFFER_SOCKET_PREFIX = "cycles.buffer.";
+static const char *ATTR_DENOISE_SOCKET_PREFIX = "cycles.denoise.";
- bool operator()(int a, int b)
- {
- switch (order) {
- case TILE_CENTER: {
- float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w / 2),
- center.y - (tiles[a].y + tiles[a].h / 2));
- float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w / 2),
- center.y - (tiles[b].y + tiles[b].h / 2));
- return dot(dist_a, dist_a) < dot(dist_b, dist_b);
- }
- case TILE_LEFT_TO_RIGHT:
- return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x < tiles[b].x);
- case TILE_RIGHT_TO_LEFT:
- return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x > tiles[b].x);
- case TILE_TOP_TO_BOTTOM:
- return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y > tiles[b].y);
- case TILE_BOTTOM_TO_TOP:
- default:
- return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y < tiles[b].y);
+/* Global counter of TileManager object instances. */
+static std::atomic<uint64_t> g_instance_index = 0;
+
+/* Construct names of EXR channels which ensure that the order of all channels matches the exact
+ * offsets in render buffers corresponding to the given passes.
+ *
+ * Returns `std` data types so that the result can be assigned directly to OIIO's `ImageSpec`. */
+static std::vector<std::string> exr_channel_names_for_passes(const BufferParams &buffer_params)
+{
+ static const char *component_suffixes[] = {"R", "G", "B", "A"};
+
+ int pass_index = 0;
+ int num_channels = 0;
+ std::vector<std::string> channel_names;
+ for (const BufferPass &pass : buffer_params.passes) {
+ if (pass.offset == PASS_UNUSED) {
+ continue;
}
- }
- protected:
- TileOrder order;
- int2 center;
- Tile *tiles;
-};
+ const PassInfo pass_info = pass.get_info();
+ num_channels += pass_info.num_components;
-inline int2 hilbert_index_to_pos(int n, int d)
-{
- int2 r, xy = make_int2(0, 0);
- for (int s = 1; s < n; s *= 2) {
- r.x = (d >> 1) & 1;
- r.y = (d ^ r.x) & 1;
- if (!r.y) {
- if (r.x) {
- xy = make_int2(s - 1, s - 1) - xy;
- }
- swap(xy.x, xy.y);
+    /* EXR canonically expects the first part of channel names to be sorted alphabetically, which is
+     * not guaranteed to be the case with pass names. Assign a prefix based on the pass index
+     * with a fixed width to ensure ordering. This makes it possible to dump existing render
+     * buffer memory to disk and read it back without doing extra mapping. */
+ const string prefix = string_printf("%08d", pass_index);
+
+ const string channel_name_prefix = prefix + string(pass.name) + ".";
+
+ for (int i = 0; i < pass_info.num_components; ++i) {
+ channel_names.push_back(channel_name_prefix + component_suffixes[i]);
}
- xy += r * make_int2(s, s);
- d >>= 2;
+
+ ++pass_index;
}
- return xy;
+
+ return channel_names;
}
-enum SpiralDirection {
- DIRECTION_UP,
- DIRECTION_LEFT,
- DIRECTION_DOWN,
- DIRECTION_RIGHT,
-};
-
-} /* namespace */
-
-TileManager::TileManager(bool progressive_,
- int num_samples_,
- int2 tile_size_,
- int start_resolution_,
- bool preserve_tile_device_,
- bool background_,
- TileOrder tile_order_,
- int num_devices_,
- int pixel_size_)
+inline string node_socket_attribute_name(const SocketType &socket, const string &attr_name_prefix)
{
- progressive = progressive_;
- tile_size = tile_size_;
- tile_order = tile_order_;
- start_resolution = start_resolution_;
- pixel_size = pixel_size_;
- slice_overlap = 0;
- num_samples = num_samples_;
- num_devices = num_devices_;
- preserve_tile_device = preserve_tile_device_;
- background = background_;
- schedule_denoising = false;
-
- range_start_sample = 0;
- range_num_samples = -1;
-
- BufferParams buffer_params;
- reset(buffer_params, 0);
+ return attr_name_prefix + string(socket.name);
}
-TileManager::~TileManager()
+template<typename ValidateValueFunc, typename GetValueFunc>
+static bool node_socket_generic_to_image_spec_atttributes(
+ ImageSpec *image_spec,
+ const Node *node,
+ const SocketType &socket,
+ const string &attr_name_prefix,
+ const ValidateValueFunc &validate_value_func,
+ const GetValueFunc &get_value_func)
{
+ if (!validate_value_func(node, socket)) {
+ return false;
+ }
+
+ image_spec->attribute(node_socket_attribute_name(socket, attr_name_prefix),
+ get_value_func(node, socket));
+
+ return true;
}
-void TileManager::device_free()
+static bool node_socket_to_image_spec_atttributes(ImageSpec *image_spec,
+ const Node *node,
+ const SocketType &socket,
+ const string &attr_name_prefix)
{
- if (schedule_denoising || progressive) {
- for (int i = 0; i < state.tiles.size(); i++) {
- delete state.tiles[i].buffers;
- state.tiles[i].buffers = NULL;
+ const string attr_name = node_socket_attribute_name(socket, attr_name_prefix);
+
+ switch (socket.type) {
+ case SocketType::ENUM: {
+ const ustring value = node->get_string(socket);
+
+ /* Validate that the node is consistent with the node type definition. */
+ const NodeEnum &enum_values = *socket.enum_values;
+ if (!enum_values.exists(value)) {
+ LOG(DFATAL) << "Node enum contains invalid value " << value;
+ return false;
+ }
+
+ image_spec->attribute(attr_name, value);
+
+ return true;
}
- }
- state.tiles.clear();
+ case SocketType::STRING:
+ image_spec->attribute(attr_name, node->get_string(socket));
+ return true;
+
+ case SocketType::INT:
+ image_spec->attribute(attr_name, node->get_int(socket));
+ return true;
+
+ case SocketType::FLOAT:
+ image_spec->attribute(attr_name, node->get_float(socket));
+ return true;
+
+ case SocketType::BOOLEAN:
+ image_spec->attribute(attr_name, node->get_bool(socket));
+ return true;
+
+ default:
+ LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen.";
+ return false;
+ }
}
-static int get_divider(int w, int h, int start_resolution)
+static bool node_socket_from_image_spec_atttributes(Node *node,
+ const SocketType &socket,
+ const ImageSpec &image_spec,
+ const string &attr_name_prefix)
{
- int divider = 1;
- if (start_resolution != INT_MAX) {
- while (w * h > start_resolution * start_resolution) {
- w = max(1, w / 2);
- h = max(1, h / 2);
+ const string attr_name = node_socket_attribute_name(socket, attr_name_prefix);
+
+ switch (socket.type) {
+ case SocketType::ENUM: {
+ /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */
+ const ustring value(image_spec.get_string_attribute(attr_name, ""));
+
+ /* Validate that the node is consistent with the node type definition. */
+ const NodeEnum &enum_values = *socket.enum_values;
+ if (!enum_values.exists(value)) {
+ LOG(ERROR) << "Invalid enumerator value " << value;
+ return false;
+ }
- divider <<= 1;
+ node->set(socket, enum_values[value]);
+
+ return true;
}
+
+ case SocketType::STRING:
+ /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */
+ node->set(socket, ustring(image_spec.get_string_attribute(attr_name, "")));
+ return true;
+
+ case SocketType::INT:
+ node->set(socket, image_spec.get_int_attribute(attr_name, 0));
+ return true;
+
+ case SocketType::FLOAT:
+ node->set(socket, image_spec.get_float_attribute(attr_name, 0));
+ return true;
+
+ case SocketType::BOOLEAN:
+ node->set(socket, static_cast<bool>(image_spec.get_int_attribute(attr_name, 0)));
+ return true;
+
+ default:
+ LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen.";
+ return false;
}
- return divider;
}
-void TileManager::reset(BufferParams &params_, int num_samples_)
+static bool node_to_image_spec_atttributes(ImageSpec *image_spec,
+ const Node *node,
+ const string &attr_name_prefix)
{
- params = params_;
-
- set_samples(num_samples_);
-
- state.buffer = BufferParams();
- state.sample = range_start_sample - 1;
- state.num_tiles = 0;
- state.num_samples = 0;
- state.resolution_divider = get_divider(params.width, params.height, start_resolution);
- state.render_tiles.clear();
- state.denoising_tiles.clear();
- device_free();
+ for (const SocketType &socket : node->type->inputs) {
+ if (!node_socket_to_image_spec_atttributes(image_spec, node, socket, attr_name_prefix)) {
+ return false;
+ }
+ }
+
+ return true;
}
-void TileManager::set_samples(int num_samples_)
+static bool node_from_image_spec_atttributes(Node *node,
+ const ImageSpec &image_spec,
+ const string &attr_name_prefix)
{
- num_samples = num_samples_;
+ for (const SocketType &socket : node->type->inputs) {
+ if (!node_socket_from_image_spec_atttributes(node, socket, image_spec, attr_name_prefix)) {
+ return false;
+ }
+ }
+
+ return true;
+}
- /* No real progress indication is possible when using unlimited samples. */
- if (num_samples == INT_MAX) {
- state.total_pixel_samples = 0;
+static bool buffer_params_to_image_spec_atttributes(ImageSpec *image_spec,
+ const BufferParams &buffer_params)
+{
+ if (!node_to_image_spec_atttributes(image_spec, &buffer_params, ATTR_BUFFER_SOCKET_PREFIX)) {
+ return false;
}
- else {
- uint64_t pixel_samples = 0;
- /* While rendering in the viewport, the initial preview resolution is increased to the native
- * resolution before the actual rendering begins. Therefore, additional pixel samples will be
- * rendered. */
- int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size);
- while (divider > pixel_size) {
- int image_w = max(1, params.width / divider);
- int image_h = max(1, params.height / divider);
- pixel_samples += image_w * image_h;
- divider >>= 1;
- }
- int image_w = max(1, params.width / divider);
- int image_h = max(1, params.height / divider);
- state.total_pixel_samples = pixel_samples +
- (uint64_t)get_num_effective_samples() * image_w * image_h;
- if (schedule_denoising) {
- state.total_pixel_samples += params.width * params.height;
+  /* Passes storage is not covered by the node sockets, so "expand" the loop manually. */
+
+ const int num_passes = buffer_params.passes.size();
+ image_spec->attribute(ATTR_PASSES_COUNT, num_passes);
+
+ for (int pass_index = 0; pass_index < num_passes; ++pass_index) {
+ const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index);
+
+ const BufferPass *pass = &buffer_params.passes[pass_index];
+ if (!node_to_image_spec_atttributes(image_spec, pass, attr_name_prefix)) {
+ return false;
}
}
+
+ return true;
}
-/* If sliced is false, splits image into tiles and assigns equal amount of tiles to every render
- * device. If sliced is true, slice image into as much pieces as how many devices are rendering
- * this image. */
-int TileManager::gen_tiles(bool sliced)
+static bool buffer_params_from_image_spec_atttributes(BufferParams *buffer_params,
+ const ImageSpec &image_spec)
{
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
- int2 center = make_int2(image_w / 2, image_h / 2);
-
- int num = preserve_tile_device || sliced ? min(image_h, num_devices) : 1;
- int slice_num = sliced ? num : 1;
- int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
-
- device_free();
- state.render_tiles.clear();
- state.denoising_tiles.clear();
- state.render_tiles.resize(num);
- state.denoising_tiles.resize(num);
- state.tile_stride = tile_w;
- vector<list<int>>::iterator tile_list;
- tile_list = state.render_tiles.begin();
-
- if (tile_order == TILE_HILBERT_SPIRAL) {
- assert(!sliced && slice_overlap == 0);
-
- int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
- state.tiles.resize(tile_w * tile_h);
-
- /* Size of blocks in tiles, must be a power of 2 */
- const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12) ? 8 : 4;
-
- int tiles_per_device = divide_up(tile_w * tile_h, num);
- int cur_device = 0, cur_tiles = 0;
-
- int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
- /* Number of blocks to fill the image */
- int blocks_x = (block_size.x >= image_w) ? 1 : divide_up(image_w, block_size.x);
- int blocks_y = (block_size.y >= image_h) ? 1 : divide_up(image_h, block_size.y);
- int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
- /* Offset of spiral (to keep it centered) */
- int2 offset = make_int2((image_w - n * block_size.x) / 2, (image_h - n * block_size.y) / 2);
- offset = (offset / tile_size) * tile_size; /* Round to tile border. */
-
- int2 block = make_int2(0, 0); /* Current block */
- SpiralDirection prev_dir = DIRECTION_UP, dir = DIRECTION_UP;
- for (int i = 0;;) {
- /* Generate the tiles in the current block. */
- for (int hilbert_index = 0; hilbert_index < hilbert_size * hilbert_size; hilbert_index++) {
- int2 tile, hilbert_pos = hilbert_index_to_pos(hilbert_size, hilbert_index);
- /* Rotate block according to spiral direction. */
- if (prev_dir == DIRECTION_UP && dir == DIRECTION_UP) {
- tile = make_int2(hilbert_pos.y, hilbert_pos.x);
- }
- else if (dir == DIRECTION_LEFT || prev_dir == DIRECTION_LEFT) {
- tile = hilbert_pos;
- }
- else if (dir == DIRECTION_DOWN) {
- tile = make_int2(hilbert_size - 1 - hilbert_pos.y, hilbert_size - 1 - hilbert_pos.x);
- }
- else {
- tile = make_int2(hilbert_size - 1 - hilbert_pos.x, hilbert_size - 1 - hilbert_pos.y);
- }
-
- int2 pos = block * block_size + tile * tile_size + offset;
- /* Only add tiles which are in the image (tiles outside of the image can be generated since
- * the spiral is always square). */
- if (pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
- int w = min(tile_size.x, image_w - pos.x);
- int h = min(tile_size.y, image_h - pos.y);
- int2 ipos = pos / tile_size;
- int idx = ipos.y * tile_w + ipos.x;
- state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER);
- tile_list->push_front(idx);
- cur_tiles++;
-
- if (cur_tiles == tiles_per_device) {
- tile_list++;
- cur_tiles = 0;
- cur_device++;
- }
- }
- }
+ if (!node_from_image_spec_atttributes(buffer_params, image_spec, ATTR_BUFFER_SOCKET_PREFIX)) {
+ return false;
+ }
- /* Stop as soon as the spiral has reached the center block. */
- if (block.x == (n - 1) / 2 && block.y == (n - 1) / 2)
- break;
-
- /* Advance to next block. */
- prev_dir = dir;
- switch (dir) {
- case DIRECTION_UP:
- block.y++;
- if (block.y == (n - i - 1)) {
- dir = DIRECTION_LEFT;
- }
- break;
- case DIRECTION_LEFT:
- block.x++;
- if (block.x == (n - i - 1)) {
- dir = DIRECTION_DOWN;
- }
- break;
- case DIRECTION_DOWN:
- block.y--;
- if (block.y == i) {
- dir = DIRECTION_RIGHT;
- }
- break;
- case DIRECTION_RIGHT:
- block.x--;
- if (block.x == i + 1) {
- dir = DIRECTION_UP;
- i++;
- }
- break;
- }
- }
- return tile_w * tile_h;
+  /* Passes storage is not covered by the node sockets, so "expand" the loop manually. */
+
+ const int num_passes = image_spec.get_int_attribute(ATTR_PASSES_COUNT, 0);
+ if (num_passes == 0) {
+ LOG(ERROR) << "Missing passes count attribute.";
+ return false;
}
- int idx = 0;
- for (int slice = 0; slice < slice_num; slice++) {
- int slice_y = (image_h / slice_num) * slice;
- int slice_h = (slice == slice_num - 1) ? image_h - slice * (image_h / slice_num) :
- image_h / slice_num;
+ for (int pass_index = 0; pass_index < num_passes; ++pass_index) {
+ const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index);
- if (slice_overlap != 0) {
- int slice_y_offset = max(slice_y - slice_overlap, 0);
- slice_h = min(slice_y + slice_h + slice_overlap, image_h) - slice_y_offset;
- slice_y = slice_y_offset;
- }
+ BufferPass pass;
- int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
-
- int tiles_per_device = divide_up(tile_w * tile_h, num);
- int cur_device = 0, cur_tiles = 0;
-
- for (int tile_y = 0; tile_y < tile_h; tile_y++) {
- for (int tile_x = 0; tile_x < tile_w; tile_x++, idx++) {
- int x = tile_x * tile_size.x;
- int y = tile_y * tile_size.y;
- int w = (tile_x == tile_w - 1) ? image_w - x : tile_size.x;
- int h = (tile_y == tile_h - 1) ? slice_h - y : tile_size.y;
-
- state.tiles.push_back(
- Tile(idx, x, y + slice_y, w, h, sliced ? slice : cur_device, Tile::RENDER));
- tile_list->push_back(idx);
-
- if (!sliced) {
- cur_tiles++;
-
- if (cur_tiles == tiles_per_device) {
- /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that
- * case. */
- if (tile_order != TILE_BOTTOM_TO_TOP) {
- tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
- }
- tile_list++;
- cur_tiles = 0;
- cur_device++;
- }
- }
- }
- }
- if (sliced) {
- tile_list++;
+ if (!node_from_image_spec_atttributes(&pass, image_spec, attr_name_prefix)) {
+ return false;
}
+
+ buffer_params->passes.emplace_back(std::move(pass));
}
- return idx;
+ buffer_params->update_passes();
+
+ return true;
}
-void TileManager::gen_render_tiles()
+/* Configure image specification for the given buffer parameters and passes.
+ *
+ * Image channels will be strictly ordered to match the content of the corresponding buffer, and the
+ * metadata will be set so that the render buffers and passes can be reconstructed from it.
+ *
+ * If the tile size is different from (0, 0), the image specification will be configured to use the
+ * given tile size for tiled IO. */
+static bool configure_image_spec_from_buffer(ImageSpec *image_spec,
+ const BufferParams &buffer_params,
+ const int2 tile_size = make_int2(0, 0))
{
- /* Regenerate just the render tiles for progressive render. */
- foreach (Tile &tile, state.tiles) {
- tile.state = Tile::RENDER;
- state.render_tiles[tile.device].push_back(tile.index);
+ const std::vector<std::string> channel_names = exr_channel_names_for_passes(buffer_params);
+ const int num_channels = channel_names.size();
+
+ *image_spec = ImageSpec(
+ buffer_params.width, buffer_params.height, num_channels, TypeDesc::FLOAT);
+
+ image_spec->channelnames = move(channel_names);
+
+ if (!buffer_params_to_image_spec_atttributes(image_spec, buffer_params)) {
+ return false;
+ }
+
+ if (tile_size.x != 0 || tile_size.y != 0) {
+ DCHECK_GT(tile_size.x, 0);
+ DCHECK_GT(tile_size.y, 0);
+
+ image_spec->tile_width = tile_size.x;
+ image_spec->tile_height = tile_size.y;
}
+
+ return true;
}
-void TileManager::set_tiles()
+/* --------------------------------------------------------------------
+ * Tile Manager.
+ */
+
+TileManager::TileManager()
{
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
+  /* Use the process ID to separate different processes.
+   * To ensure uniqueness from within a process, use a combination of the object address and an
+   * instance index. This solves the problem of the same address being re-used for a new object,
+   * and of the counter overflowing while there are still active instances of the
+   * class. */
+ const int tile_manager_id = g_instance_index.fetch_add(1, std::memory_order_relaxed);
+ tile_file_unique_part_ = to_string(system_self_process_id()) + "-" +
+ to_string(reinterpret_cast<uintptr_t>(this)) + "-" +
+ to_string(tile_manager_id);
+}
- state.num_tiles = gen_tiles(!background);
+TileManager::~TileManager()
+{
+}
+
+void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size)
+{
+ VLOG(3) << "Using tile size of " << tile_size;
+
+ close_tile_output();
+
+ tile_size_ = tile_size;
+
+ tile_state_.num_tiles_x = divide_up(params.width, tile_size_.x);
+ tile_state_.num_tiles_y = divide_up(params.height, tile_size_.y);
+ tile_state_.num_tiles = tile_state_.num_tiles_x * tile_state_.num_tiles_y;
+
+ tile_state_.next_tile_index = 0;
+
+ tile_state_.current_tile = Tile();
+}
+
+void TileManager::update(const BufferParams &params, const Scene *scene)
+{
+ DCHECK_NE(params.pass_stride, -1);
+
+ buffer_params_ = params;
- state.buffer.width = image_w;
- state.buffer.height = image_h;
+  /* TODO(sergey): Proper error handling, so that if configuration has failed we don't attempt to
+   * write to a partially configured file. */
+ configure_image_spec_from_buffer(&write_state_.image_spec, buffer_params_, tile_size_);
- state.buffer.full_x = params.full_x / resolution;
- state.buffer.full_y = params.full_y / resolution;
- state.buffer.full_width = max(1, params.full_width / resolution);
- state.buffer.full_height = max(1, params.full_height / resolution);
+ const DenoiseParams denoise_params = scene->integrator->get_denoise_params();
+ node_to_image_spec_atttributes(
+ &write_state_.image_spec, &denoise_params, ATTR_DENOISE_SOCKET_PREFIX);
}
-int TileManager::get_neighbor_index(int index, int neighbor)
+bool TileManager::done()
{
- /* Neighbor indices:
- * 0 1 2
- * 3 4 5
- * 6 7 8
- */
- static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1};
- static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1};
-
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
-
- int num = min(image_h, num_devices);
- int slice_num = !background ? num : 1;
- int slice_h = image_h / slice_num;
-
- int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
- int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
-
- /* Tiles in the state tile list are always indexed from left to right, top to bottom. */
- int nx = (index % tile_w) + dx[neighbor];
- int ny = (index / tile_w) + dy[neighbor];
- if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h * slice_num)
- return -1;
-
- return ny * state.tile_stride + nx;
+ return tile_state_.next_tile_index == tile_state_.num_tiles;
}
-/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state
- * min_state. */
-bool TileManager::check_neighbor_state(int index, Tile::State min_state)
+bool TileManager::next()
{
- if (index < 0 || state.tiles[index].state < min_state) {
+ if (done()) {
return false;
}
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- /* Out-of-bounds tiles don't matter. */
- if (nindex >= 0 && state.tiles[nindex].state < min_state) {
- return false;
- }
- }
+
+ tile_state_.current_tile = get_tile_for_index(tile_state_.next_tile_index);
+
+ ++tile_state_.next_tile_index;
return true;
}
-/* Returns whether the tile should be written (and freed if no denoising is used) instead of
- * updating. */
-bool TileManager::finish_tile(const int index, const bool need_denoise, bool &delete_tile)
+Tile TileManager::get_tile_for_index(int index) const
{
- delete_tile = false;
-
- switch (state.tiles[index].state) {
- case Tile::RENDER: {
- if (!(schedule_denoising && need_denoise)) {
- state.tiles[index].state = Tile::DONE;
- delete_tile = !progressive;
- return true;
- }
- state.tiles[index].state = Tile::RENDERED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been
- * rendered. If yes, it can be denoised. */
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- if (check_neighbor_state(nindex, Tile::RENDERED)) {
- state.tiles[nindex].state = Tile::DENOISE;
- state.denoising_tiles[state.tiles[nindex].device].push_back(nindex);
- }
- }
- return false;
- }
- case Tile::DENOISE: {
- state.tiles[index].state = Tile::DENOISED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been
- * denoised. If yes, it can be freed. */
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- if (check_neighbor_state(nindex, Tile::DENOISED)) {
- state.tiles[nindex].state = Tile::DONE;
- /* Do not delete finished tiles in progressive mode. */
- if (!progressive) {
- /* It can happen that the tile just finished denoising and already can be freed here.
- * However, in that case it still has to be written before deleting, so we can't delete
- * it yet. */
- if (neighbor == 4) {
- delete_tile = true;
- }
- else {
- delete state.tiles[nindex].buffers;
- state.tiles[nindex].buffers = NULL;
- }
- }
- }
- }
- return true;
- }
- default:
- assert(false);
- return true;
+  /* TODO(sergey): Consider using a Hilbert spiral, or maybe even making it configurable. Not sure
+   * this brings a lot of value since this is only applicable to big tiles. */
+
+ const int tile_y = index / tile_state_.num_tiles_x;
+ const int tile_x = index - tile_y * tile_state_.num_tiles_x;
+
+ Tile tile;
+
+ tile.x = tile_x * tile_size_.x;
+ tile.y = tile_y * tile_size_.y;
+ tile.width = tile_size_.x;
+ tile.height = tile_size_.y;
+
+ tile.width = min(tile.width, buffer_params_.width - tile.x);
+ tile.height = min(tile.height, buffer_params_.height - tile.y);
+
+ return tile;
+}
+
+const Tile &TileManager::get_current_tile() const
+{
+ return tile_state_.current_tile;
+}
+
+bool TileManager::open_tile_output()
+{
+ write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
+ to_string(write_state_.tile_file_index) + ".exr");
+
+ write_state_.tile_out = ImageOutput::create(write_state_.filename);
+ if (!write_state_.tile_out) {
+ LOG(ERROR) << "Error creating image output for " << write_state_.filename;
+ return false;
+ }
+
+ if (!write_state_.tile_out->supports("tiles")) {
+ LOG(ERROR) << "Progress tile file format does not support tiling.";
+ return false;
}
+
+ write_state_.tile_out->open(write_state_.filename, write_state_.image_spec);
+ write_state_.num_tiles_written = 0;
+
+ VLOG(3) << "Opened tile file " << write_state_.filename;
+
+ return true;
}
-bool TileManager::next_tile(Tile *&tile, int device, uint tile_types)
+bool TileManager::close_tile_output()
{
- /* Preserve device if requested, unless this is a separate denoising device that just wants to
- * grab any available tile. */
- const bool preserve_device = preserve_tile_device && device < num_devices;
-
- if (tile_types & RenderTile::DENOISE) {
- int tile_index = -1;
- int logical_device = preserve_device ? device : 0;
-
- while (logical_device < state.denoising_tiles.size()) {
- if (state.denoising_tiles[logical_device].empty()) {
- if (preserve_device) {
- break;
- }
- else {
- logical_device++;
- continue;
- }
- }
+ if (!write_state_.tile_out) {
+ return true;
+ }
- tile_index = state.denoising_tiles[logical_device].front();
- state.denoising_tiles[logical_device].pop_front();
- break;
- }
+ const bool success = write_state_.tile_out->close();
+ write_state_.tile_out = nullptr;
- if (tile_index >= 0) {
- tile = &state.tiles[tile_index];
- return true;
- }
+ if (!success) {
+ LOG(ERROR) << "Error closing tile file.";
+ return false;
}
- if (tile_types & RenderTile::PATH_TRACE) {
- int tile_index = -1;
- int logical_device = preserve_device ? device : 0;
-
- while (logical_device < state.render_tiles.size()) {
- if (state.render_tiles[logical_device].empty()) {
- if (preserve_device) {
- break;
- }
- else {
- logical_device++;
- continue;
- }
- }
+ VLOG(3) << "Tile output is closed.";
- tile_index = state.render_tiles[logical_device].front();
- state.render_tiles[logical_device].pop_front();
- break;
+ return true;
+}
+
+bool TileManager::write_tile(const RenderBuffers &tile_buffers)
+{
+ if (!write_state_.tile_out) {
+ if (!open_tile_output()) {
+ return false;
}
+ }
- if (tile_index >= 0) {
- tile = &state.tiles[tile_index];
- return true;
+ DCHECK_EQ(tile_buffers.params.pass_stride, buffer_params_.pass_stride);
+
+ const BufferParams &tile_params = tile_buffers.params;
+
+ vector<float> pixel_storage;
+ const float *pixels = tile_buffers.buffer.data();
+
+ /* Tiled writing expects pixels to contain data for an entire tile. Pad the render buffers with
+ * empty pixels for tiles which are on the image boundary. */
+ if (tile_params.width != tile_size_.x || tile_params.height != tile_size_.y) {
+ const int64_t pass_stride = tile_params.pass_stride;
+ const int64_t src_row_stride = tile_params.width * pass_stride;
+
+ const int64_t dst_row_stride = tile_size_.x * pass_stride;
+ pixel_storage.resize(dst_row_stride * tile_size_.y);
+
+ const float *src = tile_buffers.buffer.data();
+ float *dst = pixel_storage.data();
+ pixels = dst;
+
+ for (int y = 0; y < tile_params.height; ++y, src += src_row_stride, dst += dst_row_stride) {
+ memcpy(dst, src, src_row_stride * sizeof(float));
}
}
- return false;
-}
+ const int tile_x = tile_params.full_x - buffer_params_.full_x;
+ const int tile_y = tile_params.full_y - buffer_params_.full_y;
-bool TileManager::done()
-{
- int end_sample = (range_num_samples == -1) ? num_samples :
- range_start_sample + range_num_samples;
- return (state.resolution_divider == pixel_size) &&
- (state.sample + state.num_samples >= end_sample);
+ VLOG(3) << "Write tile at " << tile_x << ", " << tile_y;
+ if (!write_state_.tile_out->write_tile(tile_x, tile_y, 0, TypeDesc::FLOAT, pixels)) {
+ LOG(ERROR) << "Error writing tile " << write_state_.tile_out->geterror();
+ }
+
+ ++write_state_.num_tiles_written;
+
+ return true;
}
-bool TileManager::has_tiles()
+void TileManager::finish_write_tiles()
{
- foreach (Tile &tile, state.tiles) {
- if (tile.state != Tile::DONE) {
- return true;
+ if (!write_state_.tile_out) {
+    /* None of the tiles were written, hence the file was not created.
+     * Avoid creating a fully empty file since it is redundant. */
+ return;
+ }
+
+  /* EXR expects all tiles to be present in the file, so explicitly write missing tiles as all-zero. */
+ if (write_state_.num_tiles_written < tile_state_.num_tiles) {
+ vector<float> pixel_storage(tile_size_.x * tile_size_.y * buffer_params_.pass_stride);
+
+ for (int tile_index = write_state_.num_tiles_written; tile_index < tile_state_.num_tiles;
+ ++tile_index) {
+ const Tile tile = get_tile_for_index(tile_index);
+
+ VLOG(3) << "Write dummy tile at " << tile.x << ", " << tile.y;
+
+ write_state_.tile_out->write_tile(tile.x, tile.y, 0, TypeDesc::FLOAT, pixel_storage.data());
}
}
- return false;
+
+ close_tile_output();
+
+ if (full_buffer_written_cb) {
+ full_buffer_written_cb(write_state_.filename);
+ }
+
+ /* Advance the counter upon explicit finish of the file.
+ * Makes it possible to re-use tile manager for another scene, and avoids unnecessary increments
+ * of the tile-file-within-session index. */
+ ++write_state_.tile_file_index;
+
+ write_state_.filename = "";
}
-bool TileManager::next()
+bool TileManager::read_full_buffer_from_disk(const string_view filename,
+ RenderBuffers *buffers,
+ DenoiseParams *denoise_params)
{
- if (done())
+ unique_ptr<ImageInput> in(ImageInput::open(filename));
+ if (!in) {
+ LOG(ERROR) << "Error opening tile file " << filename;
return false;
+ }
+
+ const ImageSpec &image_spec = in->spec();
- if (progressive && state.resolution_divider > pixel_size) {
- state.sample = 0;
- state.resolution_divider = max(state.resolution_divider / 2, pixel_size);
- state.num_samples = 1;
- set_tiles();
+ BufferParams buffer_params;
+ if (!buffer_params_from_image_spec_atttributes(&buffer_params, image_spec)) {
+ return false;
}
- else {
- state.sample++;
+ buffers->reset(buffer_params);
- if (progressive)
- state.num_samples = 1;
- else if (range_num_samples == -1)
- state.num_samples = num_samples;
- else
- state.num_samples = range_num_samples;
+ if (!node_from_image_spec_atttributes(denoise_params, image_spec, ATTR_DENOISE_SOCKET_PREFIX)) {
+ return false;
+ }
- state.resolution_divider = pixel_size;
+ if (!in->read_image(TypeDesc::FLOAT, buffers->buffer.data())) {
+ LOG(ERROR) << "Error reading pixels from the tile file " << in->geterror();
+ return false;
+ }
- if (state.sample == range_start_sample) {
- set_tiles();
- }
- else {
- gen_render_tiles();
- }
+ if (!in->close()) {
+ LOG(ERROR) << "Error closing tile file " << in->geterror();
+ return false;
}
return true;
}
-int TileManager::get_num_effective_samples()
-{
- return (range_num_samples == -1) ? num_samples : range_num_samples;
-}
-
CCL_NAMESPACE_END
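
Editor's note: a minimal sketch of the channel-naming scheme used by exr_channel_names_for_passes() above: a fixed-width, zero-padded pass-index prefix keeps EXR's alphabetical channel ordering aligned with the pass order in the render buffer. The pass names and component counts below are example values only, not taken from the commit.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main()
{
  const char *component_suffixes[] = {"R", "G", "B", "A"};
  /* Example passes: name and number of components. */
  const std::vector<std::pair<std::string, int>> passes = {{"Combined", 4}, {"Depth", 1}};

  int pass_index = 0;
  for (const auto &pass : passes) {
    char prefix[16];
    snprintf(prefix, sizeof(prefix), "%08d", pass_index++);
    for (int i = 0; i < pass.second; ++i) {
      /* Prints "00000000Combined.R" ... "00000001Depth.R". */
      printf("%s%s.%s\n", prefix, pass.first.c_str(), component_suffixes[i]);
    }
  }
  return 0;
}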
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 790a56f9445..124d0b3652c 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -14,159 +14,151 @@
* limitations under the License.
*/
-#ifndef __TILE_H__
-#define __TILE_H__
-
-#include <limits.h>
+#pragma once
#include "render/buffers.h"
-#include "util/util_list.h"
+#include "util/util_image.h"
+#include "util/util_string.h"
+#include "util/util_unique_ptr.h"
CCL_NAMESPACE_BEGIN
-/* Tile */
+class DenoiseParams;
+class Scene;
+
+/* --------------------------------------------------------------------
+ * Tile.
+ */
class Tile {
public:
- int index;
- int x, y, w, h;
- int device;
- /* RENDER: The tile has to be rendered.
- * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors).
- * DENOISE: The tile can be denoised now.
- * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors).
- * DONE: The tile is finished and has been freed. */
- typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State;
- State state;
- RenderBuffers *buffers;
+ int x = 0, y = 0;
+ int width = 0, height = 0;
Tile()
{
}
-
- Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER)
- : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL)
- {
- }
};
-/* Tile order */
-
-/* Note: this should match enum_tile_order in properties.py */
-enum TileOrder {
- TILE_CENTER = 0,
- TILE_RIGHT_TO_LEFT = 1,
- TILE_LEFT_TO_RIGHT = 2,
- TILE_TOP_TO_BOTTOM = 3,
- TILE_BOTTOM_TO_TOP = 4,
- TILE_HILBERT_SPIRAL = 5,
-};
-
-/* Tile Manager */
+/* --------------------------------------------------------------------
+ * Tile Manager.
+ */
class TileManager {
public:
- BufferParams params;
-
- struct State {
- vector<Tile> tiles;
- int tile_stride;
- BufferParams buffer;
- int sample;
- int num_samples;
- int resolution_divider;
- int num_tiles;
-
- /* Total samples over all pixels: Generally num_samples*num_pixels,
- * but can be higher due to the initial resolution division for previews. */
- uint64_t total_pixel_samples;
-
- /* These lists contain the indices of the tiles to be rendered/denoised and are used
- * when acquiring a new tile for the device.
- * Each list in each vector is for one logical device. */
- vector<list<int>> render_tiles;
- vector<list<int>> denoising_tiles;
- } state;
-
- int num_samples;
- int slice_overlap;
-
- TileManager(bool progressive,
- int num_samples,
- int2 tile_size,
- int start_resolution,
- bool preserve_tile_device,
- bool background,
- TileOrder tile_order,
- int num_devices = 1,
- int pixel_size = 1);
+  /* This callback is invoked whenever the on-disk tile storage file is closed after writing. */
+ function<void(string_view)> full_buffer_written_cb;
+
+ TileManager();
~TileManager();
- void device_free();
- void reset(BufferParams &params, int num_samples);
- void set_samples(int num_samples);
+ TileManager(const TileManager &other) = delete;
+ TileManager(TileManager &&other) noexcept = delete;
+ TileManager &operator=(const TileManager &other) = delete;
+ TileManager &operator=(TileManager &&other) = delete;
+
+  /* Reset the current progress and start a new render of the full frame, split into tiles of the
+   * given size.
+   * Only touches scheduling-related state of the tile manager. */
+  /* TODO(sergey): Consider using tile area instead of exact size to help deal with extreme
+   * cases of stretched renders. */
+ void reset_scheduling(const BufferParams &params, int2 tile_size);
+
+ /* Update for the known buffer passes and scene parameters.
+   * Will store all parameters needed to access the buffers outside of the scene graph. */
+ void update(const BufferParams &params, const Scene *scene);
+
+ inline int get_num_tiles() const
+ {
+ return tile_state_.num_tiles;
+ }
+
+ inline bool has_multiple_tiles() const
+ {
+ return tile_state_.num_tiles > 1;
+ }
+
bool next();
- bool next_tile(Tile *&tile, int device, uint tile_types);
- bool finish_tile(const int index, const bool need_denoise, bool &delete_tile);
bool done();
- bool has_tiles();
- void set_tile_order(TileOrder tile_order_)
+ const Tile &get_current_tile() const;
+
+ /* Write render buffer of a tile to a file on disk.
+ *
+   * Opens the file for writing when the first tile is written.
+ *
+ * Returns true on success. */
+ bool write_tile(const RenderBuffers &tile_buffers);
+
+ /* Inform the tile manager that no more tiles will be written to disk.
+ * The file will be considered final, all handles to it will be closed. */
+ void finish_write_tiles();
+
+  /* Check whether any tile has been written to disk. */
+ inline bool has_written_tiles() const
{
- tile_order = tile_order_;
+ return write_state_.num_tiles_written != 0;
}
- int get_neighbor_index(int index, int neighbor);
- bool check_neighbor_state(int index, Tile::State state);
+ /* Read full frame render buffer from tiles file on disk.
+ *
+ * Returns true on success. */
+ bool read_full_buffer_from_disk(string_view filename,
+ RenderBuffers *buffers,
+ DenoiseParams *denoise_params);
- /* ** Sample range rendering. ** */
+ protected:
+ /* Get tile configuration for its index.
+   * The tile index must be within [0, tile_state_.num_tiles). */
+ Tile get_tile_for_index(int index) const;
- /* Start sample in the range. */
- int range_start_sample;
+ bool open_tile_output();
+ bool close_tile_output();
- /* Number to samples in the rendering range. */
- int range_num_samples;
+ /* Part of an on-disk tile file name which avoids conflicts between several Cycles instances or
+ * several sessions. */
+ string tile_file_unique_part_;
- /* Get number of actual samples to render. */
- int get_num_effective_samples();
+ int2 tile_size_ = make_int2(0, 0);
- /* Schedule tiles for denoising after they've been rendered. */
- bool schedule_denoising;
+ BufferParams buffer_params_;
- protected:
- void set_tiles();
-
- bool progressive;
- int2 tile_size;
- TileOrder tile_order;
- int start_resolution;
- int pixel_size;
- int num_devices;
-
- /* in some cases it is important that the same tile will be returned for the same
- * device it was originally generated for (i.e. viewport rendering when buffer is
- * allocating once for tile and then always used by it)
- *
- * in other cases any tile could be handled by any device (i.e. final rendering
- * without progressive refine)
- */
- bool preserve_tile_device;
-
- /* for background render tiles should exactly match render parts generated from
- * blender side, which means image first gets split into tiles and then tiles are
- * assigning to render devices
- *
- * however viewport rendering expects tiles to be allocated in a special way,
- * meaning image is being sliced horizontally first and every device handles
- * its own slice
- */
- bool background;
-
- /* Generate tile list, return number of tiles. */
- int gen_tiles(bool sliced);
- void gen_render_tiles();
+ /* Tile scheduling state. */
+ struct {
+ int num_tiles_x = 0;
+ int num_tiles_y = 0;
+ int num_tiles = 0;
+
+ int next_tile_index;
+
+ Tile current_tile;
+ } tile_state_;
+
+ /* State of tiles writing to a file on disk. */
+ struct {
+ /* Index of a tile file used during the current session.
+ * This number is used for the file name construction, making it possible to render several
+ * scenes throughout duration of the session and keep all results available for later read
+ * access. */
+ int tile_file_index = 0;
+
+ string filename;
+
+ /* Specification of the tile image which corresponds to the buffer parameters.
+     * Contains channels configured according to the passes configuration of the path tracer.
+     *
+     * Output images are saved using this specification, and input images are expected to have a
+     * matching specification. */
+ ImageSpec image_spec;
+
+ /* Output handle for the tile file.
+ *
+     * This file cannot be closed until all tiles have been provided, so the handle is stored in
+ * the state and is created whenever writing is requested. */
+ unique_ptr<ImageOutput> tile_out;
+
+ int num_tiles_written = 0;
+ } write_state_;
};
CCL_NAMESPACE_END
-
-#endif /* __TILE_H__ */
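
Editor's note: the scheduling declared above is a simple row-major walk over the tile grid. The sketch below reproduces the math of reset_scheduling() and get_tile_for_index() (divide_up of the image size, row-major index decomposition, clamping at the image border) with arbitrary example dimensions; it is an illustration, not the TileManager itself.

#include <algorithm>
#include <cstdio>

static int divide_up(int x, int y)
{
  return (x + y - 1) / y;
}

int main()
{
  const int width = 1920, height = 1080;  /* Example full-frame size. */
  const int tile_w = 1024, tile_h = 1024; /* Example tile size. */

  const int num_tiles_x = divide_up(width, tile_w);
  const int num_tiles_y = divide_up(height, tile_h);

  for (int index = 0; index < num_tiles_x * num_tiles_y; ++index) {
    /* Row-major decomposition of the linear tile index. */
    const int tile_y = index / num_tiles_x;
    const int tile_x = index - tile_y * num_tiles_x;

    const int x = tile_x * tile_w;
    const int y = tile_y * tile_h;

    /* Tiles on the right/bottom border are clamped to the image size. */
    const int w = std::min(tile_w, width - x);
    const int h = std::min(tile_h, height - y);

    printf("tile %d: %dx%d at (%d, %d)\n", index, w, h, x, y);
  }
  return 0;
}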
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 65a692acd03..0f6b435813f 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -32,6 +32,7 @@ set(INC
set(ALL_CYCLES_LIBRARIES
cycles_device
cycles_kernel
+ cycles_integrator
cycles_render
cycles_bvh
cycles_graph
@@ -45,8 +46,12 @@ include_directories(${INC})
cycles_link_directories()
set(SRC
+ integrator_adaptive_sampling_test.cpp
+ integrator_render_scheduler_test.cpp
+ integrator_tile_test.cpp
render_graph_finalize_test.cpp
util_aligned_malloc_test.cpp
+ util_math_test.cpp
util_path_test.cpp
util_string_test.cpp
util_task_test.cpp
diff --git a/intern/cycles/test/integrator_adaptive_sampling_test.cpp b/intern/cycles/test/integrator_adaptive_sampling_test.cpp
new file mode 100644
index 00000000000..3ed6a23125d
--- /dev/null
+++ b/intern/cycles/test/integrator_adaptive_sampling_test.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/adaptive_sampling.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(AdaptiveSampling, schedule_samples)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+ adaptive_sampling.min_samples = 0;
+ adaptive_sampling.adaptive_step = 4;
+
+ for (int sample = 2; sample < 32; ++sample) {
+ for (int num_samples = 8; num_samples < 32; ++num_samples) {
+ const int num_samples_aligned = adaptive_sampling.align_samples(sample, num_samples);
+ /* NOTE: `sample + num_samples_aligned` is the number of samples after rendering, so need
+ * to convert this to the 0-based index of the last sample. */
+ EXPECT_TRUE(adaptive_sampling.need_filter(sample + num_samples_aligned - 1));
+ }
+ }
+}
+
+TEST(AdaptiveSampling, align_samples)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+  adaptive_sampling.min_samples = 11 /* rounded value of sqrt(128) */;
+ adaptive_sampling.adaptive_step = 4;
+
+ /* Filtering will happen at the following samples:
+ * 15, 19, 23, 27, 31, 35, 39, 43 */
+
+ /* Requested sample and number of samples will result in number of samples lower than
+ * `min_samples`. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 4), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 7), 7);
+
+ /* Request number of samples higher than the minimum samples before filter, but prior to the
+ * first sample at which filtering will happen. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 15), 15);
+
+ /* When rendering many samples from the very beginning, limit number of samples by the first
+ * sample at which filtering is to happen. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 16), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 17), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 20), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 60), 16);
+
+ /* Similar to above, but start sample is not 0. */
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 8), 7);
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 20), 7);
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 60), 7);
+
+ /* Start sample is past the minimum required samples, but prior to the first filter sample. */
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 6), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 20), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 60), 4);
+
+ /* Start sample is the sample which is to be filtered. */
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 4), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 6), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 10), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(58, 2), 2);
+
+ /* Start sample is past the sample which is to be filtered. */
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 3), 3);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 4), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 5), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 10), 4);
+
+ /* Should never exceed requested number of samples. */
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 2), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 2), 2);
+ EXPECT_EQ(adaptive_sampling.align_samples(17, 2), 2);
+ EXPECT_EQ(adaptive_sampling.align_samples(18, 2), 2);
+}
+
+TEST(AdaptiveSampling, need_filter)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+  adaptive_sampling.min_samples = 11 /* rounded value of sqrt(128) */;
+ adaptive_sampling.adaptive_step = 4;
+
+ const vector<int> expected_samples_to_filter = {
+ {15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59}};
+
+ vector<int> actual_samples_to_filter;
+ for (int sample = 0; sample < 60; ++sample) {
+ if (adaptive_sampling.need_filter(sample)) {
+ actual_samples_to_filter.push_back(sample);
+ }
+ }
+
+ EXPECT_EQ(actual_samples_to_filter, expected_samples_to_filter);
+}
+
+CCL_NAMESPACE_END
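
Editor's note: one way to read the expectations in this test is that filtering happens on the last sample of every adaptive_step-sized group, but only once min_samples have been rendered. The sketch below is a plausible need_filter() consistent with those values, not the actual integrator code.

#include <cstdio>

struct AdaptiveSamplingSketch {
  int min_samples = 0;
  int adaptive_step = 4;

  /* 0-based sample index: with min_samples = 11 and adaptive_step = 4 this yields
   * 15, 19, 23, ... 59, matching expected_samples_to_filter above. */
  bool need_filter(int sample) const
  {
    return sample > min_samples && (sample % adaptive_step) == adaptive_step - 1;
  }
};

int main()
{
  AdaptiveSamplingSketch sampling;
  sampling.min_samples = 11;
  sampling.adaptive_step = 4;

  for (int sample = 0; sample < 60; ++sample) {
    if (sampling.need_filter(sample)) {
      printf("%d ", sample); /* 15 19 23 ... 59 */
    }
  }
  printf("\n");
  return 0;
}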
diff --git a/intern/cycles/test/integrator_render_scheduler_test.cpp b/intern/cycles/test/integrator_render_scheduler_test.cpp
new file mode 100644
index 00000000000..b4efbc2d1a7
--- /dev/null
+++ b/intern/cycles/test/integrator_render_scheduler_test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/render_scheduler.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(IntegratorRenderScheduler, calculate_resolution_divider_for_resolution)
+{
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 1920), 1);
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 960), 2);
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 480), 4);
+}
+
+TEST(IntegratorRenderScheduler, calculate_resolution_for_divider)
+{
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 1), 1440);
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 2), 720);
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 4), 360);
+}
+
+CCL_NAMESPACE_END
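
Editor's note: the values in this test follow from treating the "resolution" as the square root of the pixel area: sqrt(1920 * 1080) = 1440, so a divider of 2 gives 720, and a target of 960 needs a divider of 2. The sketch below reproduces that relationship; it matches the expected values but is not the render scheduler implementation itself.

#include <cmath>
#include <cstdio>

/* Effective preview resolution for a given power-of-two divider. */
static int resolution_for_divider(int width, int height, int divider)
{
  const int resolution = (int)std::lround(std::sqrt((double)width * height)); /* 1440 for 1920x1080. */
  return resolution / divider;
}

/* Smallest power-of-two divider whose effective resolution does not exceed the target. */
static int divider_for_resolution(int width, int height, int target_resolution)
{
  int divider = 1;
  while (resolution_for_divider(width, height, divider) > target_resolution) {
    divider *= 2;
  }
  return divider;
}

int main()
{
  printf("%d\n", resolution_for_divider(1920, 1080, 2));   /* 720 */
  printf("%d\n", divider_for_resolution(1920, 1080, 960)); /* 2 */
  printf("%d\n", divider_for_resolution(1920, 1080, 480)); /* 4 */
  return 0;
}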
diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp
new file mode 100644
index 00000000000..5bb57b48c3c
--- /dev/null
+++ b/intern/cycles/test/integrator_tile_test.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/tile.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(tile_calculate_best_size, Basic)
+{
+ /* Make sure CPU-like case is handled properly. */
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1), TileSize(1, 1, 1));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1), TileSize(1, 1, 1));
+
+ /* Enough path states to fit an entire image with all samples. */
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080),
+ TileSize(1920, 1080, 1));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100),
+ TileSize(1920, 1080, 100));
+}
+
+TEST(tile_calculate_best_size, Extreme)
+{
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072), TileSize(1, 1, 512));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072), TileSize(1, 1, 1024));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072), TileSize(1, 1, 4096));
+
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024),
+ TileSize(1, 1, 1024));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index da9b29314a7..19c211fe5f7 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -181,7 +181,7 @@ class RenderGraph : public testing::Test {
util_logging_start();
util_logging_verbosity_set(1);
- device_cpu = Device::create(device_info, stats, profiler, true);
+ device_cpu = Device::create(device_info, stats, profiler);
scene = new Scene(scene_params, device_cpu);
}
diff --git a/intern/cycles/test/util_math_test.cpp b/intern/cycles/test/util_math_test.cpp
new file mode 100644
index 00000000000..b6ce3ef0cf3
--- /dev/null
+++ b/intern/cycles/test/util_math_test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(math, next_power_of_two)
+{
+ EXPECT_EQ(next_power_of_two(0), 1);
+ EXPECT_EQ(next_power_of_two(1), 2);
+ EXPECT_EQ(next_power_of_two(2), 4);
+ EXPECT_EQ(next_power_of_two(3), 4);
+ EXPECT_EQ(next_power_of_two(4), 8);
+}
+
+TEST(math, prev_power_of_two)
+{
+ EXPECT_EQ(prev_power_of_two(0), 0);
+
+ EXPECT_EQ(prev_power_of_two(1), 1);
+ EXPECT_EQ(prev_power_of_two(2), 1);
+
+ EXPECT_EQ(prev_power_of_two(3), 2);
+ EXPECT_EQ(prev_power_of_two(4), 2);
+
+ EXPECT_EQ(prev_power_of_two(5), 4);
+ EXPECT_EQ(prev_power_of_two(6), 4);
+ EXPECT_EQ(prev_power_of_two(7), 4);
+ EXPECT_EQ(prev_power_of_two(8), 4);
+}
+
+TEST(math, reverse_integer_bits)
+{
+ EXPECT_EQ(reverse_integer_bits(0xFFFFFFFF), 0xFFFFFFFF);
+ EXPECT_EQ(reverse_integer_bits(0x00000000), 0x00000000);
+ EXPECT_EQ(reverse_integer_bits(0x1), 0x80000000);
+ EXPECT_EQ(reverse_integer_bits(0x80000000), 0x1);
+ EXPECT_EQ(reverse_integer_bits(0xFFFF0000), 0x0000FFFF);
+ EXPECT_EQ(reverse_integer_bits(0x0000FFFF), 0xFFFF0000);
+ EXPECT_EQ(reverse_integer_bits(0x00FF0000), 0x0000FF00);
+ EXPECT_EQ(reverse_integer_bits(0x0000FF00), 0x00FF0000);
+ EXPECT_EQ(reverse_integer_bits(0xAAAAAAAA), 0x55555555);
+}
+
+CCL_NAMESPACE_END
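
Editor's note: reference behaviour for the helpers exercised above, written as straightforward loops. The actual util_math.h versions presumably use bit tricks, so treat these as documentation of the expected values rather than the real implementations.

#include <cstdint>
#include <cstdio>

/* Smallest power of two strictly greater than x (so next_power_of_two(4) == 8). */
static uint32_t next_power_of_two_ref(uint32_t x)
{
  uint32_t r = 1;
  while (r <= x) {
    r *= 2;
  }
  return r;
}

/* Largest power of two strictly smaller than x (so prev_power_of_two(8) == 4); 0 and 1 map to themselves. */
static uint32_t prev_power_of_two_ref(uint32_t x)
{
  if (x < 2) {
    return x;
  }
  uint32_t r = 1;
  while (r * 2 < x) {
    r *= 2;
  }
  return r;
}

/* Bit i of the input becomes bit 31 - i of the output. */
static uint32_t reverse_integer_bits_ref(uint32_t x)
{
  uint32_t r = 0;
  for (int i = 0; i < 32; ++i) {
    r = (r << 1) | ((x >> i) & 1);
  }
  return r;
}

int main()
{
  printf("%u %u 0x%08x\n",
         next_power_of_two_ref(3),       /* 4 */
         prev_power_of_two_ref(8),       /* 4 */
         reverse_integer_bits_ref(0x1)); /* 0x80000000 */
  return 0;
}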
diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp
index 97f8daa65de..c9022d1b132 100644
--- a/intern/cycles/test/util_string_test.cpp
+++ b/intern/cycles/test/util_string_test.cpp
@@ -281,4 +281,40 @@ TEST(util_string_remove_trademark, r_space_middle)
EXPECT_EQ(str, "foo bar baz");
}
+/* ******** Tests for string_startswith() ******** */
+
+TEST(string_startswith, basic)
+{
+ EXPECT_TRUE(string_startswith("", ""));
+
+ EXPECT_FALSE(string_startswith("", "World"));
+ EXPECT_TRUE(string_startswith("Hello", ""));
+
+ EXPECT_FALSE(string_startswith("Hello", "World"));
+
+ EXPECT_TRUE(string_startswith("Hello", "Hello"));
+ EXPECT_TRUE(string_startswith("Hello", "He"));
+ EXPECT_TRUE(string_startswith("Hello", "H"));
+
+ EXPECT_FALSE(string_startswith("Hello", "e"));
+ EXPECT_FALSE(string_startswith("Hello", "HelloWorld"));
+}
+
+TEST(string_endswith, basic)
+{
+ EXPECT_TRUE(string_endswith("", ""));
+
+ EXPECT_FALSE(string_endswith("", "World"));
+ EXPECT_TRUE(string_endswith("Hello", ""));
+
+ EXPECT_FALSE(string_endswith("Hello", "World"));
+
+ EXPECT_TRUE(string_endswith("Hello", "Hello"));
+ EXPECT_TRUE(string_endswith("Hello", "lo"));
+ EXPECT_TRUE(string_endswith("Hello", "o"));
+
+ EXPECT_FALSE(string_endswith("Hello", "e"));
+ EXPECT_FALSE(string_endswith("Hello", "WorldHello"));
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 13d177d2b25..de17efafcf2 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -34,56 +34,6 @@
#else /* __KERNEL_GPU__ */
-# ifdef __KERNEL_OPENCL__
-
-/* Float atomics implementation credits:
- * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
- */
-ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source,
- const float operand)
-{
- union {
- unsigned int int_value;
- float float_value;
- } new_value;
- union {
- unsigned int int_value;
- float float_value;
- } prev_value;
- do {
- prev_value.float_value = *source;
- new_value.float_value = prev_value.float_value + operand;
- } while (atomic_cmpxchg((volatile ccl_global unsigned int *)source,
- prev_value.int_value,
- new_value.int_value) != prev_value.int_value);
- return new_value.float_value;
-}
-
-ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
- const float old_val,
- const float new_val)
-{
- union {
- unsigned int int_value;
- float float_value;
- } new_value, prev_value, result;
- prev_value.float_value = old_val;
- new_value.float_value = new_val;
- result.int_value = atomic_cmpxchg(
- (volatile ccl_global unsigned int *)dest, prev_value.int_value, new_value.int_value);
- return result.float_value;
-}
-
-# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
-# define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
-# define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
-# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
-
-# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
-# define ccl_barrier(flags) barrier(flags)
-
-# endif /* __KERNEL_OPENCL__ */
-
# ifdef __KERNEL_CUDA__
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 74ecefa1917..1d598725c84 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -26,13 +26,7 @@
CCL_NAMESPACE_BEGIN
DebugFlags::CPU::CPU()
- : avx2(true),
- avx(true),
- sse41(true),
- sse3(true),
- sse2(true),
- bvh_layout(BVH_LAYOUT_AUTO),
- split_kernel(false)
+ : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO)
{
reset();
}
@@ -58,11 +52,9 @@ void DebugFlags::CPU::reset()
#undef CHECK_CPU_FLAGS
bvh_layout = BVH_LAYOUT_AUTO;
-
- split_kernel = false;
}
-DebugFlags::CUDA::CUDA() : adaptive_compile(false), split_kernel(false)
+DebugFlags::CUDA::CUDA() : adaptive_compile(false)
{
reset();
}
@@ -71,8 +63,6 @@ void DebugFlags::CUDA::reset()
{
if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
-
- split_kernel = false;
}
DebugFlags::OptiX::OptiX()
@@ -82,42 +72,7 @@ DebugFlags::OptiX::OptiX()
void DebugFlags::OptiX::reset()
{
- cuda_streams = 1;
- curves_api = false;
-}
-
-DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
-{
- reset();
-}
-
-void DebugFlags::OpenCL::reset()
-{
- /* Initialize device type from environment variables. */
- device_type = DebugFlags::OpenCL::DEVICE_ALL;
- char *device = getenv("CYCLES_OPENCL_TEST");
- if (device) {
- if (strcmp(device, "NONE") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_NONE;
- }
- else if (strcmp(device, "ALL") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_ALL;
- }
- else if (strcmp(device, "DEFAULT") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_DEFAULT;
- }
- else if (strcmp(device, "CPU") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_CPU;
- }
- else if (strcmp(device, "GPU") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_GPU;
- }
- else if (strcmp(device, "ACCELERATOR") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR;
- }
- }
- /* Initialize other flags from environment variables. */
- debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
+ use_debug = false;
}
DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false)
@@ -131,7 +86,6 @@ void DebugFlags::reset()
cpu.reset();
cuda.reset();
optix.reset();
- opencl.reset();
}
std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
@@ -142,40 +96,13 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
<< " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
<< " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
- << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"
- << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
+ << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n";
os << "CUDA flags:\n"
<< " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
os << "OptiX flags:\n"
- << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n";
-
- const char *opencl_device_type;
- switch (debug_flags.opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- opencl_device_type = "NONE";
- break;
- case DebugFlags::OpenCL::DEVICE_ALL:
- opencl_device_type = "ALL";
- break;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- opencl_device_type = "DEFAULT";
- break;
- case DebugFlags::OpenCL::DEVICE_CPU:
- opencl_device_type = "CPU";
- break;
- case DebugFlags::OpenCL::DEVICE_GPU:
- opencl_device_type = "GPU";
- break;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- opencl_device_type = "ACCELERATOR";
- break;
- }
- os << "OpenCL flags:\n"
- << " Device type : " << opencl_device_type << "\n"
- << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n"
- << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
+ << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n";
return os;
}
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index f7e53f90f74..99e2723180c 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -79,9 +79,6 @@ class DebugFlags {
* CPUs and GPUs can be selected here instead.
*/
BVHLayout bvh_layout;
-
- /* Whether split kernel is used */
- bool split_kernel;
};
/* Descriptor of CUDA feature-set to be used. */
@@ -94,9 +91,6 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not.
* Requires the CUDA Toolkit and only works on Linux atm. */
bool adaptive_compile;
-
- /* Whether split kernel is used */
- bool split_kernel;
};
/* Descriptor of OptiX feature-set to be used. */
@@ -106,61 +100,9 @@ class DebugFlags {
/* Reset flags to their defaults. */
void reset();
- /* Number of CUDA streams to launch kernels concurrently from. */
- int cuda_streams;
-
- /* Use OptiX curves API for hair instead of custom implementation. */
- bool curves_api;
- };
-
- /* Descriptor of OpenCL feature-set to be used. */
- struct OpenCL {
- OpenCL();
-
- /* Reset flags to their defaults. */
- void reset();
-
- /* Available device types.
- * Only gives a hint which devices to let user to choose from, does not
- * try to use any sort of optimal device or so.
- */
- enum DeviceType {
- /* None of OpenCL devices will be used. */
- DEVICE_NONE,
- /* All OpenCL devices will be used. */
- DEVICE_ALL,
- /* Default system OpenCL device will be used. */
- DEVICE_DEFAULT,
- /* Host processor will be used. */
- DEVICE_CPU,
- /* GPU devices will be used. */
- DEVICE_GPU,
- /* Dedicated OpenCL accelerator device will be used. */
- DEVICE_ACCELERATOR,
- };
-
- /* Available kernel types. */
- enum KernelType {
- /* Do automated guess which kernel to use, based on the officially
- * supported GPUs and such.
- */
- KERNEL_DEFAULT,
- /* Force mega kernel to be used. */
- KERNEL_MEGA,
- /* Force split kernel to be used. */
- KERNEL_SPLIT,
- };
-
- /* Requested device type. */
- DeviceType device_type;
-
- /* Use debug version of the kernel. */
- bool debug;
-
- /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all
- * devices. */
- /* Artificial memory limit in bytes (0 if disabled). */
- size_t mem_limit;
+ /* Load the OptiX module with debug capabilities. This will lower the logging verbosity
+ * level, enable validations, and lower the optimization level. */
+ bool use_debug;
};
/* Get instance of debug flags registry. */
@@ -182,9 +124,6 @@ class DebugFlags {
/* Requested OptiX flags. */
OptiX optix;
- /* Requested OpenCL flags. */
- OpenCL opencl;
-
private:
DebugFlags();
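
The OpenCL debug knobs are gone and the OptiX descriptor now carries a single `use_debug` toggle. A minimal, hypothetical sketch of flipping it from host code, assuming the `DebugFlags()` singleton accessor declared in this header (the function name below is made up for illustration):

    /* Hypothetical usage sketch, not part of the patch. */
    #include "util/util_debug.h"

    void request_optix_debug_kernels()
    {
      /* Compile OptiX kernels with validations enabled and reduced optimization. */
      DebugFlags().optix.use_debug = true;
    }
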
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index 0a239a944a5..9b1698d461a 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -43,9 +43,9 @@
# define ccl_local_param
# define ccl_private
# define ccl_restrict __restrict
-# define ccl_ref &
# define ccl_optional_struct_init
# define ccl_loop_no_unroll
+# define ccl_attr_maybe_unused [[maybe_unused]]
# define __KERNEL_WITH_SSE_ALIGN__
# if defined(_WIN32) && !defined(FREE_WINDOWS)
@@ -62,7 +62,6 @@
# define ccl_may_alias
# define ccl_always_inline __forceinline
# define ccl_never_inline __declspec(noinline)
-# define ccl_maybe_unused
# else /* _WIN32 && !FREE_WINDOWS */
# define ccl_device_inline static inline __attribute__((always_inline))
# define ccl_device_forceinline static inline __attribute__((always_inline))
@@ -74,7 +73,6 @@
# define ccl_may_alias __attribute__((__may_alias__))
# define ccl_always_inline __attribute__((always_inline))
# define ccl_never_inline __attribute__((noinline))
-# define ccl_maybe_unused __attribute__((used))
# endif /* _WIN32 && !FREE_WINDOWS */
/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index a8d4ee75e20..d9edfec5da3 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -28,14 +28,8 @@ CCL_NAMESPACE_BEGIN
/* Half Floats */
-#ifdef __KERNEL_OPENCL__
-
-# define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h);
-
-#else
-
/* CUDA has its own half data type, no need to define them. */
-# ifndef __KERNEL_CUDA__
+#ifndef __KERNEL_CUDA__
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
* unsigned shorts. */
class half {
@@ -59,27 +53,27 @@ class half {
private:
unsigned short v;
};
-# endif
+#endif
struct half4 {
half x, y, z, w;
};
-# ifdef __KERNEL_CUDA__
+#ifdef __KERNEL_CUDA__
-ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f)
{
- h[0] = __float2half(f.x * scale);
- h[1] = __float2half(f.y * scale);
- h[2] = __float2half(f.z * scale);
- h[3] = __float2half(f.w * scale);
+ h[0] = __float2half(f.x);
+ h[1] = __float2half(f.y);
+ h[2] = __float2half(f.z);
+ h[3] = __float2half(f.w);
}
-# else
+#else
-ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f)
{
-# ifndef __KERNEL_SSE2__
+# ifndef __KERNEL_SSE2__
for (int i = 0; i < 4; i++) {
/* optimized float to half for pixels:
* assumes no negative, no nan, no inf, and sets denormal to 0 */
@@ -87,8 +81,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
uint i;
float f;
} in;
- float fscale = f[i] * scale;
- in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f;
+ in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f;
int x = in.i;
int absolute = x & 0x7FFFFFFF;
@@ -98,23 +91,22 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
h[i] = (rshift & 0x7FFF);
}
-# else
+# else
/* same as above with SSE */
- ssef fscale = load4f(f) * scale;
- ssef x = min(max(fscale, 0.0f), 65504.0f);
+ ssef x = min(max(load4f(f), 0.0f), 65504.0f);
-# ifdef __KERNEL_AVX2__
+# ifdef __KERNEL_AVX2__
ssei rpack = _mm_cvtps_ph(x, 0);
-# else
+# else
ssei absolute = cast(x) & 0x7FFFFFFF;
ssei Z = absolute + 0xC8000000;
ssei result = andnot(absolute < 0x38800000, Z);
ssei rshift = (result >> 13) & 0x7FFF;
ssei rpack = _mm_packs_epi32(rshift, rshift);
-# endif
+# endif
_mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack));
-# endif
+# endif
}
ccl_device_inline float half_to_float(half h)
@@ -160,8 +152,6 @@ ccl_device_inline half float_to_half(float f)
return (value_bits | sign_bit);
}
-# endif
-
#endif
CCL_NAMESPACE_END
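
With the `scale` parameter dropped from `float4_store_half()`, callers now pre-multiply before converting. A hedged sketch of the resulting call pattern (the function name and exposure factor are illustrative, not part of the patch):

    /* Illustrative only: scale a pixel, then convert it to four half floats. */
    #include "util/util_half.h"
    #include "util/util_math.h"

    void store_scaled_pixel(half *dst, float4 pixel, float exposure)
    {
      /* Any scaling happens up front; float4_store_half() just converts. */
      float4_store_half(dst, pixel * exposure);
    }
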
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index c161299acd0..35c2d436d09 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -49,6 +49,7 @@ class LogMessageVoidify {
# define LOG(severity) LOG_SUPPRESS()
# define VLOG(severity) LOG_SUPPRESS()
# define VLOG_IF(severity, condition) LOG_SUPPRESS()
+# define VLOG_IS_ON(severity) false
# define CHECK(expression) LOG_SUPPRESS()
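
The new `VLOG_IS_ON` stub keeps verbosity checks compiling when Glog is disabled. A small hypothetical sketch of the pattern it enables:

    /* Hypothetical sketch: skip building an expensive log message when verbosity 3 is off. */
    #include "util/util_logging.h"

    void log_bvh_stats(const int num_nodes, const int num_leaves)
    {
      if (VLOG_IS_ON(3)) {
        VLOG(3) << "BVH nodes: " << num_nodes << ", leaves: " << num_leaves;
      }
    }
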
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index c5996ebfcb6..6d728dde679 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -26,11 +26,9 @@
# include <cmath>
#endif
-#ifndef __KERNEL_OPENCL__
-# include <float.h>
-# include <math.h>
-# include <stdio.h>
-#endif /* __KERNEL_OPENCL__ */
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
#include "util/util_types.h"
@@ -86,7 +84,6 @@ CCL_NAMESPACE_BEGIN
/* Scalar */
#ifdef _WIN32
-# ifndef __KERNEL_OPENCL__
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b) ? a : b;
@@ -96,8 +93,7 @@ ccl_device_inline float fminf(float a, float b)
{
return (a < b) ? a : b;
}
-# endif /* !__KERNEL_OPENCL__ */
-#endif /* _WIN32 */
+#endif /* _WIN32 */
#ifndef __KERNEL_GPU__
using std::isfinite;
@@ -119,6 +115,11 @@ ccl_device_inline int min(int a, int b)
return (a < b) ? a : b;
}
+ccl_device_inline uint min(uint a, uint b)
+{
+ return (a < b) ? a : b;
+}
+
ccl_device_inline float max(float a, float b)
{
return (a > b) ? a : b;
@@ -166,7 +167,6 @@ ccl_device_inline float max4(float a, float b, float c, float d)
return max(max(a, b), max(c, d));
}
-#ifndef __KERNEL_OPENCL__
/* Int/Float conversion */
ccl_device_inline int as_int(uint i)
@@ -241,24 +241,23 @@ ccl_device_inline float __uint_as_float(uint i)
ccl_device_inline int4 __float4_as_int4(float4 f)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(f.m128));
-# else
+#else
return make_int4(
__float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-# endif
+#endif
}
ccl_device_inline float4 __int4_as_float4(int4 i)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(i.m128));
-# else
+#else
return make_float4(
__int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-# endif
+#endif
}
-#endif /* __KERNEL_OPENCL__ */
/* Versions of functions which are safe for fast math. */
ccl_device_inline bool isnan_safe(float f)
@@ -279,7 +278,6 @@ ccl_device_inline float ensure_finite(float v)
return isfinite_safe(v) ? v : 0.0f;
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
@@ -309,8 +307,6 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
return result;
}
-#endif /* __KERNEL_OPENCL__ */
-
#ifndef __KERNEL_CUDA__
ccl_device_inline float saturate(float a)
{
@@ -451,7 +447,6 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_OPENCL__
/* Interpolation */
template<class A, class B> A lerp(const A &a, const A &b, const B &t)
@@ -459,15 +454,9 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
return (A)(a * ((B)1 - t) + b * t);
}
-#endif /* __KERNEL_OPENCL__ */
-
/* Triangle */
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float triangle_area(const float3 &v1, const float3 &v2, const float3 &v3)
-#else
-ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
-#endif
{
return len(cross(v3 - v2, v1 - v2)) * 0.5f;
}
@@ -665,11 +654,7 @@ ccl_device_inline float pow22(float a)
ccl_device_inline float beta(float x, float y)
{
-#ifndef __KERNEL_OPENCL__
return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
-#else
- return expf(lgamma(x) + lgamma(y) - lgamma(x + y));
-#endif
}
ccl_device_inline float xor_signmask(float x, int y)
@@ -686,8 +671,6 @@ ccl_device_inline uint count_leading_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return __clz(x);
-#elif defined(__KERNEL_OPENCL__)
- return clz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -704,8 +687,6 @@ ccl_device_inline uint count_trailing_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return (__ffs(x) - 1);
-#elif defined(__KERNEL_OPENCL__)
- return (31 - count_leading_zeros(x & -x));
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -722,8 +703,6 @@ ccl_device_inline uint find_first_set(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return __ffs(x);
-#elif defined(__KERNEL_OPENCL__)
- return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
#else
# ifdef _MSC_VER
return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
@@ -797,6 +776,52 @@ ccl_device_inline float precise_angle(float3 a, float3 b)
return 2.0f * atan2f(len(a - b), len(a + b));
}
+/* Return the smallest power of two which is strictly greater than the given value. */
+ccl_device_inline uint next_power_of_two(uint x)
+{
+ return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x));
+}
+
+/* Return the largest power of two which is strictly lower than the given value
+ * (values below 2 are returned unchanged). */
+ccl_device_inline uint prev_power_of_two(uint x)
+{
+ return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1));
+}
+
+#ifndef __has_builtin
+# define __has_builtin(v) 0
+#endif
+
+/* Reverse the bits of a 32-bit integer. */
+ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
+{
+ /* Use a native instruction if it exists. */
+#if defined(__arm__) || defined(__aarch64__)
+ __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
+ return x;
+#elif defined(__KERNEL_CUDA__)
+ return __brev(x);
+#elif __has_builtin(__builtin_bitreverse32)
+ return __builtin_bitreverse32(x);
+#else
+ /* Flip pairwise. */
+ x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
+ /* Flip pairs. */
+ x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
+ /* Flip nibbles. */
+ x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
+ /* Flip bytes. CPUs have a dedicated instruction for this, which is fast. */
+# ifdef _MSC_VER
+ return _byteswap_ulong(x);
+# elif defined(__INTEL_COMPILER)
+ return (uint32_t)_bswap((int)x);
+# else
+ /* Assuming gcc or clang. */
+ return __builtin_bswap32(x);
+# endif
+#endif
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_H__ */
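
A short usage sketch of the new integer helpers; the values below are made up for illustration and are not part of the patch:

    /* Illustrative only. */
    #include "util/util_math.h"

    void integer_helper_examples()
    {
      const uint requested = 100;
      const uint down = prev_power_of_two(requested); /* 64 */
      const uint up = next_power_of_two(requested);   /* 128 */

      /* Mirrors the 32-bit pattern, e.g. 0x1 becomes 0x80000000. */
      const uint32_t mirrored = reverse_integer_bits(0x1u);

      (void)down;
      (void)up;
      (void)mirrored;
    }
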
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
index 17f6f3c9382..70b80c33544 100644
--- a/intern/cycles/util/util_math_float2.h
+++ b/intern/cycles/util/util_math_float2.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 operator-(const float2 &a);
ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
ccl_device_inline float2 operator*(const float2 &a, float f);
@@ -64,7 +63,6 @@ ccl_device_inline float2 fabs(const float2 &a);
ccl_device_inline float2 as_float2(const float4 &a);
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
ccl_device_inline float2 floor(const float2 &a);
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
@@ -82,7 +80,6 @@ ccl_device_inline float2 one_float2()
return make_float2(1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 operator-(const float2 &a)
{
return make_float2(-a.x, -a.y);
@@ -262,8 +259,6 @@ ccl_device_inline float2 floor(const float2 &a)
return make_float2(floorf(a.x), floorf(a.y));
}
-#endif /* !__KERNEL_OPENCL__ */
-
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 9673c043189..30a1b4c3f77 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 operator-(const float3 &a);
ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
ccl_device_inline float3 operator*(const float3 &a, const float f);
@@ -63,7 +62,6 @@ ccl_device_inline float3 rcp(const float3 &a);
ccl_device_inline float3 sqrt(const float3 &a);
ccl_device_inline float3 floor(const float3 &a);
ccl_device_inline float3 ceil(const float3 &a);
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float min3(float3 a);
ccl_device_inline float max3(float3 a);
@@ -105,50 +103,49 @@ ccl_device_inline float3 one_float3()
return make_float3(1.0f, 1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 operator-(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-# else
+#else
return make_float3(-a.x, -a.y, -a.z);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float3 &a, const float f)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
-# else
+#else
return make_float3(a.x * f, a.y * f, a.z * f);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float f, const float3 &a)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-# else
+#else
return make_float3(a.x * f, a.y * f, a.z * f);
-# endif
+#endif
}
ccl_device_inline float3 operator/(const float f, const float3 &a)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
-# else
+#else
return make_float3(f / a.x, f / a.y, f / a.z);
-# endif
+#endif
}
ccl_device_inline float3 operator/(const float3 &a, const float f)
@@ -159,11 +156,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator+(const float3 &a, const float f)
@@ -173,11 +170,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)
ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator-(const float3 &a, const float f)
@@ -187,11 +184,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)
ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
@@ -227,11 +224,11 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
ccl_device_inline bool operator==(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-# else
+#else
return (a.x == b.x && a.y == b.y && a.z == b.z);
-# endif
+#endif
}
ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
@@ -246,20 +243,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)
ccl_device_inline float dot(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-# else
+#else
return a.x * b.x + a.y * b.y + a.z * b.z;
-# endif
+#endif
}
ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-# else
+#else
return a.x * b.x + a.y * b.y;
-# endif
+#endif
}
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
@@ -270,30 +267,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)
ccl_device_inline float3 normalize(const float3 &a)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
return float3(_mm_div_ps(a.m128, norm));
-# else
+#else
return a / len(a);
-# endif
+#endif
}
ccl_device_inline float3 min(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
-# else
+#else
return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline float3 max(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
-# else
+#else
return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
@@ -303,43 +300,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
-# ifdef __KERNEL_SSE__
-# ifdef __KERNEL_NEON__
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
return float3(vabsq_f32(a.m128));
-# else
+# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
-# endif
-# else
- return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
# endif
+#else
+ return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
}
ccl_device_inline float3 sqrt(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
-# else
+#else
return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 floor(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
-# else
+#else
return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 ceil(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
-# else
+#else
return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
@@ -349,14 +346,13 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
ccl_device_inline float3 rcp(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-# else
+#else
return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
-# endif
+#endif
}
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float min3(float3 a)
{
@@ -483,11 +479,7 @@ ccl_device_inline float average(const float3 a)
ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
{
-#ifdef __KERNEL_OPENCL__
- return all(a == b);
-#else
return a == b;
-#endif
}
ccl_device_inline float3 pow3(float3 v, float e)
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 0ba2bafa2f0..19af5c8c638 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float4 operator-(const float4 &a);
ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
ccl_device_inline float4 operator*(const float4 &a, float f);
@@ -66,7 +65,6 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a);
ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_OPENCL__*/
ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
@@ -112,33 +110,32 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float4 operator-(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return float4(_mm_xor_ps(a.m128, mask));
-# else
+#else
return make_float4(-a.x, -a.y, -a.z, -a.w);
-# endif
+#endif
}
ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator*(const float4 &a, float f)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return a * make_float4(f);
-# else
+#else
return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
-# endif
+#endif
}
ccl_device_inline float4 operator*(float f, const float4 &a)
@@ -153,11 +150,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)
ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator+(const float4 &a, const float f)
@@ -167,11 +164,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)
ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator-(const float4 &a, const float f)
@@ -181,11 +178,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)
ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
@@ -215,38 +212,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-# endif
+#endif
}
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-# endif
+#endif
}
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-# endif
+#endif
}
ccl_device_inline bool operator==(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-# else
+#else
return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-# endif
+#endif
}
ccl_device_inline float distance(const float4 &a, const float4 &b)
@@ -256,16 +253,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
-# else
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
# else
- return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
# endif
+#else
+ return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+#endif
}
ccl_device_inline float len_squared(const float4 &a)
@@ -275,21 +272,21 @@ ccl_device_inline float len_squared(const float4 &a)
ccl_device_inline float4 rcp(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-# else
+#else
return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
-# endif
+#endif
}
ccl_device_inline float4 sqrt(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_sqrt_ps(a.m128));
-# else
+#else
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-# endif
+#endif
}
ccl_device_inline float4 sqr(const float4 &a)
@@ -299,39 +296,39 @@ ccl_device_inline float4 sqr(const float4 &a)
ccl_device_inline float4 cross(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
(shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-# else
+#else
return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-# endif
+#endif
}
ccl_device_inline bool is_zero(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return a == make_float4(0.0f);
-# else
+#else
return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-# endif
+#endif
}
ccl_device_inline float4 reduce_add(const float4 &a)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vaddvq_f32(a)));
-# elif defined(__KERNEL_SSE3__)
+# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
-# else
+# else
float4 h(shuffle<1, 0, 3, 2>(a) + a);
return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-# else
+# endif
+#else
float sum = (a.x + a.y) + (a.z + a.w);
return make_float4(sum, sum, sum, sum);
-# endif
+#endif
}
ccl_device_inline float average(const float4 &a)
@@ -357,20 +354,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)
ccl_device_inline float4 min(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_min_ps(a.m128, b.m128));
-# else
+#else
return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-# endif
+#endif
}
ccl_device_inline float4 max(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_max_ps(a.m128, b.m128));
-# else
+#else
return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-# endif
+#endif
}
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
@@ -380,24 +377,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
-# else
- return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
# else
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+ return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
# endif
+#else
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+#endif
}
ccl_device_inline float4 floor(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_floor_ps(a));
-# else
+#else
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-# endif
+#endif
}
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
@@ -405,8 +402,6 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
return a + t * (b - a);
}
-#endif /* !__KERNEL_OPENCL__*/
-
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h
index 0295cd51f7e..5782b878801 100644
--- a/intern/cycles/util/util_math_int2.h
+++ b/intern/cycles/util/util_math_int2.h
@@ -27,20 +27,17 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline bool operator==(const int2 a, const int2 b);
ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
-#endif /* !__KERNEL_OPENCL__ */
/*******************************************************************************
* Definition.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline bool operator==(const int2 a, const int2 b)
{
return (a.x == b.x && a.y == b.y);
@@ -70,7 +67,6 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
{
return make_int2(a.x / b.x, a.y / b.y);
}
-#endif /* !__KERNEL_OPENCL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h
index d92ed895dc2..e0dfae7c015 100644
--- a/intern/cycles/util/util_math_int3.h
+++ b/intern/cycles/util/util_math_int3.h
@@ -27,52 +27,49 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int3 min(int3 a, int3 b);
ccl_device_inline int3 max(int3 a, int3 b);
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
-#endif /* !__KERNEL_OPENCL__ */
/*******************************************************************************
* Definition.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int3 min(int3 a, int3 b)
{
-# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_min_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline int3 max(int3 a, int3 b)
{
-# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_max_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
-# else
+#else
return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-# endif
+#endif
}
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
-# else
+#else
return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-# endif
+#endif
}
ccl_device_inline bool operator==(const int3 &a, const int3 &b)
@@ -92,22 +89,21 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)
ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-# endif
+#endif
}
ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-# endif
+#endif
}
-#endif /* !__KERNEL_OPENCL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 8905c8bc7f0..c78f4615013 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -66,6 +66,7 @@ typedef struct stat path_stat_t;
static string cached_path = "";
static string cached_user_path = "";
+static string cached_temp_path = "";
static string cached_xdg_cache_path = "";
namespace {
@@ -335,10 +336,11 @@ static string path_xdg_cache_get()
}
#endif
-void path_init(const string &path, const string &user_path)
+void path_init(const string &path, const string &user_path, const string &temp_path)
{
cached_path = path;
cached_user_path = user_path;
+ cached_temp_path = temp_path;
#ifdef _MSC_VER
// workaround for https://svn.boost.org/trac/boost/ticket/6320
@@ -382,6 +384,15 @@ string path_cache_get(const string &sub)
#endif
}
+string path_temp_get(const string &sub)
+{
+ if (cached_temp_path == "") {
+ cached_temp_path = Filesystem::temp_directory_path();
+ }
+
+ return path_join(cached_temp_path, sub);
+}
+
#if defined(__linux__) || defined(__APPLE__)
string path_xdg_home_get(const string &sub = "");
#endif
@@ -739,177 +750,6 @@ bool path_remove(const string &path)
return remove(path.c_str()) == 0;
}
-struct SourceReplaceState {
- typedef map<string, string> ProcessedMapping;
- /* Base director for all relative include headers. */
- string base;
- /* Result of processed files. */
- ProcessedMapping processed_files;
- /* Set of files which are considered "precompiled" and which are replaced
- * with and empty string on a subsequent occurrence in include statement.
- */
- set<string> precompiled_headers;
-};
-
-static string path_source_replace_includes_recursive(const string &source,
- const string &source_filepath,
- SourceReplaceState *state);
-
-static string line_directive(const SourceReplaceState &state, const string &path, const int line)
-{
- string unescaped_path = path;
- /* First we make path relative. */
- if (string_startswith(unescaped_path, state.base.c_str())) {
- const string base_file = path_filename(state.base);
- const size_t base_len = state.base.length();
- unescaped_path = base_file +
- unescaped_path.substr(base_len, unescaped_path.length() - base_len);
- }
- /* Second, we replace all unsafe characters. */
- const size_t length = unescaped_path.length();
- string escaped_path = "";
- for (size_t i = 0; i < length; ++i) {
- const char ch = unescaped_path[i];
- if (strchr("\"\'\?\\", ch) != NULL) {
- escaped_path += "\\";
- }
- escaped_path += ch;
- }
- /* TODO(sergey): Check whether using std::to_string combined with several
- * concatenation operations is any faster.
- */
- return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
-}
-
-static string path_source_handle_preprocessor(const string &preprocessor_line,
- const string &source_filepath,
- const size_t line_number,
- SourceReplaceState *state)
-{
- string result = preprocessor_line;
- string token = string_strip(preprocessor_line.substr(1, preprocessor_line.size() - 1));
- if (string_startswith(token, "include")) {
- token = string_strip(token.substr(7, token.size() - 7));
- if (token[0] == '"') {
- const size_t n_start = 1;
- const size_t n_end = token.find("\"", n_start);
- const string filename = token.substr(n_start, n_end - n_start);
- const bool is_precompiled = string_endswith(token, "// PRECOMPILED");
- string filepath = path_join(state->base, filename);
- if (!path_exists(filepath)) {
- filepath = path_join(path_dirname(source_filepath), filename);
- }
- if (is_precompiled) {
- state->precompiled_headers.insert(filepath);
- }
- string text;
- if (path_read_text(filepath, text)) {
- text = path_source_replace_includes_recursive(text, filepath, state);
- /* Use line directives for better error messages. */
- result = line_directive(*state, filepath, 1) + "\n" + text + "\n" +
- line_directive(*state, source_filepath, line_number + 1);
- }
- }
- }
- return result;
-}
-
-/* Our own little c preprocessor that replaces #includes with the file
- * contents, to work around issue of OpenCL drivers not supporting
- * include paths with spaces in them.
- */
-static string path_source_replace_includes_recursive(const string &source,
- const string &source_filepath,
- SourceReplaceState *state)
-{
- /* Try to re-use processed file without spending time on replacing all
- * include directives again.
- */
- SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find(
- source_filepath);
- if (replaced_file != state->processed_files.end()) {
- if (state->precompiled_headers.find(source_filepath) != state->precompiled_headers.end()) {
- return "";
- }
- return replaced_file->second;
- }
- /* Perform full file processing. */
- string result = "";
- const size_t source_length = source.length();
- size_t index = 0;
- /* Information about where we are in the source. */
- size_t line_number = 0, column_number = 1;
- /* Currently gathered non-preprocessor token.
- * Store as start/length rather than token itself to avoid overhead of
- * memory re-allocations on each character concatenation.
- */
- size_t token_start = 0, token_length = 0;
- /* Denotes whether we're inside of preprocessor line, together with
- * preprocessor line itself.
- *
- * TODO(sergey): Investigate whether using token start/end position
- * gives measurable speedup.
- */
- bool inside_preprocessor = false;
- string preprocessor_line = "";
- /* Actual loop over the whole source. */
- while (index < source_length) {
- const char ch = source[index];
- if (ch == '\n') {
- if (inside_preprocessor) {
- result += path_source_handle_preprocessor(
- preprocessor_line, source_filepath, line_number, state);
- /* Start gathering net part of the token. */
- token_start = index;
- token_length = 0;
- }
- inside_preprocessor = false;
- preprocessor_line = "";
- column_number = 0;
- ++line_number;
- }
- else if (ch == '#' && column_number == 1 && !inside_preprocessor) {
- /* Append all possible non-preprocessor token to the result. */
- if (token_length != 0) {
- result.append(source, token_start, token_length);
- token_start = index;
- token_length = 0;
- }
- inside_preprocessor = true;
- }
- if (inside_preprocessor) {
- preprocessor_line += ch;
- }
- else {
- ++token_length;
- }
- ++index;
- ++column_number;
- }
- /* Append possible tokens which happened before special events handled
- * above.
- */
- if (token_length != 0) {
- result.append(source, token_start, token_length);
- }
- if (inside_preprocessor) {
- result += path_source_handle_preprocessor(
- preprocessor_line, source_filepath, line_number, state);
- }
- /* Store result for further reuse. */
- state->processed_files[source_filepath] = result;
- return result;
-}
-
-string path_source_replace_includes(const string &source,
- const string &path,
- const string &source_filename)
-{
- SourceReplaceState state;
- state.base = path;
- return path_source_replace_includes_recursive(source, path_join(path, source_filename), &state);
-}
-
FILE *path_fopen(const string &path, const string &mode)
{
#ifdef _WIN32
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 7a83c2135a4..f899bc2e01c 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -32,9 +32,10 @@
CCL_NAMESPACE_BEGIN
/* program paths */
-void path_init(const string &path = "", const string &user_path = "");
+void path_init(const string &path = "", const string &user_path = "", const string &tmp_path = "");
string path_get(const string &sub = "");
string path_user_get(const string &sub = "");
+string path_temp_get(const string &sub = "");
string path_cache_get(const string &sub = "");
/* path string manipulation */
@@ -65,11 +66,6 @@ bool path_read_text(const string &path, string &text);
/* File manipulation. */
bool path_remove(const string &path);
-/* source code utility */
-string path_source_replace_includes(const string &source,
- const string &path,
- const string &source_filename = "");
-
/* cache utility */
void path_cache_clear_except(const string &name, const set<string> &except);
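
A hedged sketch of the new temporary-path helper; the file name below is hypothetical. When `path_init()` was called without a temp path, `path_temp_get()` falls back to the system temporary directory:

    /* Illustrative only. */
    #include "util/util_path.h"

    string scratch_file_example()
    {
      return path_temp_get("cycles_scratch.bin");
    }
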
diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp
index 073b09f719f..5343f076e22 100644
--- a/intern/cycles/util/util_profiling.cpp
+++ b/intern/cycles/util/util_profiling.cpp
@@ -48,13 +48,7 @@ void Profiler::run()
}
if (cur_shader >= 0 && cur_shader < shader_samples.size()) {
- /* Only consider the active shader during events whose runtime significantly depends on it.
- */
- if (((cur_event >= PROFILING_SHADER_EVAL) && (cur_event <= PROFILING_SUBSURFACE)) ||
- ((cur_event >= PROFILING_CLOSURE_EVAL) &&
- (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) {
- shader_samples[cur_shader]++;
- }
+ shader_samples[cur_shader]++;
}
if (cur_object >= 0 && cur_object < object_samples.size()) {
diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h
index ceec08ed894..96bb682c50e 100644
--- a/intern/cycles/util/util_profiling.h
+++ b/intern/cycles/util/util_profiling.h
@@ -28,38 +28,30 @@ CCL_NAMESPACE_BEGIN
enum ProfilingEvent : uint32_t {
PROFILING_UNKNOWN,
PROFILING_RAY_SETUP,
- PROFILING_PATH_INTEGRATE,
- PROFILING_SCENE_INTERSECT,
- PROFILING_INDIRECT_EMISSION,
- PROFILING_VOLUME,
- PROFILING_SHADER_SETUP,
- PROFILING_SHADER_EVAL,
- PROFILING_SHADER_APPLY,
- PROFILING_AO,
- PROFILING_SUBSURFACE,
- PROFILING_CONNECT_LIGHT,
- PROFILING_SURFACE_BOUNCE,
- PROFILING_WRITE_RESULT,
-
- PROFILING_INTERSECT,
- PROFILING_INTERSECT_LOCAL,
- PROFILING_INTERSECT_SHADOW_ALL,
- PROFILING_INTERSECT_VOLUME,
- PROFILING_INTERSECT_VOLUME_ALL,
-
- PROFILING_CLOSURE_EVAL,
- PROFILING_CLOSURE_SAMPLE,
- PROFILING_CLOSURE_VOLUME_EVAL,
- PROFILING_CLOSURE_VOLUME_SAMPLE,
-
- PROFILING_DENOISING,
- PROFILING_DENOISING_CONSTRUCT_TRANSFORM,
- PROFILING_DENOISING_RECONSTRUCT,
- PROFILING_DENOISING_DIVIDE_SHADOW,
- PROFILING_DENOISING_NON_LOCAL_MEANS,
- PROFILING_DENOISING_COMBINE_HALVES,
- PROFILING_DENOISING_GET_FEATURE,
- PROFILING_DENOISING_DETECT_OUTLIERS,
+
+ PROFILING_INTERSECT_CLOSEST,
+ PROFILING_INTERSECT_SUBSURFACE,
+ PROFILING_INTERSECT_SHADOW,
+ PROFILING_INTERSECT_VOLUME_STACK,
+
+ PROFILING_SHADE_SURFACE_SETUP,
+ PROFILING_SHADE_SURFACE_EVAL,
+ PROFILING_SHADE_SURFACE_DIRECT_LIGHT,
+ PROFILING_SHADE_SURFACE_INDIRECT_LIGHT,
+ PROFILING_SHADE_SURFACE_AO,
+ PROFILING_SHADE_SURFACE_PASSES,
+
+ PROFILING_SHADE_VOLUME_SETUP,
+ PROFILING_SHADE_VOLUME_INTEGRATE,
+ PROFILING_SHADE_VOLUME_DIRECT_LIGHT,
+ PROFILING_SHADE_VOLUME_INDIRECT_LIGHT,
+
+ PROFILING_SHADE_SHADOW_SETUP,
+ PROFILING_SHADE_SHADOW_SURFACE,
+ PROFILING_SHADE_SHADOW_VOLUME,
+
+ PROFILING_SHADE_LIGHT_SETUP,
+ PROFILING_SHADE_LIGHT_EVAL,
PROFILING_NUM_EVENTS,
};
@@ -136,37 +128,51 @@ class ProfilingHelper {
state->event = event;
}
+ ~ProfilingHelper()
+ {
+ state->event = previous_event;
+ }
+
inline void set_event(ProfilingEvent event)
{
state->event = event;
}
- inline void set_shader(int shader)
+ protected:
+ ProfilingState *state;
+ uint32_t previous_event;
+};
+
+class ProfilingWithShaderHelper : public ProfilingHelper {
+ public:
+ ProfilingWithShaderHelper(ProfilingState *state, ProfilingEvent event)
+ : ProfilingHelper(state, event)
{
- state->shader = shader;
- if (state->active) {
- assert(shader < state->shader_hits.size());
- state->shader_hits[shader]++;
- }
}
- inline void set_object(int object)
+ ~ProfilingWithShaderHelper()
{
- state->object = object;
- if (state->active) {
- assert(object < state->object_hits.size());
- state->object_hits[object]++;
- }
+ state->object = -1;
+ state->shader = -1;
}
- ~ProfilingHelper()
+ inline void set_shader(int object, int shader)
{
- state->event = previous_event;
+ if (state->active) {
+ state->shader = shader;
+ state->object = object;
+
+ if (shader >= 0) {
+ assert(shader < state->shader_hits.size());
+ state->shader_hits[shader]++;
+ }
+
+ if (object >= 0) {
+ assert(object < state->object_hits.size());
+ state->object_hits[object]++;
+ }
+ }
}
-
- private:
- ProfilingState *state;
- uint32_t previous_event;
};
CCL_NAMESPACE_END
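
A minimal, hypothetical call-site sketch of the reworked RAII helpers: `ProfilingHelper` restores the previous event on destruction, while `ProfilingWithShaderHelper` additionally attributes hits to an (object, shader) pair and clears them when it goes out of scope:

    /* Illustrative only: the event and the object/shader indices are placeholders. */
    #include "util/util_profiling.h"

    void shade_surface_example(ProfilingState *profiling, const int object, const int shader)
    {
      ProfilingWithShaderHelper profiling_helper(profiling, PROFILING_SHADE_SURFACE_EVAL);
      profiling_helper.set_shader(object, shader);

      /* ... shader evaluation work; samples are attributed until scope exit. */
    }
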
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 26534a29dfe..dca8d3d0ab5 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -46,7 +46,6 @@ class Progress {
substatus = "";
sync_status = "";
sync_substatus = "";
- kernel_status = "";
update_cb = function_null;
cancel = false;
cancel_message = "";
@@ -87,7 +86,6 @@ class Progress {
substatus = "";
sync_status = "";
sync_substatus = "";
- kernel_status = "";
cancel = false;
cancel_message = "";
error = false;
@@ -316,24 +314,6 @@ class Progress {
}
}
- /* kernel status */
-
- void set_kernel_status(const string &kernel_status_)
- {
- {
- thread_scoped_lock lock(progress_mutex);
- kernel_status = kernel_status_;
- }
-
- set_update();
- }
-
- void get_kernel_status(string &kernel_status_)
- {
- thread_scoped_lock lock(progress_mutex);
- kernel_status_ = kernel_status;
- }
-
/* callback */
void set_update()
@@ -378,8 +358,6 @@ class Progress {
string sync_status;
string sync_substatus;
- string kernel_status;
-
volatile bool cancel;
string cancel_message;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 8e8caa98a1b..b4a153c329f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -61,14 +61,14 @@ static struct TrueTy {
{
return true;
}
-} True ccl_maybe_unused;
+} True ccl_attr_maybe_unused;
static struct FalseTy {
__forceinline operator bool() const
{
return false;
}
-} False ccl_maybe_unused;
+} False ccl_attr_maybe_unused;
static struct ZeroTy {
__forceinline operator float() const
@@ -79,7 +79,7 @@ static struct ZeroTy {
{
return 0;
}
-} zero ccl_maybe_unused;
+} zero ccl_attr_maybe_unused;
static struct OneTy {
__forceinline operator float() const
@@ -90,7 +90,7 @@ static struct OneTy {
{
return 1;
}
-} one ccl_maybe_unused;
+} one ccl_attr_maybe_unused;
static struct NegInfTy {
__forceinline operator float() const
@@ -101,7 +101,7 @@ static struct NegInfTy {
{
return std::numeric_limits<int>::min();
}
-} neg_inf ccl_maybe_unused;
+} neg_inf ccl_attr_maybe_unused;
static struct PosInfTy {
__forceinline operator float() const
@@ -112,10 +112,10 @@ static struct PosInfTy {
{
return std::numeric_limits<int>::max();
}
-} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
+} inf ccl_attr_maybe_unused, pos_inf ccl_attr_maybe_unused;
static struct StepTy {
-} step ccl_maybe_unused;
+} step ccl_attr_maybe_unused;
#endif
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index d809f2e06d7..7df52d462b7 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -24,9 +24,9 @@
CCL_NAMESPACE_BEGIN
-#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC)
+#if defined(CYCLES_CUBIN_CC)
# define static_assert(statement, message)
-#endif /* __KERNEL_OPENCL__ */
+#endif
#define static_assert_align(st, align) \
static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index 4dfebf14923..9c0b2ca50bb 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -17,6 +17,9 @@
#include <stdarg.h>
#include <stdio.h>
+#include <algorithm>
+#include <cctype>
+
#include "util/util_foreach.h"
#include "util/util_string.h"
#include "util/util_windows.h"
@@ -107,24 +110,26 @@ void string_split(vector<string> &tokens,
}
}
-bool string_startswith(const string &s, const char *start)
+bool string_startswith(const string_view s, const string_view start)
{
- size_t len = strlen(start);
+ const size_t len = start.size();
- if (len > s.size())
- return 0;
- else
- return strncmp(s.c_str(), start, len) == 0;
+ if (len > s.size()) {
+ return false;
+ }
+
+ return strncmp(s.c_str(), start.data(), len) == 0;
}
-bool string_endswith(const string &s, const string &end)
+bool string_endswith(const string_view s, const string_view end)
{
- size_t len = end.length();
+ const size_t len = end.size();
- if (len > s.size())
- return 0;
- else
- return s.compare(s.length() - len, len, end) == 0;
+ if (len > s.size()) {
+ return false;
+ }
+
+ return strncmp(s.c_str() + s.size() - len, end.data(), len) == 0;
}
string string_strip(const string &s)
@@ -172,6 +177,13 @@ string to_string(const char *str)
return string(str);
}
+string string_to_lower(const string &s)
+{
+ string r = s;
+ std::transform(r.begin(), r.end(), r.begin(), [](char c) { return std::tolower(c); });
+ return r;
+}
+
/* Wide char strings helpers for Windows. */
#ifdef _WIN32
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index f2272819b2f..55462cfd8b8 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -21,6 +21,11 @@
#include <string.h>
#include <string>
+/* Use string view implementation from OIIO.
+ * Ideally we would switch to `std::string_view`, but this first requires getting rid of
+ * `using namespace OIIO`, as it causes symbol collisions. */
+#include <OpenImageIO/string_view.h>
+
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -31,6 +36,8 @@ using std::string;
using std::stringstream;
using std::to_string;
+using OIIO::string_view;
+
#ifdef __GNUC__
# define PRINTF_ATTRIBUTE __attribute__((format(printf, 1, 2)))
#else
@@ -45,12 +52,13 @@ void string_split(vector<string> &tokens,
const string &separators = "\t ",
bool skip_empty_tokens = true);
void string_replace(string &haystack, const string &needle, const string &other);
-bool string_startswith(const string &s, const char *start);
-bool string_endswith(const string &s, const string &end);
+bool string_startswith(string_view s, string_view start);
+bool string_endswith(string_view s, string_view end);
string string_strip(const string &s);
string string_remove_trademark(const string &s);
string string_from_bool(const bool var);
string to_string(const char *str);
+string string_to_lower(const string &s);
/* Wide char strings are only used on Windows to deal with non-ASCII
* characters in file names and such. No reason to use such strings
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index b010881058b..be8c2fb505a 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -403,4 +403,13 @@ size_t system_physical_ram()
#endif
}
+uint64_t system_self_process_id()
+{
+#ifdef _WIN32
+ return GetCurrentProcessId();
+#else
+ return getpid();
+#endif
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index c4db8b74339..a1797e6ca44 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -65,6 +65,9 @@ size_t system_physical_ram();
/* Start a new process of the current application with the given arguments. */
bool system_call_self(const vector<string> &args);
+/* Get identifier of the currently running process. */
+uint64_t system_self_process_id();
+
CCL_NAMESPACE_END
#endif /* __UTIL_SYSTEM_H__ */
diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h
index 73e0f92d19c..8f84377ac8c 100644
--- a/intern/cycles/util/util_tbb.h
+++ b/intern/cycles/util/util_tbb.h
@@ -23,6 +23,7 @@
#include <tbb/enumerable_thread_specific.h>
#include <tbb/parallel_for.h>
+#include <tbb/parallel_for_each.h>
#include <tbb/task_arena.h>
#include <tbb/task_group.h>
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index 71bf9c65911..4de66bf5f46 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -85,8 +85,6 @@ typedef struct TextureInfo {
uint64_t data;
/* Data Type */
uint data_type;
- /* Buffer number for OpenCL. */
- uint cl_buffer;
/* Interpolation and extension type. */
uint interpolation, extension;
/* Dimensions. */
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index f79eac4cbcf..e9cd3b0b483 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -498,36 +498,12 @@ Transform transform_from_viewplane(BoundBox2D &viewplane);
#endif
-/* TODO(sergey): This is only for until we've got OpenCL 2.0
- * on all devices we consider supported. It'll be replaced with
- * generic address space.
- */
+/* TODO: This can be removed once we know that no devices will require explicit
+ * address space qualifiers for this case. */
-#ifdef __KERNEL_OPENCL__
-
-# define OPENCL_TRANSFORM_ADDRSPACE_GLUE(a, b) a##b
-# define OPENCL_TRANSFORM_ADDRSPACE_DECLARE(function) \
- ccl_device_inline float3 OPENCL_TRANSFORM_ADDRSPACE_GLUE(function, _addrspace)( \
- ccl_addr_space const Transform *t, const float3 a) \
- { \
- Transform private_tfm = *t; \
- return function(&private_tfm, a); \
- }
-
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_point)
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction)
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed)
-
-# undef OPENCL_TRANSFORM_ADDRSPACE_DECLARE
-# undef OPENCL_TRANSFORM_ADDRSPACE_GLUE
-# define transform_point_auto transform_point_addrspace
-# define transform_direction_auto transform_direction_addrspace
-# define transform_direction_transposed_auto transform_direction_transposed_addrspace
-#else
-# define transform_point_auto transform_point
-# define transform_direction_auto transform_direction
-# define transform_direction_transposed_auto transform_direction_transposed
-#endif
+#define transform_point_auto transform_point
+#define transform_direction_auto transform_direction
+#define transform_direction_transposed_auto transform_direction_transposed
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 87358877e3c..442c32b3a3d 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -17,9 +17,7 @@
#ifndef __UTIL_TYPES_H__
#define __UTIL_TYPES_H__
-#ifndef __KERNEL_OPENCL__
-# include <stdlib.h>
-#endif
+#include <stdlib.h>
/* Standard Integer Types */
@@ -44,18 +42,12 @@ CCL_NAMESPACE_BEGIN
/* Shorter Unsigned Names */
-#ifndef __KERNEL_OPENCL__
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned short ushort;
-#endif
/* Fixed Bits Types */
-#ifdef __KERNEL_OPENCL__
-typedef unsigned long uint64_t;
-#endif
-
#ifndef __KERNEL_GPU__
/* Generic Memory Pointer */
diff --git a/intern/cycles/util/util_unique_ptr.h b/intern/cycles/util/util_unique_ptr.h
index 3aaaf083eff..3181eafd43d 100644
--- a/intern/cycles/util/util_unique_ptr.h
+++ b/intern/cycles/util/util_unique_ptr.h
@@ -21,6 +21,7 @@
CCL_NAMESPACE_BEGIN
+using std::make_unique;
using std::unique_ptr;
CCL_NAMESPACE_END
diff --git a/release/scripts/modules/rna_manual_reference.py b/release/scripts/modules/rna_manual_reference.py
index 0e3cb7e3cab..40f59307bec 100644
--- a/release/scripts/modules/rna_manual_reference.py
+++ b/release/scripts/modules/rna_manual_reference.py
@@ -209,7 +209,6 @@ url_manual_mapping = (
("bpy.types.toolsettings.use_proportional_connected*", "editors/3dview/controls/proportional_editing.html#bpy-types-toolsettings-use-proportional-connected"),
("bpy.types.toolsettings.use_proportional_projected*", "editors/3dview/controls/proportional_editing.html#bpy-types-toolsettings-use-proportional-projected"),
("bpy.types.view3doverlay.vertex_paint_mode_opacity*", "editors/3dview/display/overlays.html#bpy-types-view3doverlay-vertex-paint-mode-opacity"),
- ("bpy.types.viewlayer.use_pass_cryptomatte_accurate*", "render/layers/passes.html#bpy-types-viewlayer-use-pass-cryptomatte-accurate"),
("bpy.types.viewlayer.use_pass_cryptomatte_material*", "render/layers/passes.html#bpy-types-viewlayer-use-pass-cryptomatte-material"),
("bpy.ops.gpencil.vertex_color_brightness_contrast*", "grease_pencil/modes/vertex_paint/editing.html#bpy-ops-gpencil-vertex-color-brightness-contrast"),
("bpy.ops.view3d.edit_mesh_extrude_individual_move*", "modeling/meshes/editing/face/extrude_faces.html#bpy-ops-view3d-edit-mesh-extrude-individual-move"),
@@ -573,7 +572,6 @@ url_manual_mapping = (
("bpy.types.rendersettings.film_transparent*", "render/cycles/render_settings/film.html#bpy-types-rendersettings-film-transparent"),
("bpy.types.rendersettings.simplify_volumes*", "render/cycles/render_settings/simplify.html#bpy-types-rendersettings-simplify-volumes"),
("bpy.types.rendersettings.use_render_cache*", "render/output/properties/output.html#bpy-types-rendersettings-use-render-cache"),
- ("bpy.types.rendersettings.use_save_buffers*", "render/cycles/render_settings/performance.html#bpy-types-rendersettings-use-save-buffers"),
("bpy.types.rendersettings.use_single_layer*", "render/layers/view_layer.html#bpy-types-rendersettings-use-single-layer"),
("bpy.types.sceneeevee.use_taa_reprojection*", "render/eevee/render_settings/sampling.html#bpy-types-sceneeevee-use-taa-reprojection"),
("bpy.types.sequenceeditor.use_overlay_lock*", "video_editing/preview/sidebar.html#bpy-types-sequenceeditor-use-overlay-lock"),
diff --git a/release/scripts/presets/cycles/sampling/Final.py b/release/scripts/presets/cycles/sampling/Final.py
index f1222d927c1..f3626c4b778 100644
--- a/release/scripts/presets/cycles/sampling/Final.py
+++ b/release/scripts/presets/cycles/sampling/Final.py
@@ -1,18 +1,12 @@
import bpy
cycles = bpy.context.scene.cycles
-# Path Trace
-cycles.samples = 512
-cycles.preview_samples = 128
-
-# Branched Path Trace
-cycles.aa_samples = 128
-cycles.preview_aa_samples = 32
-
-cycles.diffuse_samples = 4
-cycles.glossy_samples = 4
-cycles.transmission_samples = 4
-cycles.ao_samples = 1
-cycles.mesh_light_samples = 4
-cycles.subsurface_samples = 4
-cycles.volume_samples = 4
+cycles.use_adaptive_sampling = True
+cycles.adaptive_threshold = 0.01
+cycles.samples = 4096
+cycles.adaptive_min_samples = 0
+cycles.time_limit = 0.0
+cycles.use_denoising = True
+cycles.denoiser = 'OPENIMAGEDENOISE'
+cycles.denoising_input_passes = 'RGB_ALBEDO_NORMAL'
+cycles.denoising_prefilter = 'ACCURATE'
diff --git a/release/scripts/presets/cycles/sampling/Preview.py b/release/scripts/presets/cycles/sampling/Preview.py
index c16449e2c8f..66aa9339063 100644
--- a/release/scripts/presets/cycles/sampling/Preview.py
+++ b/release/scripts/presets/cycles/sampling/Preview.py
@@ -1,18 +1,12 @@
import bpy
cycles = bpy.context.scene.cycles
-# Path Trace
-cycles.samples = 128
-cycles.preview_samples = 32
-
-# Branched Path Trace
-cycles.aa_samples = 32
-cycles.preview_aa_samples = 4
-
-cycles.diffuse_samples = 4
-cycles.glossy_samples = 4
-cycles.transmission_samples = 4
-cycles.ao_samples = 1
-cycles.mesh_light_samples = 4
-cycles.subsurface_samples = 4
-cycles.volume_samples = 4
+cycles.use_adaptive_sampling = True
+cycles.adaptive_threshold = 0.1
+cycles.samples = 1024
+cycles.adaptive_min_samples = 0
+cycles.time_limit = 0.0
+cycles.use_denoising = True
+cycles.denoiser = 'OPENIMAGEDENOISE'
+cycles.denoising_input_passes = 'RGB_ALBEDO_NORMAL'
+cycles.denoising_prefilter = 'ACCURATE'
diff --git a/release/scripts/presets/cycles/viewport_sampling/Final.py b/release/scripts/presets/cycles/viewport_sampling/Final.py
new file mode 100644
index 00000000000..b2cb6bfe90a
--- /dev/null
+++ b/release/scripts/presets/cycles/viewport_sampling/Final.py
@@ -0,0 +1,11 @@
+import bpy
+cycles = bpy.context.scene.cycles
+
+cycles.use_preview_adaptive_sampling = True
+cycles.preview_adaptive_threshold = 0.01
+cycles.preview_samples = 4096
+cycles.preview_adaptive_min_samples = 0
+cycles.use_preview_denoising = True
+cycles.preview_denoiser = 'OPENIMAGEDENOISE'
+cycles.preview_denoising_input_passes = 'RGB_ALBEDO_NORMAL'
+cycles.preview_denoising_prefilter = 'ACCURATE'
diff --git a/release/scripts/presets/cycles/viewport_sampling/Preview.py b/release/scripts/presets/cycles/viewport_sampling/Preview.py
new file mode 100644
index 00000000000..f8319b70d4a
--- /dev/null
+++ b/release/scripts/presets/cycles/viewport_sampling/Preview.py
@@ -0,0 +1,11 @@
+import bpy
+cycles = bpy.context.scene.cycles
+
+cycles.use_preview_adaptive_sampling = True
+cycles.preview_adaptive_threshold = 0.1
+cycles.preview_samples = 1024
+cycles.preview_adaptive_min_samples = 0
+cycles.use_preview_denoising = False
+cycles.preview_denoiser = 'AUTO'
+cycles.preview_denoising_input_passes = 'RGB_ALBEDO'
+cycles.preview_denoising_prefilter = 'FAST'
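Note: the four presets above replace the old per-closure sample counts with adaptive sampling and denoising settings. Scripts can set the same properties directly; a minimal sketch using the property names from the presets in this patch (the numeric values here are illustrative, not canonical):

    import bpy

    cycles = bpy.context.scene.cycles

    # Final-frame sampling: adaptive sampling with a denoised result.
    cycles.use_adaptive_sampling = True
    cycles.adaptive_threshold = 0.01
    cycles.samples = 2048  # illustrative value
    cycles.use_denoising = True
    cycles.denoiser = 'OPENIMAGEDENOISE'

    # Viewport sampling: looser threshold, denoising left off.
    cycles.use_preview_adaptive_sampling = True
    cycles.preview_adaptive_threshold = 0.1
    cycles.preview_samples = 1024
    cycles.use_preview_denoising = False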
diff --git a/release/scripts/startup/bl_ui/properties_view_layer.py b/release/scripts/startup/bl_ui/properties_view_layer.py
index ad7d6008238..6b130d7353d 100644
--- a/release/scripts/startup/bl_ui/properties_view_layer.py
+++ b/release/scripts/startup/bl_ui/properties_view_layer.py
@@ -192,8 +192,6 @@ class ViewLayerCryptomattePanel(ViewLayerButtonsPanel, Panel):
view_layer.use_pass_cryptomatte_material,
view_layer.use_pass_cryptomatte_asset))
col.prop(view_layer, "pass_cryptomatte_depth", text="Levels")
- col.prop(view_layer, "use_pass_cryptomatte_accurate",
- text="Accurate Mode")
class VIEWLAYER_PT_layer_passes_cryptomatte(ViewLayerCryptomattePanel, Panel):
diff --git a/source/blender/blenkernel/BKE_blender_version.h b/source/blender/blenkernel/BKE_blender_version.h
index b3ee2f411d7..1f25106404a 100644
--- a/source/blender/blenkernel/BKE_blender_version.h
+++ b/source/blender/blenkernel/BKE_blender_version.h
@@ -39,7 +39,7 @@ extern "C" {
/* Blender file format version. */
#define BLENDER_FILE_VERSION BLENDER_VERSION
-#define BLENDER_FILE_SUBVERSION 24
+#define BLENDER_FILE_SUBVERSION 25
/* Minimum Blender version that supports reading file written with the current
* version. Older Blender versions will test this and show a warning if the file
diff --git a/source/blender/blenkernel/intern/layer.c b/source/blender/blenkernel/intern/layer.c
index b489675cd74..434a2296d95 100644
--- a/source/blender/blenkernel/intern/layer.c
+++ b/source/blender/blenkernel/intern/layer.c
@@ -183,7 +183,6 @@ static ViewLayer *view_layer_add(const char *name)
view_layer->passflag = SCE_PASS_COMBINED;
view_layer->pass_alpha_threshold = 0.5f;
view_layer->cryptomatte_levels = 6;
- view_layer->cryptomatte_flag = VIEW_LAYER_CRYPTOMATTE_ACCURATE;
BKE_freestyle_config_init(&view_layer->freestyle_config);
return view_layer;
diff --git a/source/blender/blenloader/intern/versioning_270.c b/source/blender/blenloader/intern/versioning_270.c
index fa15e541e43..54d1efab7dd 100644
--- a/source/blender/blenloader/intern/versioning_270.c
+++ b/source/blender/blenloader/intern/versioning_270.c
@@ -651,13 +651,6 @@ void blo_do_versions_270(FileData *fd, Library *UNUSED(lib), Main *bmain)
mat->line_col[3] = mat->alpha;
}
}
-
- if (!DNA_struct_elem_find(fd->filesdna, "RenderData", "int", "preview_start_resolution")) {
- Scene *scene;
- for (scene = bmain->scenes.first; scene; scene = scene->id.next) {
- scene->r.preview_start_resolution = 64;
- }
- }
}
if (!MAIN_VERSION_ATLEAST(bmain, 271, 3)) {
@@ -698,15 +691,6 @@ void blo_do_versions_270(FileData *fd, Library *UNUSED(lib), Main *bmain)
}
}
- if (!MAIN_VERSION_ATLEAST(bmain, 272, 0)) {
- if (!DNA_struct_elem_find(fd->filesdna, "RenderData", "int", "preview_start_resolution")) {
- Scene *scene;
- for (scene = bmain->scenes.first; scene; scene = scene->id.next) {
- scene->r.preview_start_resolution = 64;
- }
- }
- }
-
if (!MAIN_VERSION_ATLEAST(bmain, 272, 1)) {
Brush *br;
for (br = bmain->brushes.first; br; br = br->id.next) {
diff --git a/source/blender/blenloader/intern/versioning_280.c b/source/blender/blenloader/intern/versioning_280.c
index 9f2c090c242..69b67460a5d 100644
--- a/source/blender/blenloader/intern/versioning_280.c
+++ b/source/blender/blenloader/intern/versioning_280.c
@@ -3718,7 +3718,7 @@ void blo_do_versions_280(FileData *fd, Library *UNUSED(lib), Main *bmain)
STRNCPY(node->idname, "ShaderNodeOutputLight");
}
if (node->type == SH_NODE_BSDF_PRINCIPLED && node->custom2 == 0) {
- node->custom2 = SHD_SUBSURFACE_BURLEY;
+ node->custom2 = SHD_SUBSURFACE_DIFFUSION;
}
}
}
diff --git a/source/blender/blenloader/intern/versioning_290.c b/source/blender/blenloader/intern/versioning_290.c
index bafba486c88..be8c4b735be 100644
--- a/source/blender/blenloader/intern/versioning_290.c
+++ b/source/blender/blenloader/intern/versioning_290.c
@@ -1461,7 +1461,6 @@ void blo_do_versions_290(FileData *fd, Library *UNUSED(lib), Main *bmain)
LISTBASE_FOREACH (Scene *, scene, &bmain->scenes) {
LISTBASE_FOREACH (ViewLayer *, view_layer, &scene->view_layers) {
view_layer->cryptomatte_levels = 6;
- view_layer->cryptomatte_flag = VIEW_LAYER_CRYPTOMATTE_ACCURATE;
}
}
}
diff --git a/source/blender/blenloader/intern/versioning_300.c b/source/blender/blenloader/intern/versioning_300.c
index 1a19bbbee5c..4dc6a0ecea6 100644
--- a/source/blender/blenloader/intern/versioning_300.c
+++ b/source/blender/blenloader/intern/versioning_300.c
@@ -784,6 +784,20 @@ static bool seq_transform_origin_set(Sequence *seq, void *UNUSED(user_data))
return true;
}
+static void do_version_subsurface_methods(bNode *node)
+{
+ if (node->type == SH_NODE_SUBSURFACE_SCATTERING) {
+ if (node->custom1 != SHD_SUBSURFACE_RANDOM_WALK) {
+ node->custom1 = SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS;
+ }
+ }
+ else if (node->type == SH_NODE_BSDF_PRINCIPLED) {
+ if (node->custom2 != SHD_SUBSURFACE_RANDOM_WALK) {
+ node->custom2 = SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS;
+ }
+ }
+}
+
/* NOLINTNEXTLINE: readability-function-size */
void blo_do_versions_300(FileData *fd, Library *UNUSED(lib), Main *bmain)
{
@@ -1336,6 +1350,25 @@ void blo_do_versions_300(FileData *fd, Library *UNUSED(lib), Main *bmain)
}
}
+ if (!MAIN_VERSION_ATLEAST(bmain, 300, 25)) {
+ FOREACH_NODETREE_BEGIN (bmain, ntree, id) {
+ if (ntree->type == NTREE_SHADER) {
+ LISTBASE_FOREACH (bNode *, node, &ntree->nodes) {
+ do_version_subsurface_methods(node);
+ }
+ }
+ }
+ FOREACH_NODETREE_END;
+
+ enum {
+ R_EXR_TILE_FILE = (1 << 10),
+ R_FULL_SAMPLE = (1 << 15),
+ };
+ LISTBASE_FOREACH (Scene *, scene, &bmain->scenes) {
+ scene->r.scemode &= ~(R_EXR_TILE_FILE | R_FULL_SAMPLE);
+ }
+ }
+
/**
* Versioning code until next subversion bump goes here.
*
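Note: the do_version_subsurface_methods() hunk above remaps existing shader nodes so that every old falloff setting other than the new Random Walk value becomes Random Walk (Fixed Radius), which keeps the old behavior of using the radius as specified. A hedged script-side sketch of the new enum identifiers; it assumes the enum is still exposed on the Principled BSDF node as `subsurface_method`, which is not confirmed by this excerpt:

    import bpy

    # Hypothetical add-on snippet using the new identifiers from this patch.
    mat = bpy.data.materials.new("SSS Demo")
    mat.use_nodes = True
    principled = mat.node_tree.nodes["Principled BSDF"]
    # Matches the pre-3.0 Random Walk behavior (radius as specified).
    principled.subsurface_method = 'RANDOM_WALK_FIXED_RADIUS'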
diff --git a/source/blender/blenloader/intern/versioning_cycles.c b/source/blender/blenloader/intern/versioning_cycles.c
index 90e6b43f02e..da57f27af4e 100644
--- a/source/blender/blenloader/intern/versioning_cycles.c
+++ b/source/blender/blenloader/intern/versioning_cycles.c
@@ -182,8 +182,8 @@ static void displacement_principled_nodes(bNode *node)
}
}
else if (node->type == SH_NODE_BSDF_PRINCIPLED) {
- if (node->custom2 != SHD_SUBSURFACE_RANDOM_WALK) {
- node->custom2 = SHD_SUBSURFACE_BURLEY;
+ if (node->custom2 != SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS) {
+ node->custom2 = SHD_SUBSURFACE_DIFFUSION;
}
}
}
@@ -1373,6 +1373,11 @@ void blo_do_versions_cycles(FileData *UNUSED(fd), Library *UNUSED(lib), Main *bm
void do_versions_after_linking_cycles(Main *bmain)
{
+ const int DENOISER_AUTO = 0;
+ const int DENOISER_NLM = 1;
+ const int DENOISER_OPTIX = 2;
+ const int DENOISER_OPENIMAGEDENOISE = 4;
+
if (!MAIN_VERSION_ATLEAST(bmain, 280, 66)) {
/* Shader node tree changes. After lib linking so we have all the typeinfo
* pointers and updated sockets and we can use the high level node API to
@@ -1578,10 +1583,6 @@ void do_versions_after_linking_cycles(Main *bmain)
}
if (cscene) {
- const int DENOISER_AUTO = 0;
- const int DENOISER_NLM = 1;
- const int DENOISER_OPTIX = 2;
-
/* Enable denoiser if it was enabled for one view layer before. */
cycles_property_int_set(cscene, "denoiser", (use_optix) ? DENOISER_OPTIX : DENOISER_NLM);
cycles_property_boolean_set(cscene, "use_denoising", use_denoising);
@@ -1637,4 +1638,17 @@ void do_versions_after_linking_cycles(Main *bmain)
object->visibility_flag |= flag;
}
}
+
+ if (!MAIN_VERSION_ATLEAST(bmain, 300, 25)) {
+ /* Removal of NLM denoiser. */
+ for (Scene *scene = bmain->scenes.first; scene; scene = scene->id.next) {
+ IDProperty *cscene = cycles_properties_from_ID(&scene->id);
+
+ if (cscene) {
+ if (cycles_property_int(cscene, "denoiser", DENOISER_NLM) == DENOISER_NLM) {
+ cycles_property_int_set(cscene, "denoiser", DENOISER_OPENIMAGEDENOISE);
+ }
+ }
+ }
+ }
}
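Note: the hunk above handles the removal of the NLM denoiser by switching scenes that had it selected to OpenImageDenoise. Scripts that previously set the denoiser to 'NLM' need the same treatment; a small sketch using the enum identifiers visible in this patch:

    import bpy

    scene = bpy.context.scene

    # 'NLM' is gone; pick one of the remaining denoisers instead.
    scene.cycles.use_denoising = True
    scene.cycles.denoiser = 'OPENIMAGEDENOISE'  # or 'OPTIX' on supported NVIDIA GPUs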
diff --git a/source/blender/blenloader/intern/versioning_defaults.c b/source/blender/blenloader/intern/versioning_defaults.c
index f2d5896be03..152ef79a38f 100644
--- a/source/blender/blenloader/intern/versioning_defaults.c
+++ b/source/blender/blenloader/intern/versioning_defaults.c
@@ -54,6 +54,7 @@
#include "BKE_curveprofile.h"
#include "BKE_customdata.h"
#include "BKE_gpencil.h"
+#include "BKE_idprop.h"
#include "BKE_layer.h"
#include "BKE_lib_id.h"
#include "BKE_main.h"
@@ -356,6 +357,12 @@ static void blo_update_defaults_scene(Main *bmain, Scene *scene)
if (ts->custom_bevel_profile_preset == NULL) {
ts->custom_bevel_profile_preset = BKE_curveprofile_add(PROF_PRESET_LINE);
}
+
+ /* Clear ID properties so Cycles gets defaults. */
+ IDProperty *idprop = IDP_GetProperties(&scene->id, false);
+ if (idprop) {
+ IDP_ClearProperty(idprop);
+ }
}
/**
@@ -582,6 +589,10 @@ void BLO_update_defaults_startup_blend(Main *bmain, const char *app_template)
bNodeSocket *roughness_socket = nodeFindSocket(node, SOCK_IN, "Roughness");
bNodeSocketValueFloat *roughness_data = roughness_socket->default_value;
roughness_data->value = 0.4f;
+ node->custom2 = SHD_SUBSURFACE_RANDOM_WALK;
+ }
+ else if (node->type == SH_NODE_SUBSURFACE_SCATTERING) {
+ node->custom1 = SHD_SUBSURFACE_RANDOM_WALK;
}
}
}
diff --git a/source/blender/compositor/nodes/COM_IDMaskNode.cc b/source/blender/compositor/nodes/COM_IDMaskNode.cc
index b51e79f2dea..761cb8b98cf 100644
--- a/source/blender/compositor/nodes/COM_IDMaskNode.cc
+++ b/source/blender/compositor/nodes/COM_IDMaskNode.cc
@@ -28,7 +28,7 @@ IDMaskNode::IDMaskNode(bNode *editorNode) : Node(editorNode)
/* pass */
}
void IDMaskNode::convertToOperations(NodeConverter &converter,
- const CompositorContext &context) const
+ const CompositorContext & /*context*/) const
{
bNode *bnode = this->getbNode();
@@ -38,7 +38,7 @@ void IDMaskNode::convertToOperations(NodeConverter &converter,
converter.addOperation(operation);
converter.mapInputSocket(getInputSocket(0), operation->getInputSocket(0));
- if (bnode->custom2 == 0 || context.getRenderData()->scemode & R_FULL_SAMPLE) {
+ if (bnode->custom2 == 0) {
converter.mapOutputSocket(getOutputSocket(0), operation->getOutputSocket(0));
}
else {
diff --git a/source/blender/compositor/nodes/COM_ZCombineNode.cc b/source/blender/compositor/nodes/COM_ZCombineNode.cc
index ddf66740578..e29748dc317 100644
--- a/source/blender/compositor/nodes/COM_ZCombineNode.cc
+++ b/source/blender/compositor/nodes/COM_ZCombineNode.cc
@@ -31,9 +31,9 @@
namespace blender::compositor {
void ZCombineNode::convertToOperations(NodeConverter &converter,
- const CompositorContext &context) const
+ const CompositorContext & /*context*/) const
{
- if ((context.getRenderData()->scemode & R_FULL_SAMPLE) || this->getbNode()->custom2) {
+ if (this->getbNode()->custom2) {
ZCombineOperation *operation = nullptr;
if (this->getbNode()->custom1) {
operation = new ZCombineAlphaOperation();
diff --git a/source/blender/draw/DRW_engine.h b/source/blender/draw/DRW_engine.h
index a125a13eaf9..2e25211ea62 100644
--- a/source/blender/draw/DRW_engine.h
+++ b/source/blender/draw/DRW_engine.h
@@ -176,6 +176,9 @@ void DRW_deferred_shader_remove(struct GPUMaterial *mat);
struct DrawDataList *DRW_drawdatalist_from_id(struct ID *id);
void DRW_drawdata_free(struct ID *id);
+bool DRW_opengl_context_release(void);
+void DRW_opengl_context_activate(bool test);
+
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/draw/engines/eevee/eevee_cryptomatte.c b/source/blender/draw/engines/eevee/eevee_cryptomatte.c
index 76a1b561972..49780abc6f4 100644
--- a/source/blender/draw/engines/eevee/eevee_cryptomatte.c
+++ b/source/blender/draw/engines/eevee/eevee_cryptomatte.c
@@ -139,8 +139,6 @@ void EEVEE_cryptomatte_renderpasses_init(EEVEE_Data *vedata)
g_data->cryptomatte_session = session;
g_data->render_passes |= EEVEE_RENDER_PASS_CRYPTOMATTE | EEVEE_RENDER_PASS_VOLUME_LIGHT;
- g_data->cryptomatte_accurate_mode = (view_layer->cryptomatte_flag &
- VIEW_LAYER_CRYPTOMATTE_ACCURATE) != 0;
}
}
@@ -405,7 +403,6 @@ void EEVEE_cryptomatte_output_accumulate(EEVEE_ViewLayerData *UNUSED(sldata), EE
{
EEVEE_FramebufferList *fbl = vedata->fbl;
EEVEE_StorageList *stl = vedata->stl;
- EEVEE_PrivateData *g_data = stl->g_data;
EEVEE_EffectsInfo *effects = stl->effects;
EEVEE_PassList *psl = vedata->psl;
const DRWContextState *draw_ctx = DRW_context_state_get();
@@ -413,10 +410,9 @@ void EEVEE_cryptomatte_output_accumulate(EEVEE_ViewLayerData *UNUSED(sldata), EE
const int cryptomatte_levels = view_layer->cryptomatte_levels;
const int current_sample = effects->taa_current_sample;
- /* In accurate mode all render samples are evaluated. In inaccurate mode this is limited to the
- * number of cryptomatte levels. This will reduce the overhead of downloading the GPU buffer and
- * integrating it into the accum buffer. */
- if (g_data->cryptomatte_accurate_mode || current_sample < cryptomatte_levels) {
+ /* Render samples used by cryptomatte are limited to the number of cryptomatte levels. This will
+ * reduce the overhead of downloading the GPU buffer and integrating it into the accum buffer. */
+ if (current_sample < cryptomatte_levels) {
static float clear_color[4] = {0.0};
GPU_framebuffer_bind(fbl->cryptomatte_fb);
GPU_framebuffer_clear_color(fbl->cryptomatte_fb, clear_color);
diff --git a/source/blender/draw/engines/eevee/eevee_engine.c b/source/blender/draw/engines/eevee/eevee_engine.c
index 6a66e8b1a58..f8e1cc9c923 100644
--- a/source/blender/draw/engines/eevee/eevee_engine.c
+++ b/source/blender/draw/engines/eevee/eevee_engine.c
@@ -648,6 +648,8 @@ RenderEngineType DRW_engine_viewport_eevee_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&EEVEE_render_update_passes,
&draw_engine_eevee_type,
{NULL, NULL, NULL},
diff --git a/source/blender/draw/engines/eevee/eevee_private.h b/source/blender/draw/engines/eevee/eevee_private.h
index f51b4fa0127..eae5d161cc3 100644
--- a/source/blender/draw/engines/eevee/eevee_private.h
+++ b/source/blender/draw/engines/eevee/eevee_private.h
@@ -1042,7 +1042,6 @@ typedef struct EEVEE_PrivateData {
int aov_hash;
int num_aovs_used;
struct CryptomatteSession *cryptomatte_session;
- bool cryptomatte_accurate_mode;
EEVEE_CryptomatteSample *cryptomatte_accum_buffer;
float *cryptomatte_download_buffer;
diff --git a/source/blender/draw/engines/external/external_engine.c b/source/blender/draw/engines/external/external_engine.c
index 89ee3f1b293..cc548a53a8e 100644
--- a/source/blender/draw/engines/external/external_engine.c
+++ b/source/blender/draw/engines/external/external_engine.c
@@ -32,13 +32,19 @@
#include "BKE_object.h"
#include "BKE_particle.h"
+#include "ED_image.h"
#include "ED_screen.h"
+#include "GPU_batch.h"
+#include "GPU_debug.h"
#include "GPU_matrix.h"
#include "GPU_shader.h"
#include "GPU_state.h"
#include "GPU_viewport.h"
+#include "RE_engine.h"
+#include "RE_pipeline.h"
+
#include "external_engine.h" /* own include */
/* Shaders */
@@ -137,6 +143,22 @@ static void external_engine_init(void *vedata)
}
}
+/* Add a shading group call which will take care of writing to the depth buffer, so that the
+ * alpha-under overlay is applied to the render buffer. */
+static void external_cache_image_add(DRWShadingGroup *grp)
+{
+ float obmat[4][4];
+ unit_m4(obmat);
+ scale_m4_fl(obmat, 0.5f);
+
+ /* NOTE: Use the same Z-depth value as in the regular image drawing engine. */
+ translate_m4(obmat, 1.0f, 1.0f, 0.75f);
+
+ GPUBatch *geom = DRW_cache_quad_get();
+
+ DRW_shgroup_call_obmat(grp, geom, obmat);
+}
+
static void external_cache_init(void *vedata)
{
EXTERNAL_PassList *psl = ((EXTERNAL_Data *)vedata)->psl;
@@ -162,14 +184,33 @@ static void external_cache_init(void *vedata)
stl->g_data->depth_shgrp = DRW_shgroup_create(e_data.depth_sh, psl->depth_pass);
}
- /* Do not draw depth pass when overlays are turned off. */
- stl->g_data->need_depth = (v3d->flag2 & V3D_HIDE_OVERLAYS) == 0;
+ if (v3d != NULL) {
+ /* Do not draw depth pass when overlays are turned off. */
+ stl->g_data->need_depth = (v3d->flag2 & V3D_HIDE_OVERLAYS) == 0;
+ }
+ else if (draw_ctx->space_data != NULL) {
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type == SPACE_IMAGE) {
+ external_cache_image_add(stl->g_data->depth_shgrp);
+
+ stl->g_data->need_depth = true;
+ stl->g_data->update_depth = true;
+ }
+ }
}
static void external_cache_populate(void *vedata, Object *ob)
{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
EXTERNAL_StorageList *stl = ((EXTERNAL_Data *)vedata)->stl;
+ if (draw_ctx->space_data != NULL) {
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type == SPACE_IMAGE) {
+ return;
+ }
+ }
+
if (!(DRW_object_is_renderable(ob) &&
DRW_object_visibility_in_active_context(ob) & OB_VISIBLE_SELF)) {
return;
@@ -210,13 +251,11 @@ static void external_cache_finish(void *UNUSED(vedata))
{
}
-static void external_draw_scene_do(void *vedata)
+static void external_draw_scene_do_v3d(void *vedata)
{
const DRWContextState *draw_ctx = DRW_context_state_get();
- Scene *scene = draw_ctx->scene;
RegionView3D *rv3d = draw_ctx->rv3d;
ARegion *region = draw_ctx->region;
- const RenderEngineType *type;
DRW_state_reset_ex(DRW_STATE_DEFAULT & ~DRW_STATE_DEPTH_LESS_EQUAL);
@@ -229,8 +268,6 @@ static void external_draw_scene_do(void *vedata)
}
RenderEngine *engine = RE_engine_create(engine_type);
- engine->tile_x = scene->r.tilex;
- engine->tile_y = scene->r.tiley;
engine_type->view_update(engine, draw_ctx->evil_C, draw_ctx->depsgraph);
rv3d->render_engine = engine;
}
@@ -241,7 +278,7 @@ static void external_draw_scene_do(void *vedata)
ED_region_pixelspace(region);
/* Render result draw. */
- type = rv3d->render_engine->type;
+ const RenderEngineType *type = rv3d->render_engine->type;
type->view_draw(rv3d->render_engine, draw_ctx->evil_C, draw_ctx->depsgraph);
GPU_bgl_end();
@@ -259,6 +296,116 @@ static void external_draw_scene_do(void *vedata)
}
}
+/* Configure current matrix stack so that the external engine can use the same drawing code for
+ * both viewport and image editor drawing.
+ *
+ * The engine draws its result in pixel space and applies a render offset. For the image editor
+ * we need to switch from normalized space to pixel space, and "un-apply" the offset. */
+static void external_image_space_matrix_set(const RenderEngine *engine)
+{
+ BLI_assert(engine != NULL);
+
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+ const DRWView *view = DRW_view_get_active();
+ struct SpaceImage *space_image = (struct SpaceImage *)draw_ctx->space_data;
+
+ /* Apply current view as transformation matrix.
+ * This will configure drawing for normalized space with current zoom and pan applied. */
+
+ float view_matrix[4][4];
+ DRW_view_viewmat_get(view, view_matrix, false);
+
+ float projection_matrix[4][4];
+ DRW_view_winmat_get(view, projection_matrix, false);
+
+ GPU_matrix_projection_set(projection_matrix);
+ GPU_matrix_set(view_matrix);
+
+ /* Switch from normalized space to pixel space. */
+ {
+ int width, height;
+ ED_space_image_get_size(space_image, &width, &height);
+
+ const float width_inv = width ? 1.0f / width : 0.0f;
+ const float height_inv = height ? 1.0f / height : 0.0f;
+ GPU_matrix_scale_2f(width_inv, height_inv);
+ }
+
+ /* Un-apply render offset. */
+ {
+ Render *render = engine->re;
+ rctf view_rect;
+ rcti render_rect;
+ RE_GetViewPlane(render, &view_rect, &render_rect);
+
+ GPU_matrix_translate_2f(-render_rect.xmin, -render_rect.ymin);
+ }
+}
+
+static void external_draw_scene_do_image(void *UNUSED(vedata))
+{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+ Scene *scene = draw_ctx->scene;
+ Render *re = RE_GetSceneRender(scene);
+ RenderEngine *engine = RE_engine_get(re);
+
+ /* Is tested before enabling the drawing engine. */
+ BLI_assert(re != NULL);
+ BLI_assert(engine != NULL);
+
+ const DefaultFramebufferList *dfbl = DRW_viewport_framebuffer_list_get();
+
+ /* Clear the depth buffer to the value used by the background overlay so that the overlay does
+ * not appear outside of the drawn image.
+ *
+ * NOTE: The external engine only draws color. The depth is taken care of by the depth pass,
+ * which initializes the depth to the values expected by the background overlay. */
+ GPU_framebuffer_clear_depth(dfbl->default_fb, 1.0f);
+
+ GPU_matrix_push_projection();
+ GPU_matrix_push();
+
+ external_image_space_matrix_set(engine);
+
+ GPU_debug_group_begin("External Engine");
+
+ const RenderEngineType *engine_type = engine->type;
+ BLI_assert(engine_type != NULL);
+ BLI_assert(engine_type->draw != NULL);
+
+ engine_type->draw(engine, draw_ctx->evil_C, draw_ctx->depsgraph);
+
+ GPU_debug_group_end();
+
+ GPU_matrix_pop();
+ GPU_matrix_pop_projection();
+
+ DRW_state_reset();
+ GPU_bgl_end();
+
+ RE_engine_draw_release(re);
+}
+
+static void external_draw_scene_do(void *vedata)
+{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+
+ if (draw_ctx->v3d != NULL) {
+ external_draw_scene_do_v3d(vedata);
+ return;
+ }
+
+ if (draw_ctx->space_data == NULL) {
+ return;
+ }
+
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type == SPACE_IMAGE) {
+ external_draw_scene_do_image(vedata);
+ return;
+ }
+}
+
static void external_draw_scene(void *vedata)
{
const DRWContextState *draw_ctx = DRW_context_state_get();
@@ -297,7 +444,7 @@ static void external_engine_free(void)
static const DrawEngineDataSize external_data_size = DRW_VIEWPORT_DATA_SIZE(EXTERNAL_Data);
-static DrawEngineType draw_engine_external_type = {
+DrawEngineType draw_engine_external_type = {
NULL,
NULL,
N_("External"),
@@ -330,8 +477,45 @@ RenderEngineType DRW_engine_viewport_external_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&draw_engine_external_type,
{NULL, NULL, NULL},
};
+bool DRW_engine_external_acquire_for_image_editor(void)
+{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+ const SpaceLink *space_data = draw_ctx->space_data;
+ Scene *scene = draw_ctx->scene;
+
+ if (space_data == NULL) {
+ return false;
+ }
+
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type != SPACE_IMAGE) {
+ return false;
+ }
+
+ struct SpaceImage *space_image = (struct SpaceImage *)space_data;
+ const Image *image = ED_space_image(space_image);
+ if (image == NULL || image->type != IMA_TYPE_R_RESULT) {
+ return false;
+ }
+
+ if (image->render_slot != image->last_render_slot) {
+ return false;
+ }
+
+ /* Render is allocated on main thread, so it is safe to access it from here. */
+ Render *re = RE_GetSceneRender(scene);
+
+ if (re == NULL) {
+ return false;
+ }
+
+ return RE_engine_draw_acquire(re);
+}
+
#undef EXTERNAL_ENGINE
diff --git a/source/blender/draw/engines/external/external_engine.h b/source/blender/draw/engines/external/external_engine.h
index c645fb99e0e..14ec4e2d3c5 100644
--- a/source/blender/draw/engines/external/external_engine.h
+++ b/source/blender/draw/engines/external/external_engine.h
@@ -22,4 +22,12 @@
#pragma once
+extern DrawEngineType draw_engine_external_type;
extern RenderEngineType DRW_engine_viewport_external_type;
+
+/* Check whether an external engine is to be used to draw the content of an image editor.
+ * If the drawing is possible, the render engine is "acquired" so that it is not freed
+ * until drawing is finished.
+ *
+ * NOTE: Released by the draw engine when it is done drawing. */
+bool DRW_engine_external_acquire_for_image_editor(void);
diff --git a/source/blender/draw/engines/select/select_engine.c b/source/blender/draw/engines/select/select_engine.c
index 96ab8a28e09..20edd78597b 100644
--- a/source/blender/draw/engines/select/select_engine.c
+++ b/source/blender/draw/engines/select/select_engine.c
@@ -388,6 +388,8 @@ RenderEngineType DRW_engine_viewport_select_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&draw_engine_select_type,
{NULL, NULL, NULL},
};
diff --git a/source/blender/draw/engines/workbench/workbench_engine.c b/source/blender/draw/engines/workbench/workbench_engine.c
index f09c019ef8d..635aa7cef25 100644
--- a/source/blender/draw/engines/workbench/workbench_engine.c
+++ b/source/blender/draw/engines/workbench/workbench_engine.c
@@ -651,6 +651,8 @@ RenderEngineType DRW_engine_viewport_workbench_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&workbench_render_update_passes,
&draw_engine_workbench,
{NULL, NULL, NULL},
diff --git a/source/blender/draw/intern/DRW_render.h b/source/blender/draw/intern/DRW_render.h
index 660a4adaf51..fb8b8536897 100644
--- a/source/blender/draw/intern/DRW_render.h
+++ b/source/blender/draw/intern/DRW_render.h
@@ -623,6 +623,7 @@ const DRWView *DRW_view_default_get(void);
void DRW_view_default_set(DRWView *view);
void DRW_view_reset(void);
void DRW_view_set_active(DRWView *view);
+const DRWView *DRW_view_get_active(void);
void DRW_view_clip_planes_set(DRWView *view, float (*planes)[4], int plane_len);
void DRW_view_camtexco_set(DRWView *view, float texco[4]);
diff --git a/source/blender/draw/intern/draw_manager.c b/source/blender/draw/intern/draw_manager.c
index 47adc0acc60..e65fdce5f2e 100644
--- a/source/blender/draw/intern/draw_manager.c
+++ b/source/blender/draw/intern/draw_manager.c
@@ -1197,6 +1197,18 @@ static void drw_engines_enable_basic(void)
use_drw_engine(&draw_engine_basic_type);
}
+static void drw_engine_enable_image_editor(void)
+{
+ if (DRW_engine_external_acquire_for_image_editor()) {
+ use_drw_engine(&draw_engine_external_type);
+ }
+ else {
+ use_drw_engine(&draw_engine_image_type);
+ }
+
+ use_drw_engine(&draw_engine_overlay_type);
+}
+
static void drw_engines_enable_editors(void)
{
SpaceLink *space_data = DST.draw_ctx.space_data;
@@ -1205,8 +1217,7 @@ static void drw_engines_enable_editors(void)
}
if (space_data->spacetype == SPACE_IMAGE) {
- use_drw_engine(&draw_engine_image_type);
- use_drw_engine(&draw_engine_overlay_type);
+ drw_engine_enable_image_editor();
}
else if (space_data->spacetype == SPACE_NODE) {
/* Only enable when drawing the space image backdrop. */
@@ -3188,3 +3199,66 @@ void DRW_draw_state_init_gtests(eGPUShaderConfig sh_cfg)
#endif
/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Draw manager context release/activation
+ *
+ * These functions are used in cases when OpenGL context creation is needed during drawing.
+ * This happens, for example, when an external engine needs to create its own OpenGL context
+ * during engine initialization.
+ *
+ * Example of context creation:
+ *
+ * const bool drw_state = DRW_opengl_context_release();
+ * gl_context = WM_opengl_context_create();
+ * DRW_opengl_context_activate(drw_state);
+ *
+ * Example of context destruction:
+ *
+ * const bool drw_state = DRW_opengl_context_release();
+ * WM_opengl_context_activate(gl_context);
+ * WM_opengl_context_dispose(gl_context);
+ * DRW_opengl_context_activate(drw_state);
+ *
+ *
+ * NOTE: Will only perform context modification when on the main thread. This way these functions
+ * can be used in an engine without checking whether it is the draw manager that manages the
+ * OpenGL context on the current thread. The downside of this is that if the engine performs
+ * OpenGL context creation from a non-main thread, that thread must never have an OpenGL context
+ * bound by Blender.
+ *
+ * \{ */
+
+bool DRW_opengl_context_release(void)
+{
+ if (!BLI_thread_is_main()) {
+ return false;
+ }
+
+ if (GPU_context_active_get() != DST.gpu_context) {
+ /* Context release is requested from outside of the draw manager's main draw loop; indicate
+ * this to `DRW_opengl_context_activate()` so that it restores the drawable of the window. */
+ return false;
+ }
+
+ GPU_context_active_set(NULL);
+ WM_opengl_context_release(DST.gl_context);
+
+ return true;
+}
+
+void DRW_opengl_context_activate(bool drw_state)
+{
+ if (!BLI_thread_is_main()) {
+ return;
+ }
+
+ if (drw_state) {
+ WM_opengl_context_activate(DST.gl_context);
+ GPU_context_active_set(DST.gpu_context);
+ }
+ else {
+ wm_window_reset_drawable();
+ }
+}
+
+/** \} */
diff --git a/source/blender/draw/intern/draw_manager_exec.c b/source/blender/draw/intern/draw_manager_exec.c
index 22356a3c57b..aa01ca7a262 100644
--- a/source/blender/draw/intern/draw_manager_exec.c
+++ b/source/blender/draw/intern/draw_manager_exec.c
@@ -367,6 +367,11 @@ void DRW_view_set_active(DRWView *view)
DST.view_active = (view) ? view : DST.view_default;
}
+const DRWView *DRW_view_get_active(void)
+{
+ return DST.view_active;
+}
+
/* Return True if the given BoundSphere intersect the current view frustum */
static bool draw_culling_sphere_test(const BoundSphere *frustum_bsphere,
const float (*frustum_planes)[4],
diff --git a/source/blender/editors/object/object_bake_api.c b/source/blender/editors/object/object_bake_api.c
index 0a2df655395..26f5b21a311 100644
--- a/source/blender/editors/object/object_bake_api.c
+++ b/source/blender/editors/object/object_bake_api.c
@@ -412,6 +412,7 @@ static bool is_noncolor_pass(eScenePassType pass_type)
{
return ELEM(pass_type,
SCE_PASS_Z,
+ SCE_PASS_POSITION,
SCE_PASS_NORMAL,
SCE_PASS_VECTOR,
SCE_PASS_INDEXOB,
@@ -554,19 +555,10 @@ static bool bake_pass_filter_check(eScenePassType pass_type,
return true;
}
- if ((pass_filter & R_BAKE_PASS_FILTER_AO) != 0) {
- BKE_report(
- reports,
- RPT_ERROR,
- "Combined bake pass Ambient Occlusion contribution requires an enabled light pass "
- "(bake the Ambient Occlusion pass type instead)");
- }
- else {
- BKE_report(reports,
- RPT_ERROR,
- "Combined bake pass requires Emit, or a light pass with "
- "Direct or Indirect contributions enabled");
- }
+ BKE_report(reports,
+ RPT_ERROR,
+ "Combined bake pass requires Emit, or a light pass with "
+ "Direct or Indirect contributions enabled");
return false;
}
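Note: with the Ambient Occlusion contribution removed from combined bakes, a combined bake is only valid when Emit or a light pass with Direct/Indirect contributions is enabled, as the simplified error message above states. A hedged sketch of a script-driven combined bake under these rules; the exact set of pass_filter items is assumed, not confirmed by this excerpt:

    import bpy

    # Combined bake with light passes enabled; 'AO' is no longer a valid
    # contribution for the combined pass.
    bpy.ops.object.bake(
        type='COMBINED',
        pass_filter={'EMIT', 'DIRECT', 'INDIRECT', 'DIFFUSE', 'GLOSSY', 'TRANSMISSION'},
    )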
diff --git a/source/blender/editors/render/render_preview.c b/source/blender/editors/render/render_preview.c
index 95351de45f0..81aecfdf788 100644
--- a/source/blender/editors/render/render_preview.c
+++ b/source/blender/editors/render/render_preview.c
@@ -479,15 +479,6 @@ static Scene *preview_prepare_scene(
BKE_color_managed_view_settings_free(&sce->view_settings);
BKE_color_managed_view_settings_copy(&sce->view_settings, &scene->view_settings);
- /* prevent overhead for small renders and icons (32) */
- if (id && sp->sizex < 40) {
- sce->r.tilex = sce->r.tiley = 64;
- }
- else {
- sce->r.tilex = sce->r.xsch / 4;
- sce->r.tiley = sce->r.ysch / 4;
- }
-
if ((id && sp->pr_method == PR_ICON_RENDER) && id_type != ID_WO) {
sce->r.alphamode = R_ALPHAPREMUL;
}
diff --git a/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp b/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp
index 937a10f26b1..0a82c237256 100644
--- a/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp
+++ b/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp
@@ -94,17 +94,15 @@ BlenderStrokeRenderer::BlenderStrokeRenderer(Render *re, int render_count)
freestyle_scene = BKE_scene_add(freestyle_bmain, name);
freestyle_scene->r.cfra = old_scene->r.cfra;
freestyle_scene->r.mode = old_scene->r.mode & ~(R_EDGE_FRS | R_BORDER);
- freestyle_scene->r.xsch = re->rectx; // old_scene->r.xsch
- freestyle_scene->r.ysch = re->recty; // old_scene->r.ysch
- freestyle_scene->r.xasp = 1.0f; // old_scene->r.xasp;
- freestyle_scene->r.yasp = 1.0f; // old_scene->r.yasp;
- freestyle_scene->r.tilex = old_scene->r.tilex;
- freestyle_scene->r.tiley = old_scene->r.tiley;
+ freestyle_scene->r.xsch = re->rectx; // old_scene->r.xsch
+ freestyle_scene->r.ysch = re->recty; // old_scene->r.ysch
+ freestyle_scene->r.xasp = 1.0f; // old_scene->r.xasp;
+ freestyle_scene->r.yasp = 1.0f; // old_scene->r.yasp;
freestyle_scene->r.size = 100; // old_scene->r.size
freestyle_scene->r.color_mgt_flag = 0; // old_scene->r.color_mgt_flag;
freestyle_scene->r.scemode = (old_scene->r.scemode &
~(R_SINGLE_LAYER | R_NO_FRAME_UPDATE | R_MULTIVIEW)) &
- (re->r.scemode | ~R_FULL_SAMPLE);
+ (re->r.scemode);
freestyle_scene->r.flag = old_scene->r.flag;
freestyle_scene->r.threads = old_scene->r.threads;
freestyle_scene->r.border.xmin = old_scene->r.border.xmin;
diff --git a/source/blender/gpu/GPU_material.h b/source/blender/gpu/GPU_material.h
index 312da491a36..e64521768f9 100644
--- a/source/blender/gpu/GPU_material.h
+++ b/source/blender/gpu/GPU_material.h
@@ -175,10 +175,7 @@ GPUNodeLink *GPU_uniformbuf_link_out(struct GPUMaterial *mat,
void GPU_material_output_link(GPUMaterial *material, GPUNodeLink *link);
void GPU_material_add_output_link_aov(GPUMaterial *material, GPUNodeLink *link, int hash);
-void GPU_material_sss_profile_create(GPUMaterial *material,
- float radii[3],
- const short *falloff_type,
- const float *sharpness);
+void GPU_material_sss_profile_create(GPUMaterial *material, float radii[3]);
struct GPUUniformBuf *GPU_material_sss_profile_get(GPUMaterial *material,
int sample_len,
struct GPUTexture **tex_profile);
diff --git a/source/blender/gpu/intern/gpu_material.c b/source/blender/gpu/intern/gpu_material.c
index 56e72fbeca9..6872a08e854 100644
--- a/source/blender/gpu/intern/gpu_material.c
+++ b/source/blender/gpu/intern/gpu_material.c
@@ -96,8 +96,6 @@ struct GPUMaterial {
float sss_enabled;
float sss_radii[3];
int sss_samples;
- short int sss_falloff;
- float sss_sharpness;
bool sss_dirty;
GPUTexture *coba_tex; /* 1D Texture array containing all color bands. */
@@ -266,18 +264,6 @@ static void sss_calculate_offsets(GPUSssKernelData *kd, int count, float exponen
}
}
-#define GAUSS_TRUNCATE 12.46f
-static float gaussian_profile(float r, float radius)
-{
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- if (r >= Rm) {
- return 0.0f;
- }
- return expf(-r * r / (2.0f * v)) / (2.0f * M_PI * v);
-}
-
#define BURLEY_TRUNCATE 16.0f
#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE)
static float burley_profile(float r, float d)
@@ -287,45 +273,15 @@ static float burley_profile(float r, float d)
return (exp_r_d + exp_r_3_d) / (4.0f * d);
}
-static float cubic_profile(float r, float radius, float sharpness)
-{
- float Rm = radius * (1.0f + sharpness);
-
- if (r >= Rm) {
- return 0.0f;
- }
- /* custom variation with extra sharpness, to match the previous code */
- const float y = 1.0f / (1.0f + sharpness);
- float Rmy, ry, ryinv;
-
- Rmy = powf(Rm, y);
- ry = powf(r, y);
- ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f;
-
- const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy;
- const float f = Rmy - ry;
- const float num = f * (f * f) * (y * ryinv);
-
- return (10.0f * num) / (Rmy5 * M_PI);
-}
-
-static float eval_profile(float r, short falloff_type, float sharpness, float param)
+static float eval_profile(float r, float param)
{
r = fabsf(r);
-
- if (ELEM(falloff_type, SHD_SUBSURFACE_BURLEY, SHD_SUBSURFACE_RANDOM_WALK)) {
- return burley_profile(r, param) / BURLEY_TRUNCATE_CDF;
- }
- if (falloff_type == SHD_SUBSURFACE_CUBIC) {
- return cubic_profile(r, param, sharpness);
- }
-
- return gaussian_profile(r, param);
+ return burley_profile(r, param) / BURLEY_TRUNCATE_CDF;
}
/* Resolution for each sample of the precomputed kernel profile */
#define INTEGRAL_RESOLUTION 32
-static float eval_integral(float x0, float x1, short falloff_type, float sharpness, float param)
+static float eval_integral(float x0, float x1, float param)
{
const float range = x1 - x0;
const float step = range / INTEGRAL_RESOLUTION;
@@ -333,7 +289,7 @@ static float eval_integral(float x0, float x1, short falloff_type, float sharpne
for (int i = 0; i < INTEGRAL_RESOLUTION; i++) {
float x = x0 + range * ((float)i + 0.5f) / (float)INTEGRAL_RESOLUTION;
- float y = eval_profile(x, falloff_type, sharpness, param);
+ float y = eval_profile(x, param);
integral += y * step;
}
@@ -341,8 +297,7 @@ static float eval_integral(float x0, float x1, short falloff_type, float sharpne
}
#undef INTEGRAL_RESOLUTION
-static void compute_sss_kernel(
- GPUSssKernelData *kd, const float radii[3], int sample_len, int falloff_type, float sharpness)
+static void compute_sss_kernel(GPUSssKernelData *kd, const float radii[3], int sample_len)
{
float rad[3];
/* Minimum radius */
@@ -353,27 +308,15 @@ static void compute_sss_kernel(
/* Christensen-Burley fitting */
float l[3], d[3];
- if (ELEM(falloff_type, SHD_SUBSURFACE_BURLEY, SHD_SUBSURFACE_RANDOM_WALK)) {
- mul_v3_v3fl(l, rad, 0.25f * M_1_PI);
- const float A = 1.0f;
- const float s = 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
- /* XXX 0.6f Out of nowhere to match cycles! Empirical! Can be tweak better. */
- mul_v3_v3fl(d, l, 0.6f / s);
- mul_v3_v3fl(rad, d, BURLEY_TRUNCATE);
- kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
-
- copy_v3_v3(kd->param, d);
- }
- else if (falloff_type == SHD_SUBSURFACE_CUBIC) {
- copy_v3_v3(kd->param, rad);
- mul_v3_fl(rad, 1.0f + sharpness);
- kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
- }
- else {
- kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
+ mul_v3_v3fl(l, rad, 0.25f * M_1_PI);
+ const float A = 1.0f;
+ const float s = 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
+ /* XXX 0.6f Out of nowhere to match Cycles! Empirical! Can be tweaked better. */
+ mul_v3_v3fl(d, l, 0.6f / s);
+ mul_v3_v3fl(rad, d, BURLEY_TRUNCATE);
+ kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
- copy_v3_v3(kd->param, rad);
- }
+ copy_v3_v3(kd->param, d);
/* Compute samples locations on the 1d kernel [-1..1] */
sss_calculate_offsets(kd, sample_len, SSS_EXPONENT);
@@ -403,9 +346,9 @@ static void compute_sss_kernel(
x0 *= kd->max_radius;
x1 *= kd->max_radius;
- kd->kernel[i][0] = eval_integral(x0, x1, falloff_type, sharpness, kd->param[0]);
- kd->kernel[i][1] = eval_integral(x0, x1, falloff_type, sharpness, kd->param[1]);
- kd->kernel[i][2] = eval_integral(x0, x1, falloff_type, sharpness, kd->param[2]);
+ kd->kernel[i][0] = eval_integral(x0, x1, kd->param[0]);
+ kd->kernel[i][1] = eval_integral(x0, x1, kd->param[1]);
+ kd->kernel[i][2] = eval_integral(x0, x1, kd->param[2]);
sum[0] += kd->kernel[i][0];
sum[1] += kd->kernel[i][1];
@@ -439,8 +382,6 @@ static void compute_sss_kernel(
#define INTEGRAL_RESOLUTION 512
static void compute_sss_translucence_kernel(const GPUSssKernelData *kd,
int resolution,
- short falloff_type,
- float sharpness,
float **output)
{
float(*texels)[4];
@@ -463,9 +404,9 @@ static void compute_sss_translucence_kernel(const GPUSssKernelData *kd,
float dist = hypotf(r + r_step * 0.5f, d);
float profile[3];
- profile[0] = eval_profile(dist, falloff_type, sharpness, kd->param[0]);
- profile[1] = eval_profile(dist, falloff_type, sharpness, kd->param[1]);
- profile[2] = eval_profile(dist, falloff_type, sharpness, kd->param[2]);
+ profile[0] = eval_profile(dist, kd->param[0]);
+ profile[1] = eval_profile(dist, kd->param[1]);
+ profile[2] = eval_profile(dist, kd->param[2]);
/* Since the profile and configuration are radially symmetrical we
* can just evaluate it once and weight it accordingly */
@@ -499,14 +440,9 @@ static void compute_sss_translucence_kernel(const GPUSssKernelData *kd,
}
#undef INTEGRAL_RESOLUTION
-void GPU_material_sss_profile_create(GPUMaterial *material,
- float radii[3],
- const short *falloff_type,
- const float *sharpness)
+void GPU_material_sss_profile_create(GPUMaterial *material, float radii[3])
{
copy_v3_v3(material->sss_radii, radii);
- material->sss_falloff = (falloff_type) ? *falloff_type : 0.0;
- material->sss_sharpness = (sharpness) ? *sharpness : 0.0;
material->sss_dirty = true;
material->sss_enabled = true;
@@ -527,20 +463,14 @@ struct GPUUniformBuf *GPU_material_sss_profile_get(GPUMaterial *material,
if (material->sss_dirty || (material->sss_samples != sample_len)) {
GPUSssKernelData kd;
- float sharpness = material->sss_sharpness;
-
- /* XXX Black magic but it seems to fit. Maybe because we integrate -1..1 */
- sharpness *= 0.5f;
-
- compute_sss_kernel(&kd, material->sss_radii, sample_len, material->sss_falloff, sharpness);
+ compute_sss_kernel(&kd, material->sss_radii, sample_len);
/* Update / Create UBO */
GPU_uniformbuf_update(material->sss_profile, &kd);
/* Update / Create Tex */
float *translucence_profile;
- compute_sss_translucence_kernel(
- &kd, 64, material->sss_falloff, sharpness, &translucence_profile);
+ compute_sss_translucence_kernel(&kd, 64, &translucence_profile);
if (material->sss_tex_profile != NULL) {
GPU_texture_free(material->sss_tex_profile);
diff --git a/source/blender/gpu/intern/gpu_material_library.h b/source/blender/gpu/intern/gpu_material_library.h
index 782d89d6f2a..d3b12d3a2b7 100644
--- a/source/blender/gpu/intern/gpu_material_library.h
+++ b/source/blender/gpu/intern/gpu_material_library.h
@@ -27,7 +27,7 @@
#include "GPU_material.h"
#define MAX_FUNCTION_NAME 64
-#define MAX_PARAMETER 32
+#define MAX_PARAMETER 36
struct GSet;
diff --git a/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl b/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl
index d77259638fd..bba84c2be52 100644
--- a/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl
+++ b/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl
@@ -19,6 +19,8 @@ void node_bsdf_principled(vec4 base_color,
float subsurface,
vec3 subsurface_radius,
vec4 subsurface_color,
+ float subsurface_ior,
+ float subsurface_anisotropy,
float metallic,
float specular,
float specular_tint,
@@ -201,6 +203,6 @@ void node_bsdf_principled(vec4 base_color,
#else
/* clang-format off */
/* Stub principled because it is not compatible with volumetrics. */
-# define node_bsdf_principled(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, bb, cc, dd, result) (result = CLOSURE_DEFAULT)
+# define node_bsdf_principled(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, bb, cc, dd, ee, ff, result) (result = CLOSURE_DEFAULT)
/* clang-format on */
#endif
diff --git a/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl b/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl
index 5129bf71903..d0c159cdf37 100644
--- a/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl
+++ b/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl
@@ -5,8 +5,8 @@ CLOSURE_EVAL_FUNCTION_DECLARE_1(node_subsurface_scattering, Diffuse)
void node_subsurface_scattering(vec4 color,
float scale,
vec3 radius,
- float sharpen,
- float texture_blur,
+ float ior,
+ float anisotropy,
vec3 N,
float sss_id,
out Closure result)
@@ -20,15 +20,7 @@ void node_subsurface_scattering(vec4 color,
result = CLOSURE_DEFAULT;
- /* Not perfect for texture_blur values between 0.0 and 1.0.
- * Interpolate between separated color and color applied on irradiance. */
- float one_minus_texture_blur = 1.0 - texture_blur;
- vec3 sss_albedo = color.rgb * one_minus_texture_blur + texture_blur;
- vec3 radiance_tint = color.rgb * texture_blur + one_minus_texture_blur;
- /* Consider output radiance as irradiance. */
- out_Diffuse_0.radiance *= radiance_tint;
-
- closure_load_sss_data(scale, out_Diffuse_0.radiance, sss_albedo, int(sss_id), result);
+ closure_load_sss_data(scale, out_Diffuse_0.radiance, color.rgb, int(sss_id), result);
/* TODO(fclem) Try to not use this. */
closure_load_ssr_data(vec3(0.0), 0.0, in_Diffuse_0.N, -1.0, result);
diff --git a/source/blender/makesdna/DNA_layer_types.h b/source/blender/makesdna/DNA_layer_types.h
index 63e4597150c..520f989452c 100644
--- a/source/blender/makesdna/DNA_layer_types.h
+++ b/source/blender/makesdna/DNA_layer_types.h
@@ -68,7 +68,7 @@ typedef enum eViewLayerCryptomatteFlags {
VIEW_LAYER_CRYPTOMATTE_OBJECT = (1 << 0),
VIEW_LAYER_CRYPTOMATTE_MATERIAL = (1 << 1),
VIEW_LAYER_CRYPTOMATTE_ASSET = (1 << 2),
- VIEW_LAYER_CRYPTOMATTE_ACCURATE = (1 << 3),
+ /* VIEW_LAYER_CRYPTOMATTE_ACCURATE = (1 << 3), */ /* DEPRECATED */
} eViewLayerCryptomatteFlags;
#define VIEW_LAYER_CRYPTOMATTE_ALL \
(VIEW_LAYER_CRYPTOMATTE_OBJECT | VIEW_LAYER_CRYPTOMATTE_MATERIAL | VIEW_LAYER_CRYPTOMATTE_ASSET)
diff --git a/source/blender/makesdna/DNA_node_types.h b/source/blender/makesdna/DNA_node_types.h
index 49083542fd7..cf159a1e28d 100644
--- a/source/blender/makesdna/DNA_node_types.h
+++ b/source/blender/makesdna/DNA_node_types.h
@@ -1032,6 +1032,11 @@ typedef struct NodeShaderTexPointDensity {
char _pad2[4];
} NodeShaderTexPointDensity;
+typedef struct NodeShaderPrincipled {
+ char use_subsurface_auto_radius;
+ char _pad[3];
+} NodeShaderPrincipled;
+
/* TEX_output */
typedef struct TexNodeOutput {
char name[64];
@@ -1803,11 +1808,12 @@ enum {
enum {
#ifdef DNA_DEPRECATED_ALLOW
SHD_SUBSURFACE_COMPATIBLE = 0, /* Deprecated */
-#endif
SHD_SUBSURFACE_CUBIC = 1,
SHD_SUBSURFACE_GAUSSIAN = 2,
- SHD_SUBSURFACE_BURLEY = 3,
- SHD_SUBSURFACE_RANDOM_WALK = 4,
+#endif
+ SHD_SUBSURFACE_DIFFUSION = 3,
+ SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS = 4,
+ SHD_SUBSURFACE_RANDOM_WALK = 5,
};
/* blur node */
diff --git a/source/blender/makesdna/DNA_scene_defaults.h b/source/blender/makesdna/DNA_scene_defaults.h
index 61707964191..9ecf94ebd6e 100644
--- a/source/blender/makesdna/DNA_scene_defaults.h
+++ b/source/blender/makesdna/DNA_scene_defaults.h
@@ -135,8 +135,6 @@
.border.xmax = 1.0f, \
.border.ymax = 1.0f, \
\
- .preview_start_resolution = 64, \
- \
.line_thickness_mode = R_LINE_THICKNESS_ABSOLUTE, \
.unit_line_thickness = 1.0f, \
\
diff --git a/source/blender/makesdna/DNA_scene_types.h b/source/blender/makesdna/DNA_scene_types.h
index f2244b4ae61..b28c3ac2b85 100644
--- a/source/blender/makesdna/DNA_scene_types.h
+++ b/source/blender/makesdna/DNA_scene_types.h
@@ -261,7 +261,7 @@ typedef enum eScenePassType {
SCE_PASS_UNUSED_3 = (1 << 4), /* SPEC */
SCE_PASS_SHADOW = (1 << 5),
SCE_PASS_AO = (1 << 6),
- SCE_PASS_UNUSED_4 = (1 << 7), /* REFLECT */
+ SCE_PASS_POSITION = (1 << 7),
SCE_PASS_NORMAL = (1 << 8),
SCE_PASS_VECTOR = (1 << 9),
SCE_PASS_UNUSED_5 = (1 << 10), /* REFRACT */
@@ -293,6 +293,7 @@ typedef enum eScenePassType {
#define RE_PASSNAME_COMBINED "Combined"
#define RE_PASSNAME_Z "Depth"
#define RE_PASSNAME_VECTOR "Vector"
+#define RE_PASSNAME_POSITION "Position"
#define RE_PASSNAME_NORMAL "Normal"
#define RE_PASSNAME_UV "UV"
#define RE_PASSNAME_EMIT "Emit"
@@ -592,7 +593,7 @@ typedef enum eBakeSaveMode {
/** #BakeData.pass_filter */
typedef enum eBakePassFilter {
R_BAKE_PASS_FILTER_NONE = 0,
- R_BAKE_PASS_FILTER_AO = (1 << 0),
+ R_BAKE_PASS_FILTER_UNUSED = (1 << 0),
R_BAKE_PASS_FILTER_EMIT = (1 << 1),
R_BAKE_PASS_FILTER_DIFFUSE = (1 << 2),
R_BAKE_PASS_FILTER_GLOSSY = (1 << 3),
@@ -653,7 +654,8 @@ typedef struct RenderData {
/**
* render tile dimensions
*/
- int tilex, tiley;
+ int tilex DNA_DEPRECATED;
+ int tiley DNA_DEPRECATED;
short planes DNA_DEPRECATED;
short imtype DNA_DEPRECATED;
@@ -764,13 +766,10 @@ typedef struct RenderData {
/* Cycles baking */
struct BakeData bake;
- int preview_start_resolution;
+ int _pad8;
short preview_pixel_size;
- /* Type of the debug pass to use.
- * Only used when built with debug passes support.
- */
- short debug_pass_type;
+ short _pad4;
/* MultiView */
/** SceneRenderView. */
@@ -1887,12 +1886,12 @@ enum {
#define R_COMP_CROP (1 << 7)
#define R_SCEMODE_UNUSED_8 (1 << 8) /* cleared */
#define R_SINGLE_LAYER (1 << 9)
-#define R_EXR_TILE_FILE (1 << 10)
+#define R_SCEMODE_UNUSED_10 (1 << 10) /* cleared */
#define R_SCEMODE_UNUSED_11 (1 << 11) /* cleared */
#define R_NO_IMAGE_LOAD (1 << 12)
#define R_SCEMODE_UNUSED_13 (1 << 13) /* cleared */
#define R_NO_FRAME_UPDATE (1 << 14)
-#define R_FULL_SAMPLE (1 << 15)
+#define R_SCEMODE_UNUSED_15 (1 << 15) /* cleared */
#define R_SCEMODE_UNUSED_16 (1 << 16) /* cleared */
#define R_SCEMODE_UNUSED_17 (1 << 17) /* cleared */
#define R_TEXNODE_PREVIEW (1 << 18)
diff --git a/source/blender/makesrna/intern/rna_nodetree.c b/source/blender/makesrna/intern/rna_nodetree.c
index d0bf60d5d02..ec53f35df4c 100644
--- a/source/blender/makesrna/intern/rna_nodetree.c
+++ b/source/blender/makesrna/intern/rna_nodetree.c
@@ -4665,16 +4665,18 @@ static const EnumPropertyItem node_principled_distribution_items[] = {
};
static const EnumPropertyItem node_subsurface_method_items[] = {
- {SHD_SUBSURFACE_BURLEY,
- "BURLEY",
+ {SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS,
+ "RANDOM_WALK_FIXED_RADIUS",
0,
- "Christensen-Burley",
- "Approximation to physically based volume scattering"},
+ "Random Walk (Fixed Radius)",
+ "Volumetric approximation to physically based volume scattering, using the scattering radius "
+ "as specified"},
{SHD_SUBSURFACE_RANDOM_WALK,
"RANDOM_WALK",
0,
"Random Walk",
- "Volumetric approximation to physically based volume scattering"},
+ "Volumetric approximation to physically based volume scattering, with scattering radius "
+ "automatically adjusted to match color textures"},
{0, NULL, 0, NULL, NULL}};
/* -- Common nodes ---------------------------------------------------------- */
@@ -6144,35 +6146,12 @@ static void def_sh_ambient_occlusion(StructRNA *srna)
static void def_sh_subsurface(StructRNA *srna)
{
- static const EnumPropertyItem prop_subsurface_falloff_items[] = {
- {SHD_SUBSURFACE_CUBIC, "CUBIC", 0, "Cubic", "Simple cubic falloff function"},
- {SHD_SUBSURFACE_GAUSSIAN,
- "GAUSSIAN",
- 0,
- "Gaussian",
- "Normal distribution, multiple can be combined to fit more complex profiles"},
- {SHD_SUBSURFACE_BURLEY,
- "BURLEY",
- 0,
- "Christensen-Burley",
- "Approximation to physically based volume scattering"},
- {SHD_SUBSURFACE_RANDOM_WALK,
- "RANDOM_WALK",
- 0,
- "Random Walk",
- "Volumetric approximation to physically based volume scattering"},
- {0, NULL, 0, NULL, NULL},
- };
-
PropertyRNA *prop;
prop = RNA_def_property(srna, "falloff", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_sdna(prop, NULL, "custom1");
- RNA_def_property_enum_items(prop, prop_subsurface_falloff_items);
- RNA_def_property_ui_text(prop,
- "Falloff",
- "Function to determine how much light nearby points contribute based "
- "on their distance to the shading point");
+ RNA_def_property_enum_items(prop, node_subsurface_method_items);
+ RNA_def_property_ui_text(prop, "Method", "Method for rendering subsurface scattering");
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_ShaderNode_socket_update");
}
diff --git a/source/blender/makesrna/intern/rna_render.c b/source/blender/makesrna/intern/rna_render.c
index 4400d198b4a..fcb46904e8d 100644
--- a/source/blender/makesrna/intern/rna_render.c
+++ b/source/blender/makesrna/intern/rna_render.c
@@ -52,6 +52,7 @@ const EnumPropertyItem rna_enum_render_pass_type_items[] = {
{SCE_PASS_Z, "Z", 0, "Z", ""},
{SCE_PASS_SHADOW, "SHADOW", 0, "Shadow", ""},
{SCE_PASS_AO, "AO", 0, "Ambient Occlusion", ""},
+ {SCE_PASS_POSITION, "POSITION", 0, "Position", ""},
{SCE_PASS_NORMAL, "NORMAL", 0, "Normal", ""},
{SCE_PASS_VECTOR, "VECTOR", 0, "Vector", ""},
{SCE_PASS_INDEXOB, "OBJECT_INDEX", 0, "Object Index", ""},
@@ -79,6 +80,7 @@ const EnumPropertyItem rna_enum_bake_pass_type_items[] = {
{SCE_PASS_COMBINED, "COMBINED", 0, "Combined", ""},
{SCE_PASS_AO, "AO", 0, "Ambient Occlusion", ""},
{SCE_PASS_SHADOW, "SHADOW", 0, "Shadow", ""},
+ {SCE_PASS_POSITION, "POSITION", 0, "Position", ""},
{SCE_PASS_NORMAL, "NORMAL", 0, "Normal", ""},
{SCE_PASS_UV, "UV", 0, "UV", ""},
{SCE_PASS_ROUGHNESS, "ROUGHNESS", 0, "ROUGHNESS", ""},
@@ -177,6 +179,40 @@ static void engine_render(RenderEngine *engine, Depsgraph *depsgraph)
RNA_parameter_list_free(&list);
}
+static void engine_render_frame_finish(RenderEngine *engine)
+{
+ extern FunctionRNA rna_RenderEngine_render_frame_finish_func;
+ PointerRNA ptr;
+ ParameterList list;
+ FunctionRNA *func;
+
+ RNA_pointer_create(NULL, engine->type->rna_ext.srna, engine, &ptr);
+ func = &rna_RenderEngine_render_frame_finish_func;
+
+ RNA_parameter_list_create(&list, &ptr, func);
+ engine->type->rna_ext.call(NULL, &ptr, func, &list);
+
+ RNA_parameter_list_free(&list);
+}
+
+static void engine_draw(RenderEngine *engine, const struct bContext *context, Depsgraph *depsgraph)
+{
+ extern FunctionRNA rna_RenderEngine_draw_func;
+ PointerRNA ptr;
+ ParameterList list;
+ FunctionRNA *func;
+
+ RNA_pointer_create(NULL, engine->type->rna_ext.srna, engine, &ptr);
+ func = &rna_RenderEngine_draw_func;
+
+ RNA_parameter_list_create(&list, &ptr, func);
+ RNA_parameter_set_lookup(&list, "context", &context);
+ RNA_parameter_set_lookup(&list, "depsgraph", &depsgraph);
+ engine->type->rna_ext.call(NULL, &ptr, func, &list);
+
+ RNA_parameter_list_free(&list);
+}
+
static void engine_bake(RenderEngine *engine,
struct Depsgraph *depsgraph,
struct Object *object,
@@ -315,7 +351,7 @@ static StructRNA *rna_RenderEngine_register(Main *bmain,
RenderEngineType *et, dummyet = {NULL};
RenderEngine dummyengine = {NULL};
PointerRNA dummyptr;
- int have_function[8];
+ int have_function[9];
/* setup dummy engine & engine type to store static properties in */
dummyengine.type = &dummyet;
@@ -358,11 +394,13 @@ static StructRNA *rna_RenderEngine_register(Main *bmain,
et->update = (have_function[0]) ? engine_update : NULL;
et->render = (have_function[1]) ? engine_render : NULL;
- et->bake = (have_function[2]) ? engine_bake : NULL;
- et->view_update = (have_function[3]) ? engine_view_update : NULL;
- et->view_draw = (have_function[4]) ? engine_view_draw : NULL;
- et->update_script_node = (have_function[5]) ? engine_update_script_node : NULL;
- et->update_render_passes = (have_function[6]) ? engine_update_render_passes : NULL;
+ et->render_frame_finish = (have_function[2]) ? engine_render_frame_finish : NULL;
+ et->draw = (have_function[3]) ? engine_draw : NULL;
+ et->bake = (have_function[4]) ? engine_bake : NULL;
+ et->view_update = (have_function[5]) ? engine_view_update : NULL;
+ et->view_draw = (have_function[6]) ? engine_view_draw : NULL;
+ et->update_script_node = (have_function[7]) ? engine_update_script_node : NULL;
+ et->update_render_passes = (have_function[8]) ? engine_update_render_passes : NULL;
RE_engines_register(et);
@@ -519,6 +557,19 @@ static void rna_def_render_engine(BlenderRNA *brna)
parm = RNA_def_pointer(func, "depsgraph", "Depsgraph", "", "");
RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ func = RNA_def_function(srna, "render_frame_finish", NULL);
+ RNA_def_function_ui_description(
+ func, "Perform finishing operations after all view layers in a frame were rendered");
+ RNA_def_function_flag(func, FUNC_REGISTER_OPTIONAL | FUNC_ALLOW_WRITE);
+
+ func = RNA_def_function(srna, "draw", NULL);
+ RNA_def_function_ui_description(func, "Draw render image");
+ RNA_def_function_flag(func, FUNC_REGISTER_OPTIONAL);
+ parm = RNA_def_pointer(func, "context", "Context", "", "");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_pointer(func, "depsgraph", "Depsgraph", "", "");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+
func = RNA_def_function(srna, "bake", NULL);
RNA_def_function_ui_description(func, "Bake passes");
RNA_def_function_flag(func, FUNC_REGISTER_OPTIONAL | FUNC_ALLOW_WRITE);
@@ -641,6 +692,14 @@ static void rna_def_render_engine(BlenderRNA *brna)
parm = RNA_def_boolean(func, "do_break", 0, "Break", "");
RNA_def_function_return(func, parm);
+ func = RNA_def_function(srna, "pass_by_index_get", "RE_engine_pass_by_index_get");
+ parm = RNA_def_string(func, "layer", NULL, 0, "Layer", "Name of render layer to get pass for");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "index", 0, 0, INT_MAX, "Index", "Index of pass to get", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_pointer(func, "render_pass", "RenderPass", "Index", "Index of pass to get");
+ RNA_def_function_return(func, parm);
+
func = RNA_def_function(srna, "active_view_get", "RE_engine_active_view_get");
parm = RNA_def_string(func, "view", NULL, 0, "View", "Single view active");
RNA_def_function_return(func, parm);
@@ -761,6 +820,22 @@ static void rna_def_render_engine(BlenderRNA *brna)
func = RNA_def_function(srna, "free_blender_memory", "RE_engine_free_blender_memory");
RNA_def_function_ui_description(func, "Free Blender side memory of render engine");
+ func = RNA_def_function(srna, "tile_highlight_set", "RE_engine_tile_highlight_set");
+ RNA_def_function_ui_description(func, "Set highlighted state of the given tile");
+ parm = RNA_def_int(func, "x", 0, 0, INT_MAX, "X", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "y", 0, 0, INT_MAX, "Y", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "width", 0, 0, INT_MAX, "Width", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "height", 0, 0, INT_MAX, "Height", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_boolean(func, "highlight", 0, "Highlight", "");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+
+ func = RNA_def_function(srna, "tile_highlight_clear_all", "RE_engine_tile_highlight_clear_all");
+ RNA_def_function_ui_description(func, "Clear highlight from all tiles");
+
RNA_define_verify_sdna(0);
prop = RNA_def_property(srna, "is_animation", PROP_BOOLEAN, PROP_NONE);
@@ -777,11 +852,6 @@ static void rna_def_render_engine(BlenderRNA *brna)
RNA_def_property_boolean_sdna(prop, NULL, "layer_override", 1);
RNA_def_property_array(prop, 20);
- prop = RNA_def_property(srna, "tile_x", PROP_INT, PROP_UNSIGNED);
- RNA_def_property_int_sdna(prop, NULL, "tile_x");
- prop = RNA_def_property(srna, "tile_y", PROP_INT, PROP_UNSIGNED);
- RNA_def_property_int_sdna(prop, NULL, "tile_y");
-
prop = RNA_def_property(srna, "resolution_x", PROP_INT, PROP_PIXEL);
RNA_def_property_int_sdna(prop, NULL, "resolution_x");
RNA_def_property_clear_flag(prop, PROP_EDITABLE);
@@ -880,12 +950,6 @@ static void rna_def_render_engine(BlenderRNA *brna)
"Don't expose Cycles and Eevee shading nodes in the node editor user "
"interface, so own nodes can be used instead");
- prop = RNA_def_property(srna, "bl_use_save_buffers", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "type->flag", RE_USE_SAVE_BUFFERS);
- RNA_def_property_flag(prop, PROP_REGISTER_OPTIONAL);
- RNA_def_property_ui_text(
- prop, "Use Save Buffers", "Support render to an on disk buffer during rendering");
-
prop = RNA_def_property(srna, "bl_use_spherical_stereo", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "type->flag", RE_USE_SPHERICAL_STEREO);
RNA_def_property_flag(prop, PROP_REGISTER_OPTIONAL);
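The rna_render.c hunks above register two new optional callbacks on the RenderEngine type: render_frame_finish, called once after all view layers of a frame were rendered, and draw(context, depsgraph), used to display the in-progress render image. A minimal Python sketch of a render engine that implements them, assuming only the RNA definitions above (the engine identifier, label, and callback bodies are illustrative placeholders, not part of this patch):

import bpy

class MyEngine(bpy.types.RenderEngine):
    bl_idname = "MY_ENGINE"  # illustrative identifier, not part of this patch
    bl_label = "My Engine"

    def render(self, depsgraph):
        # Write pixels for the current view layer via begin_result()/end_result().
        pass

    def render_frame_finish(self):
        # All view layers of the frame are done; flush results and free engine data.
        pass

    def draw(self, context, depsgraph):
        # Draw the in-progress render image; on the C side this is guarded by
        # RE_engine_draw_acquire()/RE_engine_draw_release() (see engine.c below).
        pass

bpy.utils.register_class(MyEngine)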
diff --git a/source/blender/makesrna/intern/rna_scene.c b/source/blender/makesrna/intern/rna_scene.c
index 1762b964f8d..e45d39a1ddc 100644
--- a/source/blender/makesrna/intern/rna_scene.c
+++ b/source/blender/makesrna/intern/rna_scene.c
@@ -532,7 +532,6 @@ const EnumPropertyItem rna_enum_stereo3d_interlace_type_items[] = {
const EnumPropertyItem rna_enum_bake_pass_filter_type_items[] = {
{R_BAKE_PASS_FILTER_NONE, "NONE", 0, "None", ""},
- {R_BAKE_PASS_FILTER_AO, "AO", 0, "Ambient Occlusion", ""},
{R_BAKE_PASS_FILTER_EMIT, "EMIT", 0, "Emit", ""},
{R_BAKE_PASS_FILTER_DIRECT, "DIRECT", 0, "Direct", ""},
{R_BAKE_PASS_FILTER_INDIRECT, "INDIRECT", 0, "Indirect", ""},
@@ -4151,13 +4150,6 @@ void rna_def_view_layer_common(BlenderRNA *brna, StructRNA *srna, const bool sce
prop, "Cryptomatte Levels", "Sets how many unique objects can be distinguished per pixel");
RNA_def_property_ui_range(prop, 2.0, 16.0, 2.0, 0.0);
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_ViewLayer_pass_update");
-
- prop = RNA_def_property(srna, "use_pass_cryptomatte_accurate", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "cryptomatte_flag", VIEW_LAYER_CRYPTOMATTE_ACCURATE);
- RNA_def_property_boolean_default(prop, true);
- RNA_def_property_ui_text(
- prop, "Cryptomatte Accurate", "Generate a more accurate cryptomatte pass");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_ViewLayer_pass_update");
}
prop = RNA_def_property(srna, "use_solid", PROP_BOOLEAN, PROP_NONE);
@@ -4251,6 +4243,16 @@ void rna_def_view_layer_common(BlenderRNA *brna, StructRNA *srna, const bool sce
RNA_def_property_clear_flag(prop, PROP_EDITABLE);
}
+ prop = RNA_def_property(srna, "use_pass_position", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "passflag", SCE_PASS_POSITION);
+ RNA_def_property_ui_text(prop, "Position", "Deliver position pass");
+ if (scene) {
+ RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_ViewLayer_pass_update");
+ }
+ else {
+ RNA_def_property_clear_flag(prop, PROP_EDITABLE);
+ }
+
prop = RNA_def_property(srna, "use_pass_normal", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "passflag", SCE_PASS_NORMAL);
RNA_def_property_ui_text(prop, "Normal", "Deliver normal pass");
@@ -5122,10 +5124,6 @@ static void rna_def_bake_data(BlenderRNA *brna)
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
/* custom passes flags */
- prop = RNA_def_property(srna, "use_pass_ambient_occlusion", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "pass_filter", R_BAKE_PASS_FILTER_AO);
- RNA_def_property_ui_text(prop, "Ambient Occlusion", "Add ambient occlusion contribution");
-
prop = RNA_def_property(srna, "use_pass_emit", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "pass_filter", R_BAKE_PASS_FILTER_EMIT);
RNA_def_property_ui_text(prop, "Emit", "Add emission contribution");
@@ -5934,29 +5932,6 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
RNA_def_property_ui_text(prop, "Resolution %", "Percentage scale for render resolution");
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_SceneSequencer_update");
- prop = RNA_def_property(srna, "tile_x", PROP_INT, PROP_PIXEL);
- RNA_def_property_int_sdna(prop, NULL, "tilex");
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_range(prop, 8, 65536);
- RNA_def_property_ui_text(prop, "Tile X", "Horizontal tile size to use while rendering");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
- prop = RNA_def_property(srna, "tile_y", PROP_INT, PROP_PIXEL);
- RNA_def_property_int_sdna(prop, NULL, "tiley");
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_range(prop, 8, 65536);
- RNA_def_property_ui_text(prop, "Tile Y", "Vertical tile size to use while rendering");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
- prop = RNA_def_property(srna, "preview_start_resolution", PROP_INT, PROP_NONE);
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_range(prop, 8, 16384);
- RNA_def_property_ui_text(prop,
- "Start Resolution",
- "Resolution to start rendering preview at, "
- "progressively increasing it to the full viewport size");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
prop = RNA_def_property(srna, "preview_pixel_size", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_sdna(prop, NULL, "preview_pixel_size");
RNA_def_property_enum_items(prop, pixel_size_items);
@@ -6213,24 +6188,6 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
RNA_def_property_clear_flag(prop, PROP_EDITABLE);
RNA_def_property_ui_text(prop, "Movie Format", "When true the format is a movie");
- prop = RNA_def_property(srna, "use_save_buffers", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "scemode", R_EXR_TILE_FILE);
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_ui_text(
- prop,
- "Save Buffers",
- "Save tiles for all RenderLayers and SceneNodes to files in the temp directory "
- "(saves memory, required for Full Sample)");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
- prop = RNA_def_property(srna, "use_full_sample", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "scemode", R_FULL_SAMPLE);
- RNA_def_property_ui_text(prop,
- "Full Sample",
- "Save for every anti-aliasing sample the entire RenderLayer results "
- "(this solves anti-aliasing issues with compositing)");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
prop = RNA_def_property(srna, "use_lock_interface", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "use_lock_interface", 1);
RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
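The rna_scene.c hunks above remove the tile size, save-buffers, full-sample, preview-start-resolution, accurate-cryptomatte, and AO bake-filter properties, and expose the new Position pass per view layer. A short Python sketch of the resulting API surface, assuming the default context (attributes on the removed lines would now raise AttributeError):

import bpy

view_layer = bpy.context.view_layer
view_layer.use_pass_position = True  # new pass introduced by this commit
view_layer.use_pass_normal = True    # pre-existing pass, unchanged

# Removed in the hunks above; these attributes no longer exist:
# bpy.context.scene.render.tile_x, .tile_y, .use_save_buffers, .use_full_sample,
# .preview_start_resolution, and view_layer.use_pass_cryptomatte_accurate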
diff --git a/source/blender/nodes/composite/nodes/node_composite_image.c b/source/blender/nodes/composite/nodes/node_composite_image.c
index 243300b0a44..a56dfea9dbf 100644
--- a/source/blender/nodes/composite/nodes/node_composite_image.c
+++ b/source/blender/nodes/composite/nodes/node_composite_image.c
@@ -45,7 +45,7 @@ static bNodeSocketTemplate cmp_node_rlayers_out[] = {
{SOCK_VECTOR, N_(RE_PASSNAME_NORMAL), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_VECTOR, N_(RE_PASSNAME_UV), 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_VECTOR, N_(RE_PASSNAME_VECTOR), 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
- {SOCK_RGBA, N_(RE_PASSNAME_DEPRECATED), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
+ {SOCK_VECTOR, N_(RE_PASSNAME_POSITION), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_RGBA, N_(RE_PASSNAME_DEPRECATED), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_RGBA, N_(RE_PASSNAME_DEPRECATED), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_RGBA, N_(RE_PASSNAME_SHADOW), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
@@ -72,7 +72,7 @@ static bNodeSocketTemplate cmp_node_rlayers_out[] = {
{SOCK_RGBA, N_(RE_PASSNAME_SUBSURFACE_COLOR), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{-1, ""},
};
-#define MAX_LEGACY_SOCKET_INDEX 30
+#define NUM_LEGACY_SOCKETS (ARRAY_SIZE(cmp_node_rlayers_out) - 1)
static void cmp_node_image_add_pass_output(bNodeTree *ntree,
bNode *node,
@@ -382,7 +382,7 @@ static void cmp_node_image_verify_outputs(bNodeTree *ntree, bNode *node, bool rl
break;
}
}
- if (!link && (!rlayer || sock_index > MAX_LEGACY_SOCKET_INDEX)) {
+ if (!link && (!rlayer || sock_index >= NUM_LEGACY_SOCKETS)) {
MEM_freeN(sock->storage);
nodeRemoveSocket(ntree, node, sock);
}
@@ -468,43 +468,12 @@ void node_cmp_rlayers_outputs(bNodeTree *ntree, bNode *node)
const char *node_cmp_rlayers_sock_to_pass(int sock_index)
{
- const char *sock_to_passname[] = {
- RE_PASSNAME_COMBINED,
- RE_PASSNAME_COMBINED,
- RE_PASSNAME_Z,
- RE_PASSNAME_NORMAL,
- RE_PASSNAME_UV,
- RE_PASSNAME_VECTOR,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_SHADOW,
- RE_PASSNAME_AO,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_INDEXOB,
- RE_PASSNAME_INDEXMA,
- RE_PASSNAME_MIST,
- RE_PASSNAME_EMIT,
- RE_PASSNAME_ENVIRONMENT,
- RE_PASSNAME_DIFFUSE_DIRECT,
- RE_PASSNAME_DIFFUSE_INDIRECT,
- RE_PASSNAME_DIFFUSE_COLOR,
- RE_PASSNAME_GLOSSY_DIRECT,
- RE_PASSNAME_GLOSSY_INDIRECT,
- RE_PASSNAME_GLOSSY_COLOR,
- RE_PASSNAME_TRANSM_DIRECT,
- RE_PASSNAME_TRANSM_INDIRECT,
- RE_PASSNAME_TRANSM_COLOR,
- RE_PASSNAME_SUBSURFACE_DIRECT,
- RE_PASSNAME_SUBSURFACE_INDIRECT,
- RE_PASSNAME_SUBSURFACE_COLOR,
- };
- if (sock_index > MAX_LEGACY_SOCKET_INDEX) {
+ if (sock_index >= NUM_LEGACY_SOCKETS) {
return NULL;
}
- return sock_to_passname[sock_index];
+ const char *name = cmp_node_rlayers_out[sock_index].name;
+ /* Exception for alpha, which is derived from Combined. */
+ return (STREQ(name, "Alpha")) ? RE_PASSNAME_COMBINED : name;
}
static void node_composit_init_rlayers(const bContext *C, PointerRNA *ptr)
diff --git a/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c b/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
index f601f3e9fd0..06f4d1f1b79 100644
--- a/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
+++ b/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
@@ -35,6 +35,8 @@ static bNodeSocketTemplate sh_node_bsdf_principled_in[] = {
PROP_NONE,
SOCK_COMPACT},
{SOCK_RGBA, N_("Subsurface Color"), 0.8f, 0.8f, 0.8f, 1.0f, 0.0f, 1.0f},
+ {SOCK_FLOAT, N_("Subsurface IOR"), 1.4f, 0.0f, 0.0f, 0.0f, 1.01f, 3.8f, PROP_FACTOR},
+ {SOCK_FLOAT, N_("Subsurface Anisotropy"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_FLOAT, N_("Metallic"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_FLOAT, N_("Specular"), 0.5f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_FLOAT, N_("Specular Tint"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
@@ -74,7 +76,7 @@ static bNodeSocketTemplate sh_node_bsdf_principled_out[] = {
static void node_shader_init_principled(bNodeTree *UNUSED(ntree), bNode *node)
{
node->custom1 = SHD_GLOSSY_GGX;
- node->custom2 = SHD_SUBSURFACE_BURLEY;
+ node->custom2 = SHD_SUBSURFACE_RANDOM_WALK;
}
#define socket_not_zero(sock) (in[sock].link || (clamp_f(in[sock].vec[0], 0.0f, 1.0f) > 1e-5f))
@@ -90,41 +92,40 @@ static int node_shader_gpu_bsdf_principled(GPUMaterial *mat,
GPUNodeLink *sss_scale;
/* Normals */
- if (!in[20].link) {
- GPU_link(mat, "world_normals_get", &in[20].link);
+ if (!in[22].link) {
+ GPU_link(mat, "world_normals_get", &in[22].link);
}
/* Clearcoat Normals */
- if (!in[21].link) {
- GPU_link(mat, "world_normals_get", &in[21].link);
+ if (!in[23].link) {
+ GPU_link(mat, "world_normals_get", &in[23].link);
}
#if 0 /* Not used at the moment. */
/* Tangents */
- if (!in[22].link) {
+ if (!in[24].link) {
GPUNodeLink *orco = GPU_attribute(CD_ORCO, "");
- GPU_link(mat, "tangent_orco_z", orco, &in[22].link);
+ GPU_link(mat, "tangent_orco_z", orco, &in[24].link);
GPU_link(mat,
"node_tangent",
GPU_builtin(GPU_WORLD_NORMAL),
- in[22].link,
+ in[24].link,
GPU_builtin(GPU_OBJECT_MATRIX),
- &in[22].link);
+ &in[24].link);
}
#endif
- bool use_diffuse = socket_not_one(4) && socket_not_one(15);
+ bool use_diffuse = socket_not_one(6) && socket_not_one(17);
bool use_subsurf = socket_not_zero(1) && use_diffuse && node->sss_id > 0;
- bool use_refract = socket_not_one(4) && socket_not_zero(15);
- bool use_clear = socket_not_zero(12);
+ bool use_refract = socket_not_one(6) && socket_not_zero(17);
+ bool use_clear = socket_not_zero(14);
/* SSS Profile */
if (use_subsurf) {
- static short profile = SHD_SUBSURFACE_BURLEY;
bNodeSocket *socket = BLI_findlink(&node->original->inputs, 2);
bNodeSocketValueRGBA *socket_data = socket->default_value;
/* For some reason it seems that the socket value is in ARGB format. */
- GPU_material_sss_profile_create(mat, &socket_data->value[1], &profile, NULL);
+ GPU_material_sss_profile_create(mat, &socket_data->value[1]);
}
if (in[2].link) {
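The principled BSDF hunks above insert two new inputs, "Subsurface IOR" and "Subsurface Anisotropy", directly after "Subsurface Color", which is why the GPU socket indices below shift by two. A small Python sketch addressing the new sockets by name, assuming a freshly created material whose default node is named "Principled BSDF":

import bpy

mat = bpy.data.materials.new("sss_material")
mat.use_nodes = True
bsdf = mat.node_tree.nodes["Principled BSDF"]  # default node name is an assumption
bsdf.inputs["Subsurface"].default_value = 0.1
bsdf.inputs["Subsurface IOR"].default_value = 1.4         # new socket
bsdf.inputs["Subsurface Anisotropy"].default_value = 0.2  # new socket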
diff --git a/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c b/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c
index 4b91bcbd11c..e917858e0f2 100644
--- a/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c
+++ b/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c
@@ -25,8 +25,8 @@ static bNodeSocketTemplate sh_node_subsurface_scattering_in[] = {
{SOCK_RGBA, N_("Color"), 0.8f, 0.8f, 0.8f, 1.0f, 0.0f, 1.0f},
{SOCK_FLOAT, N_("Scale"), 1.0, 0.0f, 0.0f, 0.0f, 0.0f, 1000.0f},
{SOCK_VECTOR, N_("Radius"), 1.0f, 0.2f, 0.1f, 0.0f, 0.0f, 100.0f, PROP_NONE, SOCK_COMPACT},
- {SOCK_FLOAT, N_("Sharpness"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
- {SOCK_FLOAT, N_("Texture Blur"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
+ {SOCK_FLOAT, N_("IOR"), 1.4f, 0.0f, 0.0f, 0.0f, 1.01f, 3.8f, PROP_FACTOR},
+ {SOCK_FLOAT, N_("Anisotropy"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_VECTOR, N_("Normal"), 0.0f, 0.0f, 0.0f, 1.0f, -1.0f, 1.0f, PROP_NONE, SOCK_HIDE_VALUE},
{-1, ""},
};
@@ -38,7 +38,8 @@ static bNodeSocketTemplate sh_node_subsurface_scattering_out[] = {
static void node_shader_init_subsurface_scattering(bNodeTree *UNUSED(ntree), bNode *node)
{
- node->custom1 = SHD_SUBSURFACE_BURLEY;
+ node->custom1 = SHD_SUBSURFACE_RANDOM_WALK;
+ node->custom2 = true;
}
static int node_shader_gpu_subsurface_scattering(GPUMaterial *mat,
@@ -54,11 +55,8 @@ static int node_shader_gpu_subsurface_scattering(GPUMaterial *mat,
if (node->sss_id > 0) {
bNodeSocket *socket = BLI_findlink(&node->original->inputs, 2);
bNodeSocketValueRGBA *socket_data = socket->default_value;
- bNodeSocket *socket_sharp = BLI_findlink(&node->original->inputs, 3);
- bNodeSocketValueFloat *socket_data_sharp = socket_sharp->default_value;
/* For some reason it seems that the socket value is in ARGB format. */
- GPU_material_sss_profile_create(
- mat, &socket_data->value[1], &node->original->custom1, &socket_data_sharp->value);
+ GPU_material_sss_profile_create(mat, &socket_data->value[1]);
/* sss_id is 0 only if the node is not connected to any output.
* In this case flagging the material would trigger a bug (see T68736). */
@@ -69,23 +67,6 @@ static int node_shader_gpu_subsurface_scattering(GPUMaterial *mat,
mat, node, "node_subsurface_scattering", in, out, GPU_constant(&node->sss_id));
}
-static void node_shader_update_subsurface_scattering(bNodeTree *UNUSED(ntree), bNode *node)
-{
- bNodeSocket *sock;
- int falloff = node->custom1;
-
- for (sock = node->inputs.first; sock; sock = sock->next) {
- if (STREQ(sock->name, "Sharpness")) {
- if (falloff == SHD_SUBSURFACE_CUBIC) {
- sock->flag &= ~SOCK_UNAVAIL;
- }
- else {
- sock->flag |= SOCK_UNAVAIL;
- }
- }
- }
-}
-
/* node type definition */
void register_node_type_sh_subsurface_scattering(void)
{
@@ -99,7 +80,6 @@ void register_node_type_sh_subsurface_scattering(void)
node_type_init(&ntype, node_shader_init_subsurface_scattering);
node_type_storage(&ntype, "", NULL, NULL);
node_type_gpu(&ntype, node_shader_gpu_subsurface_scattering);
- node_type_update(&ntype, node_shader_update_subsurface_scattering);
nodeRegisterType(&ntype);
}
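With the hunks above, the Subsurface Scattering node replaces its "Sharpness" and "Texture Blur" sockets with "IOR" and "Anisotropy", reuses node_subsurface_method_items for its "falloff" property, and drops the per-falloff socket availability update. A short Python sketch under those assumptions (material name and values are illustrative):

import bpy

mat = bpy.data.materials.new("sss_node_test")
mat.use_nodes = True
node = mat.node_tree.nodes.new("ShaderNodeSubsurfaceScattering")
node.falloff = 'RANDOM_WALK_FIXED_RADIUS'      # or 'RANDOM_WALK', the new default
node.inputs["IOR"].default_value = 1.4          # replaces the old "Sharpness" socket
node.inputs["Anisotropy"].default_value = 0.0   # replaces the old "Texture Blur" socket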
diff --git a/source/blender/render/CMakeLists.txt b/source/blender/render/CMakeLists.txt
index 0046474d064..494415a4077 100644
--- a/source/blender/render/CMakeLists.txt
+++ b/source/blender/render/CMakeLists.txt
@@ -59,7 +59,6 @@ set(SRC
RE_pipeline.h
RE_texture.h
- intern/initrender.h
intern/pipeline.h
intern/render_result.h
intern/render_types.h
diff --git a/source/blender/render/RE_engine.h b/source/blender/render/RE_engine.h
index dfc0d5d0e9f..2a3a5964262 100644
--- a/source/blender/render/RE_engine.h
+++ b/source/blender/render/RE_engine.h
@@ -40,6 +40,7 @@ struct RenderData;
struct RenderEngine;
struct RenderEngineType;
struct RenderLayer;
+struct RenderPass;
struct RenderResult;
struct ReportList;
struct Scene;
@@ -59,7 +60,7 @@ extern "C" {
#define RE_USE_PREVIEW 4
#define RE_USE_POSTPROCESS 8
#define RE_USE_EEVEE_VIEWPORT 16
-#define RE_USE_SAVE_BUFFERS 32
+/* #define RE_USE_SAVE_BUFFERS_DEPRECATED 32 */
#define RE_USE_SHADING_NODES_CUSTOM 64
#define RE_USE_SPHERICAL_STEREO 128
#define RE_USE_STEREO_VIEWPORT 256
@@ -75,6 +76,7 @@ extern "C" {
#define RE_ENGINE_DO_UPDATE 8
#define RE_ENGINE_RENDERING 16
#define RE_ENGINE_HIGHLIGHT_TILES 32
+#define RE_ENGINE_CAN_DRAW 64
extern ListBase R_engines;
@@ -87,7 +89,20 @@ typedef struct RenderEngineType {
int flag;
void (*update)(struct RenderEngine *engine, struct Main *bmain, struct Depsgraph *depsgraph);
+
void (*render)(struct RenderEngine *engine, struct Depsgraph *depsgraph);
+
+ /* Offline rendering is finished - no more view layers will be rendered.
+ *
+ * All pending data is to be communicated from the engine back to Blender, ideally in the most
+ * memory-efficient manner possible (the engine might free its database before Blender allocates
+ * the full-frame render result). */
+ void (*render_frame_finish)(struct RenderEngine *engine);
+
+ void (*draw)(struct RenderEngine *engine,
+ const struct bContext *context,
+ struct Depsgraph *depsgraph);
+
void (*bake)(struct RenderEngine *engine,
struct Depsgraph *depsgraph,
struct Object *object,
@@ -132,9 +147,6 @@ typedef struct RenderEngine {
struct Object *camera_override;
unsigned int layer_override;
- int tile_x;
- int tile_y;
-
struct Render *re;
ListBase fullresult;
char text[512]; /* IMA_MAX_RENDER_TEXT */
@@ -189,6 +201,10 @@ void RE_engine_end_result(RenderEngine *engine,
bool merge_results);
struct RenderResult *RE_engine_get_result(struct RenderEngine *engine);
+struct RenderPass *RE_engine_pass_by_index_get(struct RenderEngine *engine,
+ const char *layer_name,
+ int index);
+
const char *RE_engine_active_view_get(RenderEngine *engine);
void RE_engine_active_view_set(RenderEngine *engine, const char *viewname);
float RE_engine_get_camera_shift_x(RenderEngine *engine,
@@ -228,6 +244,24 @@ void RE_engine_register_pass(struct RenderEngine *engine,
bool RE_engine_use_persistent_data(struct RenderEngine *engine);
+struct RenderEngine *RE_engine_get(const struct Render *re);
+
+/* Acquire render engine for drawing via its `draw()` callback.
+ *
+ * If drawing is not possible, false is returned. If drawing is possible, the engine is
+ * "acquired" so that it cannot be freed by the render pipeline.
+ *
+ * Drawing is possible if the engine has a `draw()` callback and is currently inside its
+ * `render()` callback. */
+bool RE_engine_draw_acquire(struct Render *re);
+void RE_engine_draw_release(struct Render *re);
+
+/* NOTE: Only used for Cycles' BlenderGPUDisplay integration with the draw manager; subject to
+ * reconsideration. Do not use this functionality. */
+bool RE_engine_has_render_context(struct RenderEngine *engine);
+void RE_engine_render_context_enable(struct RenderEngine *engine);
+void RE_engine_render_context_disable(struct RenderEngine *engine);
+
/* Engine Types */
void RE_engines_init(void);
@@ -252,6 +286,10 @@ void RE_bake_engine_set_engine_parameters(struct Render *re,
void RE_engine_free_blender_memory(struct RenderEngine *engine);
+void RE_engine_tile_highlight_set(
+ struct RenderEngine *engine, int x, int y, int width, int height, bool highlight);
+void RE_engine_tile_highlight_clear_all(struct RenderEngine *engine);
+
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/render/RE_pipeline.h b/source/blender/render/RE_pipeline.h
index cd839385bfb..3237772dd80 100644
--- a/source/blender/render/RE_pipeline.h
+++ b/source/blender/render/RE_pipeline.h
@@ -141,9 +141,6 @@ typedef struct RenderResult {
volatile rcti renrect;
volatile RenderLayer *renlay;
- /* optional saved endresult on disk */
- int do_exr_tile;
-
/* for render results in Image, verify validity for sequences */
int framenr;
diff --git a/source/blender/render/intern/bake.c b/source/blender/render/intern/bake.c
index 76839651b5d..0f893ce8cd5 100644
--- a/source/blender/render/intern/bake.c
+++ b/source/blender/render/intern/bake.c
@@ -774,18 +774,6 @@ void RE_bake_pixels_populate(Mesh *me,
/* ******************** NORMALS ************************ */
-/**
- * convert a normalized normal to the -1.0 1.0 range
- * the input is expected to be POS_X, POS_Y, POS_Z
- */
-static void normal_uncompress(float out[3], const float in[3])
-{
- int i;
- for (i = 0; i < 3; i++) {
- out[i] = 2.0f * in[i] - 1.0f;
- }
-}
-
static void normal_compress(float out[3],
const float in[3],
const eBakeNormalSwizzle normal_swizzle[3])
@@ -934,7 +922,7 @@ void RE_bake_normal_world_to_tangent(const BakePixel pixel_array[],
copy_v3_v3(tsm[2], normal);
/* texture values */
- normal_uncompress(nor, &result[offset]);
+ copy_v3_v3(nor, &result[offset]);
/* converts from world space to local space */
mul_transposed_mat3_m4_v3(mat, nor);
@@ -976,7 +964,7 @@ void RE_bake_normal_world_to_object(const BakePixel pixel_array[],
}
offset = i * depth;
- normal_uncompress(nor, &result[offset]);
+ copy_v3_v3(nor, &result[offset]);
/* rotates only without translation */
mul_mat3_m4_v3(iobmat, nor);
@@ -1004,7 +992,7 @@ void RE_bake_normal_world_to_world(const BakePixel pixel_array[],
}
offset = i * depth;
- normal_uncompress(nor, &result[offset]);
+ copy_v3_v3(nor, &result[offset]);
/* save back the values */
normal_compress(&result[offset], nor, normal_swizzle);
@@ -1053,6 +1041,7 @@ int RE_pass_depth(const eScenePassType pass_type)
}
case SCE_PASS_COMBINED:
case SCE_PASS_SHADOW:
+ case SCE_PASS_POSITION:
case SCE_PASS_NORMAL:
case SCE_PASS_VECTOR:
case SCE_PASS_INDEXOB: /* XXX double check */
diff --git a/source/blender/render/intern/engine.c b/source/blender/render/intern/engine.c
index 5728b784714..389b821ca35 100644
--- a/source/blender/render/intern/engine.c
+++ b/source/blender/render/intern/engine.c
@@ -62,7 +62,6 @@
#include "DRW_engine.h"
-#include "initrender.h"
#include "pipeline.h"
#include "render_result.h"
#include "render_types.h"
@@ -283,14 +282,6 @@ static void render_result_to_bake(RenderEngine *engine, RenderResult *rr)
/* Render Results */
-static RenderPart *get_part_from_result(Render *re, RenderResult *result)
-{
- rcti key = result->tilerect;
- BLI_rcti_translate(&key, re->disprect.xmin, re->disprect.ymin);
-
- return BLI_ghash_lookup(re->parts, &key);
-}
-
static HighlightedTile highlighted_tile_from_result_get(Render *re, RenderResult *result)
{
HighlightedTile tile;
@@ -300,6 +291,37 @@ static HighlightedTile highlighted_tile_from_result_get(Render *re, RenderResult
return tile;
}
+static void engine_tile_highlight_set(RenderEngine *engine,
+ const HighlightedTile *tile,
+ bool highlight)
+{
+ if ((engine->flag & RE_ENGINE_HIGHLIGHT_TILES) == 0) {
+ return;
+ }
+
+ Render *re = engine->re;
+
+ BLI_mutex_lock(&re->highlighted_tiles_mutex);
+
+ if (re->highlighted_tiles == NULL) {
+ re->highlighted_tiles = BLI_gset_new(
+ BLI_ghashutil_inthash_v4_p, BLI_ghashutil_inthash_v4_cmp, "highlighted tiles");
+ }
+
+ if (highlight) {
+ HighlightedTile **tile_in_set;
+ if (!BLI_gset_ensure_p_ex(re->highlighted_tiles, tile, (void ***)&tile_in_set)) {
+ *tile_in_set = MEM_mallocN(sizeof(HighlightedTile), __func__);
+ **tile_in_set = *tile;
+ }
+ }
+ else {
+ BLI_gset_remove(re->highlighted_tiles, tile, MEM_freeN);
+ }
+
+ BLI_mutex_unlock(&re->highlighted_tiles_mutex);
+}
+
RenderResult *RE_engine_begin_result(
RenderEngine *engine, int x, int y, int w, int h, const char *layername, const char *viewname)
{
@@ -332,7 +354,7 @@ RenderResult *RE_engine_begin_result(
disprect.ymin = y;
disprect.ymax = y + h;
- result = render_result_new(re, &disprect, RR_USE_MEM, layername, viewname);
+ result = render_result_new(re, &disprect, layername, viewname);
/* TODO: make this thread safe. */
@@ -341,25 +363,12 @@ RenderResult *RE_engine_begin_result(
render_result_clone_passes(re, result, viewname);
render_result_passes_allocated_ensure(result);
- RenderPart *pa;
-
- /* Copy EXR tile settings, so pipeline knows whether this is a result
- * for Save Buffers enabled rendering.
- */
- result->do_exr_tile = re->result->do_exr_tile;
-
BLI_addtail(&engine->fullresult, result);
result->tilerect.xmin += re->disprect.xmin;
result->tilerect.xmax += re->disprect.xmin;
result->tilerect.ymin += re->disprect.ymin;
result->tilerect.ymax += re->disprect.ymin;
-
- pa = get_part_from_result(re, result);
-
- if (pa) {
- pa->status = PART_STATUS_IN_PROGRESS;
- }
}
return result;
@@ -426,53 +435,14 @@ void RE_engine_end_result(
re_ensure_passes_allocated_thread_safe(re);
- /* merge. on break, don't merge in result for preview renders, looks nicer */
- if (!highlight) {
- /* for exr tile render, detect tiles that are done */
- RenderPart *pa = get_part_from_result(re, result);
-
- if (pa) {
- pa->status = (!cancel && merge_results) ? PART_STATUS_MERGED : PART_STATUS_RENDERED;
- }
- else if (re->result->do_exr_tile) {
- /* If written result does not match any tile and we are using save
- * buffers, we are going to get OpenEXR save errors. */
- fprintf(stderr, "RenderEngine.end_result: dimensions do not match any OpenEXR tile.\n");
- }
- }
-
if (re->engine && (re->engine->flag & RE_ENGINE_HIGHLIGHT_TILES)) {
- BLI_mutex_lock(&re->highlighted_tiles_mutex);
-
- if (re->highlighted_tiles == NULL) {
- re->highlighted_tiles = BLI_gset_new(
- BLI_ghashutil_inthash_v4_p, BLI_ghashutil_inthash_v4_cmp, "highlighted tiles");
- }
+ const HighlightedTile tile = highlighted_tile_from_result_get(re, result);
- HighlightedTile tile = highlighted_tile_from_result_get(re, result);
- if (highlight) {
- void **tile_in_set;
- if (!BLI_gset_ensure_p_ex(re->highlighted_tiles, &tile, &tile_in_set)) {
- *tile_in_set = MEM_mallocN(sizeof(HighlightedTile), __func__);
- memcpy(*tile_in_set, &tile, sizeof(tile));
- }
- BLI_gset_add(re->highlighted_tiles, &tile);
- }
- else {
- BLI_gset_remove(re->highlighted_tiles, &tile, MEM_freeN);
- }
-
- BLI_mutex_unlock(&re->highlighted_tiles_mutex);
+ engine_tile_highlight_set(engine, &tile, highlight);
}
if (!cancel || merge_results) {
- if (re->result->do_exr_tile) {
- if (!cancel && merge_results) {
- render_result_exr_file_merge(re->result, result, re->viewname);
- render_result_merge(re->result, result);
- }
- }
- else if (!(re->test_break(re->tbh) && (re->r.scemode & R_BUTS_PREVIEW))) {
+ if (!(re->test_break(re->tbh) && (re->r.scemode & R_BUTS_PREVIEW))) {
render_result_merge(re->result, result);
}
@@ -582,6 +552,27 @@ void RE_engine_set_error_message(RenderEngine *engine, const char *msg)
}
}
+RenderPass *RE_engine_pass_by_index_get(RenderEngine *engine, const char *layer_name, int index)
+{
+ Render *re = engine->re;
+ if (re == NULL) {
+ return NULL;
+ }
+
+ RenderPass *pass = NULL;
+
+ RenderResult *rr = RE_AcquireResultRead(re);
+ if (rr != NULL) {
+ const RenderLayer *layer = RE_GetRenderLayer(rr, layer_name);
+ if (layer != NULL) {
+ pass = BLI_findlink(&layer->passes, index);
+ }
+ }
+ RE_ReleaseResult(re);
+
+ return pass;
+}
+
const char *RE_engine_active_view_get(RenderEngine *engine)
{
Render *re = engine->re;
@@ -837,12 +828,6 @@ bool RE_bake_engine(Render *re,
engine->resolution_x = re->winx;
engine->resolution_y = re->winy;
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
- RE_parts_init(re);
- engine->tile_x = re->r.tilex;
- engine->tile_y = re->r.tiley;
- BLI_rw_mutex_unlock(&re->partsmutex);
-
if (type->bake) {
engine->depsgraph = depsgraph;
@@ -870,21 +855,13 @@ bool RE_bake_engine(Render *re,
engine->depsgraph = NULL;
}
- engine->tile_x = 0;
- engine->tile_y = 0;
engine->flag &= ~RE_ENGINE_RENDERING;
- /* Free depsgraph outside of parts mutex lock, since this locks OpenGL context
- * while the UI drawing might also lock the OpenGL context and parts mutex. */
engine_depsgraph_free(engine);
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
RE_engine_free(engine);
re->engine = NULL;
- RE_parts_free(re);
- BLI_rw_mutex_unlock(&re->partsmutex);
-
if (BKE_reports_contain(re->reports, RPT_ERROR)) {
G.is_break = true;
}
@@ -928,15 +905,23 @@ static void engine_render_view_layer(Render *re,
DRW_render_context_enable(engine->re);
}
+ BLI_mutex_lock(&engine->re->engine_draw_mutex);
+ re->engine->flag |= RE_ENGINE_CAN_DRAW;
+ BLI_mutex_unlock(&engine->re->engine_draw_mutex);
+
engine->type->render(engine, engine->depsgraph);
+ BLI_mutex_lock(&engine->re->engine_draw_mutex);
+ re->engine->flag &= ~RE_ENGINE_CAN_DRAW;
+ BLI_mutex_unlock(&engine->re->engine_draw_mutex);
+
if (use_gpu_context) {
DRW_render_context_disable(engine->re);
}
}
/* Optionally composite grease pencil over render result. */
- if (engine->has_grease_pencil && use_grease_pencil && !re->result->do_exr_tile) {
+ if (engine->has_grease_pencil && use_grease_pencil) {
/* NOTE: External engine might have been requested to free its
* dependency graph, which is only allowed if there is no grease
* pencil (pipeline is taking care of that). */
@@ -981,16 +966,11 @@ bool RE_engine_render(Render *re, bool do_all)
/* create render result */
BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
if (re->result == NULL || !(re->r.scemode & R_BUTS_PREVIEW)) {
- int savebuffers = RR_USE_MEM;
-
if (re->result) {
render_result_free(re->result);
}
- if ((type->flag & RE_USE_SAVE_BUFFERS) && (re->r.scemode & R_EXR_TILE_FILE)) {
- savebuffers = RR_USE_EXR;
- }
- re->result = render_result_new(re, &re->disprect, savebuffers, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ re->result = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
}
BLI_rw_mutex_unlock(&re->resultmutex);
@@ -1035,32 +1015,15 @@ bool RE_engine_render(Render *re, bool do_all)
engine->resolution_x = re->winx;
engine->resolution_y = re->winy;
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
- RE_parts_init(re);
- engine->tile_x = re->partx;
- engine->tile_y = re->party;
- BLI_rw_mutex_unlock(&re->partsmutex);
-
- if (re->result->do_exr_tile) {
- render_result_exr_file_begin(re, engine);
- }
-
/* Clear UI drawing locks. */
if (re->draw_lock) {
re->draw_lock(re->dlh, false);
}
- /* Render view layers. */
- bool delay_grease_pencil = false;
-
if (type->render) {
FOREACH_VIEW_LAYER_TO_RENDER_BEGIN (re, view_layer_iter) {
engine_render_view_layer(re, engine, view_layer_iter, true, true);
- /* With save buffers there is no render buffer in memory for compositing, delay
- * grease pencil in that case. */
- delay_grease_pencil = engine->has_grease_pencil && re->result->do_exr_tile;
-
if (RE_engine_test_break(engine)) {
break;
}
@@ -1068,42 +1031,18 @@ bool RE_engine_render(Render *re, bool do_all)
FOREACH_VIEW_LAYER_TO_RENDER_END;
}
+ if (type->render_frame_finish) {
+ type->render_frame_finish(engine);
+ }
+
/* Clear tile data */
- engine->tile_x = 0;
- engine->tile_y = 0;
engine->flag &= ~RE_ENGINE_RENDERING;
render_result_free_list(&engine->fullresult, engine->fullresult.first);
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
-
- /* For save buffers, read back from disk. */
- if (re->result->do_exr_tile) {
- render_result_exr_file_end(re, engine);
- }
-
- /* Perform delayed grease pencil rendering. */
- if (delay_grease_pencil) {
- BLI_rw_mutex_unlock(&re->partsmutex);
-
- FOREACH_VIEW_LAYER_TO_RENDER_BEGIN (re, view_layer_iter) {
- engine_render_view_layer(re, engine, view_layer_iter, false, true);
- if (RE_engine_test_break(engine)) {
- break;
- }
- }
- FOREACH_VIEW_LAYER_TO_RENDER_END;
-
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
- }
-
/* re->engine becomes zero if user changed active render engine during render */
if (!engine_keep_depsgraph(engine) || !re->engine) {
- /* Free depsgraph outside of parts mutex lock, since this locks OpenGL context
- * while the UI drawing might also lock the OpenGL context and parts mutex. */
- BLI_rw_mutex_unlock(&re->partsmutex);
engine_depsgraph_free(engine);
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
RE_engine_free(engine);
re->engine = NULL;
@@ -1115,9 +1054,6 @@ bool RE_engine_render(Render *re, bool do_all)
BLI_rw_mutex_unlock(&re->resultmutex);
}
- RE_parts_free(re);
- BLI_rw_mutex_unlock(&re->partsmutex);
-
if (BKE_reports_contain(re->reports, RPT_ERROR)) {
G.is_break = true;
}
@@ -1179,3 +1115,81 @@ void RE_engine_free_blender_memory(RenderEngine *engine)
}
engine_depsgraph_free(engine);
}
+
+struct RenderEngine *RE_engine_get(const Render *re)
+{
+ return re->engine;
+}
+
+bool RE_engine_draw_acquire(Render *re)
+{
+ BLI_mutex_lock(&re->engine_draw_mutex);
+
+ RenderEngine *engine = re->engine;
+
+ if (engine == NULL || engine->type->draw == NULL || (engine->flag & RE_ENGINE_CAN_DRAW) == 0) {
+ BLI_mutex_unlock(&re->engine_draw_mutex);
+ return false;
+ }
+
+ return true;
+}
+
+void RE_engine_draw_release(Render *re)
+{
+ BLI_mutex_unlock(&re->engine_draw_mutex);
+}
+
+void RE_engine_tile_highlight_set(
+ RenderEngine *engine, int x, int y, int width, int height, bool highlight)
+{
+ HighlightedTile tile;
+ BLI_rcti_init(&tile.rect, x, x + width, y, y + height);
+
+ engine_tile_highlight_set(engine, &tile, highlight);
+}
+
+void RE_engine_tile_highlight_clear_all(RenderEngine *engine)
+{
+ if ((engine->flag & RE_ENGINE_HIGHLIGHT_TILES) == 0) {
+ return;
+ }
+
+ Render *re = engine->re;
+
+ BLI_mutex_lock(&re->highlighted_tiles_mutex);
+
+ if (re->highlighted_tiles != NULL) {
+ BLI_gset_clear(re->highlighted_tiles, MEM_freeN);
+ }
+
+ BLI_mutex_unlock(&re->highlighted_tiles_mutex);
+}
+
+/* -------------------------------------------------------------------- */
+/** \name OpenGL context manipulation.
+ *
+ * NOTE: Only used for Cycles' BlenderGPUDisplay integration with the draw manager; subject to
+ * reconsideration. Do not use this functionality.
+ * \{ */
+
+bool RE_engine_has_render_context(RenderEngine *engine)
+{
+ if (engine->re == NULL) {
+ return false;
+ }
+
+ return RE_gl_context_get(engine->re) != NULL;
+}
+
+void RE_engine_render_context_enable(RenderEngine *engine)
+{
+ DRW_render_context_enable(engine->re);
+}
+
+void RE_engine_render_context_disable(RenderEngine *engine)
+{
+ DRW_render_context_disable(engine->re);
+}
+
+/** \} */
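The engine.c hunks above implement the tile-highlight helpers and RE_engine_pass_by_index_get() that rna_render.c registers for Python. A sketch of how an engine's render() callback might call them, continuing the illustrative MyEngine class from the rna_render.c section (layer name, tile coordinates, and the assumption that tile highlighting is enabled on the engine are all hypothetical):

    def render(self, depsgraph):
        self.tile_highlight_clear_all()
        # Mark a 64x64 region as in progress while it is being rendered.
        self.tile_highlight_set(0, 0, 64, 64, True)
        # ... write pixels for the region via begin_result()/end_result() ...
        self.tile_highlight_set(0, 0, 64, 64, False)

        # Look up a pass of the written result by index rather than by name.
        render_pass = self.pass_by_index_get("ViewLayer", 0)
        if render_pass is not None:
            print(render_pass.name)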
diff --git a/source/blender/render/intern/initrender.c b/source/blender/render/intern/initrender.c
index 3148625c866..2370d8e893b 100644
--- a/source/blender/render/intern/initrender.c
+++ b/source/blender/render/intern/initrender.c
@@ -43,9 +43,6 @@
#include "pipeline.h"
#include "render_types.h"
-/* Own includes */
-#include "initrender.h"
-
/* ****************** MASKS and LUTS **************** */
static float filt_quadratic(float x)
@@ -244,91 +241,3 @@ void RE_GetViewPlane(Render *re, rctf *r_viewplane, rcti *r_disprect)
BLI_rcti_init(r_disprect, 0, 0, 0, 0);
}
}
-
-/* ~~~~~~~~~~~~~~~~ part (tile) calculus ~~~~~~~~~~~~~~~~~~~~~~ */
-
-void RE_parts_free(Render *re)
-{
- if (re->parts) {
- BLI_ghash_free(re->parts, NULL, MEM_freeN);
- re->parts = NULL;
- }
-}
-
-void RE_parts_clamp(Render *re)
-{
- /* part size */
- re->partx = max_ii(1, min_ii(re->r.tilex, re->rectx));
- re->party = max_ii(1, min_ii(re->r.tiley, re->recty));
-}
-
-void RE_parts_init(Render *re)
-{
- int nr, xd, yd, partx, party, xparts, yparts;
- int xminb, xmaxb, yminb, ymaxb;
-
- RE_parts_free(re);
-
- re->parts = BLI_ghash_new(
- BLI_ghashutil_inthash_v4_p, BLI_ghashutil_inthash_v4_cmp, "render parts");
-
- /* Just for readable code. */
- xminb = re->disprect.xmin;
- yminb = re->disprect.ymin;
- xmaxb = re->disprect.xmax;
- ymaxb = re->disprect.ymax;
-
- RE_parts_clamp(re);
-
- partx = re->partx;
- party = re->party;
- /* part count */
- xparts = (re->rectx + partx - 1) / partx;
- yparts = (re->recty + party - 1) / party;
-
- for (nr = 0; nr < xparts * yparts; nr++) {
- rcti disprect;
- int rectx, recty;
-
- xd = (nr % xparts);
- yd = (nr - xd) / xparts;
-
- disprect.xmin = xminb + xd * partx;
- disprect.ymin = yminb + yd * party;
-
- /* ensure we cover the entire picture, so last parts go to end */
- if (xd < xparts - 1) {
- disprect.xmax = disprect.xmin + partx;
- if (disprect.xmax > xmaxb) {
- disprect.xmax = xmaxb;
- }
- }
- else {
- disprect.xmax = xmaxb;
- }
-
- if (yd < yparts - 1) {
- disprect.ymax = disprect.ymin + party;
- if (disprect.ymax > ymaxb) {
- disprect.ymax = ymaxb;
- }
- }
- else {
- disprect.ymax = ymaxb;
- }
-
- rectx = BLI_rcti_size_x(&disprect);
- recty = BLI_rcti_size_y(&disprect);
-
- /* so, now can we add this part? */
- if (rectx > 0 && recty > 0) {
- RenderPart *pa = MEM_callocN(sizeof(RenderPart), "new part");
-
- pa->disprect = disprect;
- pa->rectx = rectx;
- pa->recty = recty;
-
- BLI_ghash_insert(re->parts, &pa->disprect, pa);
- }
- }
-}
diff --git a/source/blender/render/intern/initrender.h b/source/blender/render/intern/initrender.h
deleted file mode 100644
index f5ac352752f..00000000000
--- a/source/blender/render/intern/initrender.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
- * All rights reserved.
- */
-
-/** \file
- * \ingroup render
- */
-
-#pragma once
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Functions */
-
-void RE_parts_init(Render *re);
-void RE_parts_free(Render *re);
-void RE_parts_clamp(Render *re);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/source/blender/render/intern/pipeline.c b/source/blender/render/intern/pipeline.c
index 5418f4035b1..72ff920561d 100644
--- a/source/blender/render/intern/pipeline.c
+++ b/source/blender/render/intern/pipeline.c
@@ -102,7 +102,6 @@
#include "DEG_depsgraph.h"
/* internal */
-#include "initrender.h"
#include "pipeline.h"
#include "render_result.h"
#include "render_types.h"
@@ -568,7 +567,7 @@ Render *RE_NewRender(const char *name)
BLI_addtail(&RenderGlobal.renderlist, re);
BLI_strncpy(re->name, name, RE_MAXNAME);
BLI_rw_mutex_init(&re->resultmutex);
- BLI_rw_mutex_init(&re->partsmutex);
+ BLI_mutex_init(&re->engine_draw_mutex);
BLI_mutex_init(&re->highlighted_tiles_mutex);
}
@@ -633,7 +632,7 @@ void RE_FreeRender(Render *re)
}
BLI_rw_mutex_end(&re->resultmutex);
- BLI_rw_mutex_end(&re->partsmutex);
+ BLI_mutex_end(&re->engine_draw_mutex);
BLI_mutex_end(&re->highlighted_tiles_mutex);
BLI_freelistN(&re->view_layers);
@@ -722,26 +721,6 @@ void RE_FreePersistentData(const Scene *scene)
/* ********* initialize state ******** */
-/* clear full sample and tile flags if needed */
-static int check_mode_full_sample(RenderData *rd)
-{
- int scemode = rd->scemode;
-
- /* not supported by any current renderer */
- scemode &= ~R_FULL_SAMPLE;
-
-#ifdef WITH_OPENEXR
- if (scemode & R_FULL_SAMPLE) {
- scemode |= R_EXR_TILE_FILE; /* enable automatic */
- }
-#else
- /* can't do this without openexr support */
- scemode &= ~(R_EXR_TILE_FILE | R_FULL_SAMPLE);
-#endif
-
- return scemode;
-}
-
static void re_init_resolution(Render *re, Render *source, int winx, int winy, rcti *disprect)
{
re->winx = winx;
@@ -839,8 +818,6 @@ void RE_InitState(Render *re,
return;
}
- re->r.scemode = check_mode_full_sample(&re->r);
-
if (single_layer) {
int index = BLI_findindex(render_layers, single_layer);
if (index != -1) {
@@ -890,9 +867,6 @@ void RE_InitState(Render *re,
render_result_view_new(re->result, "");
}
- /* ensure renderdatabase can use part settings correct */
- RE_parts_clamp(re);
-
BLI_rw_mutex_unlock(&re->resultmutex);
RE_init_threadcount(re);
@@ -1040,7 +1014,7 @@ static void render_result_uncrop(Render *re)
/* weak is: it changes disprect from border */
render_result_disprect_to_full_resolution(re);
- rres = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ rres = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
render_result_passes_allocated_ensure(rres);
rres->stamp_data = BKE_stamp_data_copy(re->result->stamp_data);
@@ -1227,7 +1201,7 @@ static void do_render_compositor(Render *re)
if ((re->r.mode & R_CROP) == 0) {
render_result_disprect_to_full_resolution(re);
}
- re->result = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ re->result = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
BLI_rw_mutex_unlock(&re->resultmutex);
@@ -1647,7 +1621,7 @@ bool RE_is_rendering_allowed(Scene *scene,
Object *camera_override,
ReportList *reports)
{
- int scemode = check_mode_full_sample(&scene->r);
+ const int scemode = scene->r.scemode;
if (scene->r.mode & R_BORDER) {
if (scene->r.border.xmax <= scene->r.border.xmin ||
@@ -1657,17 +1631,6 @@ bool RE_is_rendering_allowed(Scene *scene,
}
}
- if (scemode & (R_EXR_TILE_FILE | R_FULL_SAMPLE)) {
- char str[FILE_MAX];
-
- render_result_exr_file_path(scene, "", 0, str);
-
- if (!BLI_file_is_writable(str)) {
- BKE_report(reports, RPT_ERROR, "Cannot save render buffers, check the temp default path");
- return 0;
- }
- }
-
if (RE_seq_render_active(scene, &scene->r)) {
/* Sequencer */
if (scene->r.mode & R_BORDER) {
@@ -1686,13 +1649,6 @@ bool RE_is_rendering_allowed(Scene *scene,
BKE_report(reports, RPT_ERROR, "No render output node in scene");
return 0;
}
-
- if (scemode & R_FULL_SAMPLE) {
- if (compositor_needs_render(scene, 0) == 0) {
- BKE_report(reports, RPT_ERROR, "Full sample AA not supported without 3D rendering");
- return 0;
- }
- }
}
else {
/* Regular Render */
@@ -1710,14 +1666,6 @@ bool RE_is_rendering_allowed(Scene *scene,
return 1;
}
-static void validate_render_settings(Render *re)
-{
- if (RE_engine_is_external(re)) {
- /* not supported yet */
- re->r.scemode &= ~R_FULL_SAMPLE;
- }
-}
-
static void update_physics_cache(Render *re,
Scene *scene,
ViewLayer *view_layer,
@@ -1820,8 +1768,6 @@ static int render_init_from_main(Render *re,
/* initstate makes new result, have to send changed tags around */
ntreeCompositTagRender(re->scene);
- validate_render_settings(re);
-
re->display_init(re->dih, re->result);
re->display_clear(re->dch, re->result);
diff --git a/source/blender/render/intern/render_result.c b/source/blender/render/intern/render_result.c
index 6cb6aabe885..c308147fc5b 100644
--- a/source/blender/render/intern/render_result.c
+++ b/source/blender/render/intern/render_result.c
@@ -260,8 +260,10 @@ RenderPass *render_layer_add_pass(RenderResult *rr,
/* will read info from Render *re to define layers */
/* called in threads */
/* re->winx,winy is coordinate space of entire image, partrct the part within */
-RenderResult *render_result_new(
- Render *re, rcti *partrct, int savebuffers, const char *layername, const char *viewname)
+RenderResult *render_result_new(Render *re,
+ rcti *partrct,
+ const char *layername,
+ const char *viewname)
{
RenderResult *rr;
RenderLayer *rl;
@@ -287,10 +289,6 @@ RenderResult *render_result_new(
rr->tilerect.ymin = partrct->ymin - re->disprect.ymin;
rr->tilerect.ymax = partrct->ymax - re->disprect.ymin;
- if (savebuffers) {
- rr->do_exr_tile = true;
- }
-
rr->passes_allocated = false;
render_result_views_new(rr, &re->r);
@@ -314,10 +312,6 @@ RenderResult *render_result_new(
rl->rectx = rectx;
rl->recty = recty;
- if (rr->do_exr_tile) {
- rl->exrhandle = IMB_exr_get_handle();
- }
-
for (rv = rr->views.first; rv; rv = rv->next) {
const char *view = rv->name;
@@ -327,10 +321,6 @@ RenderResult *render_result_new(
}
}
- if (rr->do_exr_tile) {
- IMB_exr_add_view(rl->exrhandle, view);
- }
-
#define RENDER_LAYER_ADD_PASS_SAFE(rr, rl, channels, name, viewname, chan_id) \
do { \
if (render_layer_add_pass(rr, rl, channels, name, viewname, chan_id) == NULL) { \
@@ -351,6 +341,9 @@ RenderResult *render_result_new(
if (view_layer->passflag & SCE_PASS_NORMAL) {
RENDER_LAYER_ADD_PASS_SAFE(rr, rl, 3, RE_PASSNAME_NORMAL, view, "XYZ");
}
+ if (view_layer->passflag & SCE_PASS_POSITION) {
+ RENDER_LAYER_ADD_PASS_SAFE(rr, rl, 3, RE_PASSNAME_POSITION, view, "XYZ");
+ }
if (view_layer->passflag & SCE_PASS_UV) {
RENDER_LAYER_ADD_PASS_SAFE(rr, rl, 3, RE_PASSNAME_UV, view, "UVA");
}
@@ -424,11 +417,6 @@ RenderResult *render_result_new(
rl->rectx = rectx;
rl->recty = recty;
- /* duplicate code... */
- if (rr->do_exr_tile) {
- rl->exrhandle = IMB_exr_get_handle();
- }
-
for (rv = rr->views.first; rv; rv = rv->next) {
const char *view = rv->name;
@@ -438,10 +426,6 @@ RenderResult *render_result_new(
}
}
- if (rr->do_exr_tile) {
- IMB_exr_add_view(rl->exrhandle, view);
- }
-
/* a renderlayer should always have a Combined pass */
render_layer_add_pass(rr, rl, 4, RE_PASSNAME_COMBINED, view, "RGBA");
}
@@ -1089,227 +1073,6 @@ void render_result_single_layer_end(Render *re)
re->pushedresult = NULL;
}
-/************************* EXR Tile File Rendering ***************************/
-
-static void save_render_result_tile(RenderResult *rr, RenderResult *rrpart, const char *viewname)
-{
- RenderLayer *rlp, *rl;
- RenderPass *rpassp;
- int partx, party;
-
- BLI_thread_lock(LOCK_IMAGE);
-
- for (rlp = rrpart->layers.first; rlp; rlp = rlp->next) {
- rl = RE_GetRenderLayer(rr, rlp->name);
-
- /* should never happen but prevents crash if it does */
- BLI_assert(rl);
- if (UNLIKELY(rl == NULL)) {
- continue;
- }
-
- /* passes are allocated in sync */
- for (rpassp = rlp->passes.first; rpassp; rpassp = rpassp->next) {
- const int xstride = rpassp->channels;
- int a;
- char fullname[EXR_PASS_MAXNAME];
-
- for (a = 0; a < xstride; a++) {
- set_pass_full_name(fullname, rpassp->name, a, viewname, rpassp->chan_id);
-
- IMB_exr_set_channel(rl->exrhandle,
- rlp->name,
- fullname,
- xstride,
- xstride * rrpart->rectx,
- rpassp->rect + a);
- }
- }
- }
-
- party = rrpart->tilerect.ymin;
- partx = rrpart->tilerect.xmin;
-
- for (rlp = rrpart->layers.first; rlp; rlp = rlp->next) {
- rl = RE_GetRenderLayer(rr, rlp->name);
-
- /* should never happen but prevents crash if it does */
- BLI_assert(rl);
- if (UNLIKELY(rl == NULL)) {
- continue;
- }
-
- IMB_exrtile_write_channels(rl->exrhandle, partx, party, 0, viewname, false);
- }
-
- BLI_thread_unlock(LOCK_IMAGE);
-}
-
-void render_result_save_empty_result_tiles(Render *re)
-{
- RenderResult *rr;
- RenderLayer *rl;
-
- for (rr = re->result; rr; rr = rr->next) {
- for (rl = rr->layers.first; rl; rl = rl->next) {
- GHashIterator pa_iter;
- GHASH_ITER (pa_iter, re->parts) {
- RenderPart *pa = BLI_ghashIterator_getValue(&pa_iter);
- if (pa->status != PART_STATUS_MERGED) {
- int party = pa->disprect.ymin - re->disprect.ymin;
- int partx = pa->disprect.xmin - re->disprect.xmin;
- IMB_exrtile_write_channels(rl->exrhandle, partx, party, 0, re->viewname, true);
- }
- }
- }
- }
-}
-
-/* Compute list of passes needed by render engine. */
-static void templates_register_pass_cb(void *userdata,
- Scene *UNUSED(scene),
- ViewLayer *UNUSED(view_layer),
- const char *name,
- int channels,
- const char *chan_id,
- eNodeSocketDatatype UNUSED(type))
-{
- ListBase *templates = userdata;
- RenderPass *pass = MEM_callocN(sizeof(RenderPass), "RenderPassTemplate");
-
- pass->channels = channels;
- BLI_strncpy(pass->name, name, sizeof(pass->name));
- BLI_strncpy(pass->chan_id, chan_id, sizeof(pass->chan_id));
-
- BLI_addtail(templates, pass);
-}
-
-static void render_result_get_pass_templates(RenderEngine *engine,
- Render *re,
- RenderLayer *rl,
- ListBase *templates)
-{
- BLI_listbase_clear(templates);
-
- if (engine && engine->type->update_render_passes) {
- ViewLayer *view_layer = BLI_findstring(&re->view_layers, rl->name, offsetof(ViewLayer, name));
- if (view_layer) {
- RE_engine_update_render_passes(
- engine, re->scene, view_layer, templates_register_pass_cb, templates);
- }
- }
-}
-
-/* begin write of exr tile file */
-void render_result_exr_file_begin(Render *re, RenderEngine *engine)
-{
- char str[FILE_MAX];
-
- for (RenderResult *rr = re->result; rr; rr = rr->next) {
- LISTBASE_FOREACH (RenderLayer *, rl, &rr->layers) {
- /* Get passes needed by engine. Normally we would wait for the
- * engine to create them, but for EXR file we need to know in
- * advance. */
- ListBase templates;
- render_result_get_pass_templates(engine, re, rl, &templates);
-
- /* Create render passes requested by engine. Only this part is
- * mutex locked to avoid deadlock with Python GIL. */
- BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
- LISTBASE_FOREACH (RenderPass *, pass, &templates) {
- RE_create_render_pass(
- re->result, pass->name, pass->channels, pass->chan_id, rl->name, NULL);
- }
- BLI_rw_mutex_unlock(&re->resultmutex);
-
- BLI_freelistN(&templates);
-
- /* Open EXR file for writing. */
- render_result_exr_file_path(re->scene, rl->name, rr->sample_nr, str);
- printf("write exr tmp file, %dx%d, %s\n", rr->rectx, rr->recty, str);
- IMB_exrtile_begin_write(rl->exrhandle, str, 0, rr->rectx, rr->recty, re->partx, re->party);
- }
- }
-}
-
-/* end write of exr tile file, read back first sample */
-void render_result_exr_file_end(Render *re, RenderEngine *engine)
-{
- /* Preserve stamp data. */
- struct StampData *stamp_data = re->result->stamp_data;
- re->result->stamp_data = NULL;
-
- /* Close EXR files. */
- for (RenderResult *rr = re->result; rr; rr = rr->next) {
- LISTBASE_FOREACH (RenderLayer *, rl, &rr->layers) {
- IMB_exr_close(rl->exrhandle);
- rl->exrhandle = NULL;
- }
-
- rr->do_exr_tile = false;
- }
-
- /* Create new render result in memory instead of on disk. */
- BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
- render_result_free_list(&re->fullresult, re->result);
- re->result = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
- re->result->stamp_data = stamp_data;
- render_result_passes_allocated_ensure(re->result);
- BLI_rw_mutex_unlock(&re->resultmutex);
-
- LISTBASE_FOREACH (RenderLayer *, rl, &re->result->layers) {
- /* Get passes needed by engine. */
- ListBase templates;
- render_result_get_pass_templates(engine, re, rl, &templates);
-
- /* Create render passes requested by engine. Only this part is
- * mutex locked to avoid deadlock with Python GIL. */
- BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
- LISTBASE_FOREACH (RenderPass *, pass, &templates) {
- RE_create_render_pass(re->result, pass->name, pass->channels, pass->chan_id, rl->name, NULL);
- }
-
- BLI_freelistN(&templates);
-
- /* Render passes contents from file. */
- char str[FILE_MAXFILE + MAX_ID_NAME + MAX_ID_NAME + 100] = "";
- render_result_exr_file_path(re->scene, rl->name, 0, str);
- printf("read exr tmp file: %s\n", str);
-
- if (!render_result_exr_file_read_path(re->result, rl, str)) {
- printf("cannot read: %s\n", str);
- }
- BLI_rw_mutex_unlock(&re->resultmutex);
- }
-}
-
-/* save part into exr file */
-void render_result_exr_file_merge(RenderResult *rr, RenderResult *rrpart, const char *viewname)
-{
- for (; rr && rrpart; rr = rr->next, rrpart = rrpart->next) {
- save_render_result_tile(rr, rrpart, viewname);
- }
-}
-
-/* path to temporary exr file */
-void render_result_exr_file_path(Scene *scene, const char *layname, int sample, char *filepath)
-{
- char name[FILE_MAXFILE + MAX_ID_NAME + MAX_ID_NAME + 100];
- const char *fi = BLI_path_basename(BKE_main_blendfile_path_from_global());
-
- if (sample == 0) {
- BLI_snprintf(name, sizeof(name), "%s_%s_%s.exr", fi, scene->id.name + 2, layname);
- }
- else {
- BLI_snprintf(name, sizeof(name), "%s_%s_%s%d.exr", fi, scene->id.name + 2, layname, sample);
- }
-
- /* Make name safe for paths, see T43275. */
- BLI_filename_make_safe(name);
-
- BLI_join_dirfile(filepath, FILE_MAX, BKE_tempdir_session(), name);
-}
-
/* called for reading temp files, and for external engines */
int render_result_exr_file_read_path(RenderResult *rr,
RenderLayer *rl_single,
@@ -1416,7 +1179,7 @@ bool render_result_exr_file_cache_read(Render *re)
char *root = U.render_cachedir;
RE_FreeRenderResult(re->result);
- re->result = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ re->result = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
/* First try cache. */
render_result_exr_file_cache_path(re->scene, root, str);
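With the EXR tile machinery gone, render results are always allocated in memory, which is why `render_result_new()` loses its `savebuffers` argument and the `RR_USE_MEM`/`RR_USE_EXR` flags below are removed. A minimal sketch of the allocation pattern the patched callers now follow; the helper name is illustrative and not part of the patch, and `re` is assumed to be a valid, initialized `Render`:

/* Allocate a fresh in-memory result for the full display rect and make sure
 * the pass buffers exist, since they are no longer backed by EXR tile files. */
static void allocate_in_memory_result(Render *re)
{
  BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);

  render_result_free_list(&re->fullresult, re->result);
  re->result = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
  render_result_passes_allocated_ensure(re->result);

  BLI_rw_mutex_unlock(&re->resultmutex);
}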
diff --git a/source/blender/render/intern/render_result.h b/source/blender/render/intern/render_result.h
index 1fc64a4ea97..4145bb3b8ab 100644
--- a/source/blender/render/intern/render_result.h
+++ b/source/blender/render/intern/render_result.h
@@ -25,9 +25,6 @@
#define PASS_VECTOR_MAX 10000.0f
-#define RR_USE_MEM 0
-#define RR_USE_EXR 1
-
#define RR_ALL_LAYERS NULL
#define RR_ALL_VIEWS NULL
@@ -51,7 +48,6 @@ extern "C" {
struct RenderResult *render_result_new(struct Render *re,
struct rcti *partrct,
- int savebuffers,
const char *layername,
const char *viewname);
@@ -81,12 +77,6 @@ void render_result_free_list(struct ListBase *lb, struct RenderResult *rr);
void render_result_single_layer_begin(struct Render *re);
void render_result_single_layer_end(struct Render *re);
-/* EXR Tile File Render */
-
-void render_result_save_empty_result_tiles(struct Render *re);
-void render_result_exr_file_begin(struct Render *re, struct RenderEngine *engine);
-void render_result_exr_file_end(struct Render *re, struct RenderEngine *engine);
-
/* render pass wrapper for gpencil */
struct RenderPass *render_layer_add_pass(struct RenderResult *rr,
struct RenderLayer *rl,
@@ -95,14 +85,6 @@ struct RenderPass *render_layer_add_pass(struct RenderResult *rr,
const char *viewname,
const char *chan_id);
-void render_result_exr_file_merge(struct RenderResult *rr,
- struct RenderResult *rrpart,
- const char *viewname);
-
-void render_result_exr_file_path(struct Scene *scene,
- const char *layname,
- int sample,
- char *filepath);
int render_result_exr_file_read_path(struct RenderResult *rr,
struct RenderLayer *rl_single,
const char *filepath);
diff --git a/source/blender/render/intern/render_types.h b/source/blender/render/intern/render_types.h
index d2d2b499495..ca4f72350e1 100644
--- a/source/blender/render/intern/render_types.h
+++ b/source/blender/render/intern/render_types.h
@@ -47,30 +47,10 @@ struct ReportList;
extern "C" {
#endif
-/* this is handed over to threaded hiding/passes/shading engine */
-typedef struct RenderPart {
- struct RenderPart *next, *prev;
-
- RenderResult *result; /* result of part rendering */
- ListBase fullresult; /* optional full sample buffers */
-
- rcti disprect; /* part coordinates within total picture */
- int rectx, recty; /* the size */
- int nr; /* nr is partnr */
- short status;
-} RenderPart;
-
typedef struct HighlightedTile {
rcti rect;
} HighlightedTile;
-enum {
- /* PART_STATUS_NONE = 0, */ /* UNUSED */
- PART_STATUS_IN_PROGRESS = 1,
- PART_STATUS_RENDERED = 2,
- PART_STATUS_MERGED = 3,
-};
-
/* controls state of render, everything that's read-only during render stage */
struct Render {
struct Render *next, *prev;
@@ -91,6 +71,9 @@ struct Render {
* to not conflict with writes, so no lock used for that */
ThreadRWMutex resultmutex;
+ /* Guard for drawing render result using engine's `draw()` callback. */
+ ThreadMutex engine_draw_mutex;
+
/** Window size, display rect, viewplane.
* \note Buffer width and height with percentage applied
* without border & crop. convert to long before multiplying together to avoid overflow. */
@@ -101,10 +84,6 @@ struct Render {
/* final picture width and height (within disprect) */
int rectx, recty;
- /* real maximum size of parts after correction for minimum
- * partx*xparts can be larger than rectx, in that case last part is smaller */
- int partx, party;
-
/* Camera transform, only used by Freestyle. */
float winmat[4][4];
@@ -120,9 +99,6 @@ struct Render {
int active_view_layer;
struct Object *camera_override;
- ThreadRWMutex partsmutex;
- struct GHash *parts;
-
ThreadMutex highlighted_tiles_mutex;
struct GSet *highlighted_tiles;
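The new `engine_draw_mutex` encodes a simple rule: whichever thread invokes the render engine's `draw()` callback must hold this lock, so the result it reads cannot be replaced or freed underneath it. An illustrative sketch of that locking pattern, not code from the patch (the function name is hypothetical and the callback invocation is elided):

static void render_draw_result_from_engine(Render *re)
{
  /* Serialize against render threads replacing or freeing the result being drawn. */
  BLI_mutex_lock(&re->engine_draw_mutex);

  /* ... invoke the engine's `draw()` callback here ... */

  BLI_mutex_unlock(&re->engine_draw_mutex);
}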
diff --git a/source/blender/windowmanager/intern/wm_window.c b/source/blender/windowmanager/intern/wm_window.c
index 887aed7ffc7..8baf4a0e013 100644
--- a/source/blender/windowmanager/intern/wm_window.c
+++ b/source/blender/windowmanager/intern/wm_window.c
@@ -2426,10 +2426,15 @@ void wm_window_IME_end(wmWindow *win)
void *WM_opengl_context_create(void)
{
- /* On Windows there is a problem creating contexts that share lists
- * from one context that is current in another thread.
- * So we should call this function only on the main thread.
- */
+ /* On Windows there is a problem creating contexts that share resources (almost any object,
+ * including legacy display lists, but also textures) with a context which is current in another
+ * thread. This is a documented and expected behavior of both `::wglCreateContextAttribsARB()` and
+ * `::wglShareLists()`.
+ *
+ * Other platforms might successfully share resources from a context which is active somewhere
+ * else, but to keep our code behaving the same on all platforms we expect contexts to only be
+ * created from the main thread. */
+
BLI_assert(BLI_thread_is_main());
BLI_assert(GPU_framebuffer_active_get() == GPU_framebuffer_back_get());
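The reworded comment makes the threading contract explicit: contexts that share resources are created only on the main thread, while making them current may happen on worker threads. A sketch of that usage pattern, assuming the companion `WM_opengl_context_activate()`/`WM_opengl_context_release()` calls; the surrounding helper names are illustrative only:

static void *shared_gl_context = NULL;

/* Runs on the main thread only, matching the assert in WM_opengl_context_create(). */
static void renderer_gl_init(void)
{
  shared_gl_context = WM_opengl_context_create();
}

/* Runs on a worker thread: making the already-created context current there is allowed. */
static void renderer_gl_worker(void)
{
  WM_opengl_context_activate(shared_gl_context);
  /* ... GPU work ... */
  WM_opengl_context_release(shared_gl_context);
}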
diff --git a/tests/performance/tests/cycles.py b/tests/performance/tests/cycles.py
index bac6b8a7ceb..e702fa445d2 100644
--- a/tests/performance/tests/cycles.py
+++ b/tests/performance/tests/cycles.py
@@ -17,6 +17,16 @@ def _run(args):
scene.render.image_settings.file_format = 'PNG'
scene.cycles.device = 'CPU' if device_type == 'CPU' else 'GPU'
+ if scene.cycles.use_adaptive_sampling:
+ # Render samples specified in file, no other way to measure
+ # adaptive sampling performance reliably.
+ scene.cycles.time_limit = 0.0
+ else:
+ # Render for a fixed amount of time so it's adaptive to the
+ # machine and devices.
+ scene.cycles.samples = 16384
+ scene.cycles.time_limit = 10.0
+
if scene.cycles.device == 'GPU':
# Enable specified GPU in preferences.
prefs = bpy.context.preferences
@@ -62,12 +72,14 @@ class CyclesTest(api.Test):
'device_index': device_index,
'render_filepath': str(env.log_file.parent / (env.log_file.stem + '.png'))}
- _, lines = env.run_in_blender(_run, args, ['--debug-cycles', '--verbose', '1', self.filepath])
+ _, lines = env.run_in_blender(_run, args, ['--debug-cycles', '--verbose', '2', self.filepath])
# Parse render time from output
prefix_time = "Render time (without synchronization): "
prefix_memory = "Peak: "
+ prefix_time_per_sample = "Average time per sample: "
time = None
+ time_per_sample = None
memory = None
for line in lines:
line = line.strip()
@@ -75,12 +87,20 @@ class CyclesTest(api.Test):
if offset != -1:
time = line[offset + len(prefix_time):]
time = float(time)
+ offset = line.find(prefix_time_per_sample)
+ if offset != -1:
+ time_per_sample = line[offset + len(prefix_time_per_sample):]
+ time_per_sample = time_per_sample.split()[0]
+ time_per_sample = float(time_per_sample)
offset = line.find(prefix_memory)
if offset != -1:
memory = line[offset + len(prefix_memory):]
memory = memory.split()[0].replace(',', '')
memory = float(memory)
+ if time_per_sample:
+ time = time_per_sample
+
if not (time and memory):
raise Exception("Error parsing render time output")
@@ -88,5 +108,5 @@ class CyclesTest(api.Test):
def generate(env):
- filepaths = env.find_blend_files('cycles-x/*')
+ filepaths = env.find_blend_files('cycles/*')
return [CyclesTest(filepath) for filepath in filepaths]
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
index a1b94abc317..75f00c3c5cc 100644
--- a/tests/python/CMakeLists.txt
+++ b/tests/python/CMakeLists.txt
@@ -637,7 +637,6 @@ if(WITH_CYCLES OR WITH_OPENGL_RENDER_TESTS)
set(render_tests
bsdf
denoise
- denoise_animation
displacement
hair
image_colorspace